From 1effb0c9e5e3ce01612ffcc02412daab68c6655d Mon Sep 17 00:00:00 2001 From: xiaying Date: Thu, 12 Sep 2024 12:57:57 +0800 Subject: [PATCH] MNN:Sync: Sync Internal 2.9.5 --- 3rd_party/OpenCLHeaders/CL/cl2.hpp | 4 - CMakeLists.txt | 1 + codegen/OpFuse.cpp | 2 +- docs/compile/cmake.md | 1 + docs/contribute/backend.md | 90 +- docs/contribute/op.md | 10 +- docs/faq.md | 4 +- docs/index.rst | 2 +- docs/inference/module.md | 86 +- docs/start/overall.md | 4 +- docs/tools/convert.md | 1 - docs/tools/quant.md | 6 +- docs/tools/test.md | 34 +- docs/transformers/diffusion.md | 23 +- docs/transformers/llm.md | 140 +- express/Executor.cpp | 5 +- express/module/Module.cpp | 8 +- express/module/StaticModule.cpp | 22 +- include/MNN/Interpreter.hpp | 17 +- include/MNN/MNNDefine.h | 2 +- package_scripts/ios/buildiOS.sh | 23 +- package_scripts/ios/buildiOS_with_armv7.sh | 42 + package_scripts/mac/buildFrameWork.sh | 8 +- project/android/build_32.sh | 1 - project/ios/MNN.xcodeproj/project.pbxproj | 88 +- pymnn/test/model_test.py | 16 +- source/backend/arm82/Arm82Backend.cpp | 1 + source/backend/arm82/Arm82Functions.cpp | 24 +- source/backend/arm82/CMakeLists.txt | 7 + .../low_memory/MNNDynamicQuanInput_ARM82.S | 24 +- .../MNNDynamicQuantAndReorder_ARM82.S | 9 +- ...MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S | 51 +- ...GemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S | 129 +- ...GemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S | 84 +- .../MNNPackedMatMulFP16_int4.S | 0 .../MNNPackedMatMulFP16_int8.S | 0 .../MNNPackedMatMulRemainFP16_int4.S | 0 .../MNNPackedMatMulRemainFP16_int8.S | 0 .../backend/coreml/backend/CoreMLBackend.cpp | 2 +- .../backend/coreml/backend/CoreMLBackend.hpp | 2 +- source/backend/cpu/CMakeLists.txt | 4 + source/backend/cpu/CPUAttention.cpp | 183 +- source/backend/cpu/CPUAttention.hpp | 9 +- source/backend/cpu/CPUBackend.cpp | 155 +- source/backend/cpu/CPUBackend.hpp | 24 +- source/backend/cpu/CPUCast.cpp | 5 +- source/backend/cpu/CPUConvolution.cpp | 170 - source/backend/cpu/CPUConvolution.hpp | 7 +- source/backend/cpu/CPUDeconvolution.cpp | 6 +- source/backend/cpu/CPUDynamicQuant.cpp | 2 +- source/backend/cpu/CPUFloatToInt8.cpp | 4 +- source/backend/cpu/CPUFloatToInt8.hpp | 2 +- source/backend/cpu/CPUImageProcess.cpp | 1 - source/backend/cpu/CPUProposal.cpp | 23 +- source/backend/cpu/CPUProposal.hpp | 9 +- source/backend/cpu/KVCacheManager.cpp | 292 +- source/backend/cpu/KVCacheManager.hpp | 70 +- source/backend/cpu/arm/CMakeLists.txt | 4 + source/backend/cpu/arm/arm32/MNNBGRAToBGRC8.S | 33 + .../backend/cpu/arm/arm32/MNNBGRAToGRAYFast.S | 43 + .../cpu/arm/arm32/MNNBGRToBGR555Fast.S | 46 + .../cpu/arm/arm32/MNNBGRToBGR565Fast.S | 51 + .../backend/cpu/arm/arm32/MNNBGRToGRAYFast.S | 46 + source/backend/cpu/arm/arm32/MNNC3ToC4Fast.S | 34 + source/backend/cpu/arm/arm32/MNNC3ToXYZFast.S | 95 + source/backend/cpu/arm/arm32/MNNC3ToYUVFast.S | 98 + source/backend/cpu/arm/arm32/MNNFloat2Int8.S | 43 +- .../backend/cpu/arm/arm32/MNNGRAYToC3Fast.S | 35 + .../backend/cpu/arm/arm32/MNNGRAYToC4Fast.S | 36 + .../arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S | 3 +- .../MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S | 2 + .../MNNGemmInt8AddBiasScale_16x4_w4_Unit.S | 3 +- .../backend/cpu/arm/arm32/MNNRGBAToBGRAFast.S | 38 + .../backend/cpu/arm/arm32/MNNRGBAToBGRFast.S | 38 + .../backend/cpu/arm/arm32/MNNRGBAToGRAYFast.S | 43 + .../cpu/arm/arm32/MNNRGBToBGR555Fast.S | 46 + .../cpu/arm/arm32/MNNRGBToBGR565Fast.S | 54 + source/backend/cpu/arm/arm32/MNNRGBToBGRC8.S | 36 + .../backend/cpu/arm/arm32/MNNRGBToGRAYFast.S | 43 + 
source/backend/cpu/arm/arm64/MNNBGRAToBGR.S | 129 + source/backend/cpu/arm/arm64/MNNBGRAToGRAY.S | 92 + source/backend/cpu/arm/arm64/MNNBGRToBGR555.S | 169 + source/backend/cpu/arm/arm64/MNNBGRToBGR565.S | 187 + source/backend/cpu/arm/arm64/MNNBGRToGRAY.S | 92 + source/backend/cpu/arm/arm64/MNNC3ToC4Fast.S | 116 + source/backend/cpu/arm/arm64/MNNC3ToXYZFast.S | 88 + source/backend/cpu/arm/arm64/MNNC3ToYUVFast.S | 92 + source/backend/cpu/arm/arm64/MNNFloat2Int8.S | 160 +- .../backend/cpu/arm/arm64/MNNGRAYToC3Fast.S | 124 + .../backend/cpu/arm/arm64/MNNGRAYToC4Fast.S | 139 + .../MNNGemmInt8AddBiasScale_ARMV82_Unit.S | 57 +- source/backend/cpu/arm/arm64/MNNPackC2.S | 107 + .../backend/cpu/arm/arm64/MNNRGBAToBGRAFast.S | 147 + .../backend/cpu/arm/arm64/MNNRGBAToBGRFast.S | 134 + .../backend/cpu/arm/arm64/MNNRGBAToGRAYFast.S | 96 + source/backend/cpu/arm/arm64/MNNRGBToBGR.S | 126 + source/backend/cpu/arm/arm64/MNNRGBToBGR555.S | 169 + source/backend/cpu/arm/arm64/MNNRGBToBGR565.S | 187 + .../backend/cpu/arm/arm64/MNNRGBToGRAYFast.S | 92 + .../cpu/arm/arm64/MNNSamplerC3BilinearOpt.S | 171 + .../MNNGemmInt8AddBiasScale_16x4_w4_Unit.S | 120 +- .../MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S | 164 +- .../MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S | 63 +- .../MNNPackedMatMulRemain_int4.S | 0 .../MNNPackedMatMulRemain_int8.S | 0 .../MNNPackedMatMul_int4.S | 0 .../MNNPackedMatMul_int8.S | 0 .../backend/cpu/compute/CommonOptFunction.cpp | 14 +- .../cpu/compute/ConvInt8TiledExecutor.cpp | 196 +- .../cpu/compute/ConvInt8TiledExecutor.hpp | 5 +- .../backend/cpu/compute/ConvInt8Winograd.cpp | 48 +- .../backend/cpu/compute/ConvInt8Winograd.hpp | 1 + .../cpu/compute/ConvolutionFloatFactory.cpp | 10 +- .../backend/cpu/compute/GemmInt8Executor.cpp | 33 +- .../cpu/compute/IdstConvolutionInt8.cpp | 4 +- .../cpu/compute/ImageProcessFunction.cpp | 218 +- .../backend/cpu/compute/Int8FunctionsOpt.cpp | 37 +- source/backend/cpu/compute/Int8FunctionsOpt.h | 15 +- source/backend/cpu/x86_x64/AVX2Backend.cpp | 1 + source/backend/cpu/x86_x64/AVX2Functions.cpp | 5 +- source/backend/cpu/x86_x64/CMakeLists.txt | 6 + .../cpu/x86_x64/FunctionDispatcher.cpp | 5 +- .../cpu/x86_x64/avx/FunctionSummary.hpp | 7 +- source/backend/cpu/x86_x64/avx/GemmAVX2.cpp | 14 +- .../backend/cpu/x86_x64/avx/GemmFunction.hpp | 2 +- source/backend/cpu/x86_x64/avx/GemmInt8.cpp | 19 +- .../backend/cpu/x86_x64/avx512/GemmInt8.cpp | 15 +- .../cpu/x86_x64/sse/FunctionSummary.hpp | 6 +- source/backend/cpu/x86_x64/sse/GemmInt8.cpp | 41 +- source/backend/cpu/x86_x64/sse/GemmSSE.cpp | 4 +- source/backend/cuda/core/CUDABackend.cpp | 2 +- source/backend/cuda/core/CUDABackend.hpp | 2 +- source/backend/hiai/backend/NPUBackend.cpp | 2 +- source/backend/hiai/backend/NPUBackend.hpp | 2 +- source/backend/metal/MetalAttention.mm | 40 +- source/backend/metal/MetalBackend.hpp | 13 +- source/backend/metal/MetalBackend.mm | 83 +- source/backend/metal/MetalBinary.mm | 6 +- .../metal/MetalConvolutionDepthwise.mm | 8 +- source/backend/metal/MetalUnary.mm | 4 +- source/backend/nnapi/backend/NNAPIBackend.cpp | 2 +- source/backend/nnapi/backend/NNAPIBackend.hpp | 2 +- .../backend/opencl/core/BufferConvertor.cpp | 215 +- .../backend/opencl/core/BufferConvertor.hpp | 5 +- source/backend/opencl/core/OpenCLBackend.cpp | 124 +- source/backend/opencl/core/OpenCLBackend.hpp | 4 +- source/backend/opencl/core/OpenCLGemmTune.cpp | 146 +- .../opencl/core/runtime/OpenCLRuntime.cpp | 19 +- .../opencl/core/runtime/OpenCLRuntime.hpp | 1 + .../execution/buffer/ArgMaxBufExecution.cpp | 185 +- 
.../execution/buffer/ArgMaxBufExecution.hpp | 3 + .../buffer/AttentionBufExecution.cpp | 860 +- .../buffer/AttentionBufExecution.hpp | 104 +- .../execution/buffer/BinaryBufExecution.cpp | 39 +- .../execution/buffer/CastBufExecution.cpp | 47 +- .../execution/buffer/ConvBufExecution.cpp | 373 +- .../execution/buffer/ConvBufExecution.hpp | 7 +- .../buffer/ConvBufLowMemoryExecution.cpp | 466 +- .../buffer/ConvBufLowMemoryExecution.hpp | 11 +- .../execution/buffer/ConvBufWinograd.cpp | 70 +- .../execution/buffer/ConvBufWinograd.hpp | 6 +- .../buffer/ConvSubgroupBufExecution.cpp | 24 +- .../execution/buffer/DeconvBufExecution.cpp | 1 + .../buffer/DepthwiseConvBufExecution.cpp | 11 +- .../DepthwiseConvSubgroupBufExecution.cpp | 19 +- .../buffer/GridSampleBufExecution.cpp | 2 +- .../buffer/GroupNormBufExecution.cpp | 70 +- .../buffer/GroupNormBufExecution.hpp | 1 - .../execution/buffer/Interp3DBufExecution.cpp | 2 +- .../execution/buffer/InterpBufExecution.cpp | 2 +- .../buffer/LayerNormBufExecution.cpp | 162 +- .../execution/buffer/LoopBufExecution.cpp | 753 +- .../execution/buffer/LoopBufExecution.hpp | 8 +- .../execution/buffer/MatmulBufExecution.cpp | 70 +- .../execution/buffer/PoolBufExecution.cpp | 3 +- .../execution/buffer/RangeBufExecution.cpp | 38 +- .../execution/buffer/RasterBufExecution.cpp | 121 +- .../buffer/ReductionBufExecution.cpp | 140 +- .../buffer/ReductionBufExecution.hpp | 3 +- .../execution/buffer/ReluBufExecution.cpp | 7 +- .../execution/buffer/ScaleBufExecution.cpp | 20 +- .../execution/buffer/ScaleBufExecution.hpp | 1 + .../execution/buffer/SelectBufExecution.cpp | 13 +- .../buffer/SelfAttentionBufExecution.cpp | 29 +- .../execution/buffer/SoftmaxBufExecution.cpp | 195 +- .../execution/buffer/SoftmaxBufExecution.hpp | 5 +- .../buffer/SplitGeluBufExecution.cpp | 19 +- .../buffer/StrassenMatmulOpenCLComputor.cpp | 21 + .../execution/buffer/UnaryBufExecution.cpp | 49 +- .../backend/opencl/execution/cl/argmax_buf.cl | 313 +- .../opencl/execution/cl/attention_buf.cl | 937 +- .../backend/opencl/execution/cl/binary_buf.cl | 119 +- .../execution/cl/binary_subgroup_buf.cl | 28 +- .../opencl/execution/cl/buffer_convert_buf.cl | 264 +- .../execution/cl/buffer_convert_quant.cl | 26 +- .../backend/opencl/execution/cl/cast_buf.cl | 56 +- .../opencl/execution/cl/conv_2d_buf.cl | 208 +- .../execution/cl/conv_2d_c16_subgroup_buf.cl | 36 +- .../execution/cl/conv_2d_c1_subgroup_buf.cl | 38 +- .../opencl/execution/cl/conv_2d_int_buf.cl | 47 +- .../backend/opencl/execution/cl/deconv_2d.cl | 6 +- .../execution/cl/depthwise_conv2d_buf.cl | 56 +- .../cl/depthwise_conv2d_subgroup_buf.cl | 12 +- .../backend/opencl/execution/cl/gather_buf.cl | 89 +- .../backend/opencl/execution/cl/gemm_buf.cl | 305 +- .../opencl/execution/cl/gemm_conv1x1_buf.cl | 760 ++ .../execution/cl/gemm_quant_batch_buf.cl | 821 -- .../opencl/execution/cl/gemv_conv1x1_buf.cl | 1388 +-- .../opencl/execution/cl/grid_sample_buf.cl | 39 +- .../opencl/execution/cl/input_transe_buf.cl | 16 +- .../backend/opencl/execution/cl/interp_buf.cl | 20 +- .../opencl/execution/cl/layernorm_buf.cl | 362 +- .../backend/opencl/execution/cl/loop_buf.cl | 184 +- .../backend/opencl/execution/cl/matmul_buf.cl | 546 +- .../opencl/execution/cl/matmul_params_buf.cl | 231 +- .../opencl/execution/cl/opencl_codegen.py | 2 +- .../opencl/execution/cl/opencl_program.cc | 8115 +++++++---------- .../opencl/execution/cl/opencl_source_map.hpp | 12 +- .../opencl/execution/cl/pooling_buf.cl | 14 +- .../execution/cl/pooling_subgroup_buf.cl | 32 +- 
.../backend/opencl/execution/cl/range_buf.cl | 45 +- .../backend/opencl/execution/cl/raster_buf.cl | 124 +- .../opencl/execution/cl/reduction_buf.cl | 357 +- .../backend/opencl/execution/cl/scale_buf.cl | 25 +- .../opencl/execution/cl/self_attention_buf.cl | 36 +- .../opencl/execution/cl/softmax_buf.cl | 291 +- .../opencl/execution/cl/splitgelu_buf.cl | 98 +- .../backend/opencl/execution/cl/unary_buf.cl | 49 +- .../opencl/execution/cl/unary_subgroup_buf.cl | 13 +- .../execution/cl/winogradTransform_buf.cl | 6 +- .../cl/winogradTransform_subgroup_buf.cl | 8 +- .../opencl/execution/image/ConvExecution.cpp | 2 +- .../image/ConvLowMemoryExecution.cpp | 6 +- source/backend/opengl/GLBackend.cpp | 4 +- source/backend/opengl/GLBackend.hpp | 2 +- .../backend/tensorrt/backend/TRTBackend.cpp | 2 +- .../backend/tensorrt/backend/TRTBackend.hpp | 2 +- .../vulkan/component/VulkanPipeline.cpp | 5 +- .../backend/vulkan/runtime/VulkanRuntime.cpp | 2 +- .../backend/vulkan/runtime/VulkanRuntime.hpp | 2 +- source/core/Backend.hpp | 16 +- source/core/BufferAllocator.cpp | 169 +- source/core/BufferAllocator.hpp | 46 +- source/core/OpCommonUtils.cpp | 10 +- source/core/Pipeline.cpp | 12 +- source/core/Session.cpp | 29 +- source/core/Session.hpp | 2 + source/cv/ImageProcess.cpp | 1 - source/cv/ImageProcessUtils.cpp | 48 +- source/cv/ImageProcessUtils.hpp | 1 + source/geometry/GeometryComputerUtils.cpp | 11 +- test.sh | 72 +- test/MNNTestSuite.cpp | 15 +- test/core/BufferAllocatorTest.cpp | 4 +- test/cv/ImageProcessTest.cpp | 482 +- test/expr/ModuleTest.cpp | 143 +- test/op/AttentionTest.cpp | 241 + test/op/RasterTest.cpp | 7 + test/speed/HybridConvSpeedTest.cpp | 24 +- tools/converter/include/config.hpp | 1 + tools/converter/source/common/cli.cpp | 12 +- tools/converter/source/common/writeFb.cpp | 4 +- .../postconvert/AddTensorFormatConverter.cpp | 4 +- tools/cpp/CMakeLists.txt | 5 + tools/cpp/ModuleBasic.cpp | 9 + tools/cpp/getPerformance.cpp | 19 +- tools/cv/benchmark/opencv_benchmark.cpp | 14 +- tools/cv/source/imgproc/filter.cpp | 13 +- tools/train/source/nn/NN.cpp | 2 +- transformers/diffusion/export/convert_mnn.py | 21 + transformers/diffusion/pipeline.cpp | 3 + transformers/llm/config.json | 4 +- transformers/llm/engine/include/llm/llm.hpp | 5 +- transformers/llm/engine/ios/README.md | 44 + transformers/llm/engine/ios/ios_app.jpg | Bin 0 -> 28478 bytes transformers/llm/engine/ios/mnn-llm/icon.png | Bin 0 -> 370381 bytes .../mnn-llm/mnn-llm.xcodeproj/project.pbxproj | 453 + .../contents.xcworkspacedata | 7 + .../xcshareddata/IDEWorkspaceChecks.plist | 8 + .../AccentColor.colorset/Contents.json | 11 + .../AppIcon.appiconset/Contents.json | 14 + .../AppIcon.appiconset/icon.png | Bin 0 -> 370381 bytes .../mnn-llm/Assets.xcassets/Contents.json | 6 + .../ios/mnn-llm/mnn-llm/ContentView.swift | 152 + .../mnn-llm/LLMInferenceEngineWrapper.h | 29 + .../mnn-llm/LLMInferenceEngineWrapper.mm | 106 + .../Preview Assets.xcassets/Contents.json | 6 + .../mnn-llm/mnn-llm/mnn-llm-Bridging-Header.h | 5 + .../ios/mnn-llm/mnn-llm/mnn_llmApp.swift | 17 + transformers/llm/engine/llm_demo.cpp | 8 +- transformers/llm/engine/model/bench.txt | 4 + transformers/llm/engine/src/llm.cpp | 60 +- transformers/llm/engine/src/llmconfig.hpp | 17 +- transformers/llm/export/README.md | 163 +- transformers/llm/export/README_en.md | 92 - transformers/llm/export/llm_export.py | 1430 --- .../Baichuan2-7B-Chat/modeling_baichuan.py | 825 -- .../llm_models/Llama-2-7b-chat-ms/config.json | 28 - .../Llama-2-7b-chat-ms/configuration_llama.py | 174 - 
.../Llama-2-7b-chat-ms/modeling_llama.py | 1040 --- .../Llama-3-8B-Instruct/config.json | 31 - .../configuration_llama.py | 174 - .../Llama-3-8B-Instruct/modeling_llama.py | 1040 --- .../llm_models/MiniCPM-1.2b/config.json | 28 - .../MiniCPM-1.2b/configuration_llama.py | 174 - .../MiniCPM-1.2b/convert_minicpm_to_llama.py | 38 - .../llm_models/MiniCPM-1.2b/modeling_llama.py | 1010 -- .../llm_models/MiniCPM-2.4b/config.json | 28 - .../MiniCPM-2.4b/configuration_llama.py | 174 - .../llm_models/MiniCPM-2.4b/modeling_llama.py | 1010 -- .../Qwen-1_8B-Chat/modeling_qwen.py | 1406 --- .../llm_models/Qwen-1_8B/modeling_qwen.py | 1406 --- .../llm_models/Qwen-7B-Chat/config.json | 37 - .../llm_models/Qwen-7B-Chat/modeling_qwen.py | 1199 --- .../llm_models/Qwen-VL-Chat/modeling_qwen.py | 1162 --- .../llm_models/Qwen1_5-0_5B-Chat/config.json | 32 - .../Qwen1_5-0_5B-Chat/configuration_qwen2.py | 144 - .../Qwen1_5-0_5B-Chat/modeling_qwen2.py | 1436 --- .../llm_models/Qwen1_5-1_8B-Chat/config.json | 31 - .../Qwen1_5-1_8B-Chat/configuration_qwen2.py | 144 - .../Qwen1_5-1_8B-Chat/modeling_qwen2.py | 1436 --- .../llm_models/Qwen1_5-4B-Chat/config.json | 31 - .../Qwen1_5-4B-Chat/configuration_qwen2.py | 144 - .../Qwen1_5-4B-Chat/modeling_qwen2.py | 1436 --- .../llm_models/Qwen1_5-7B-Chat/config.json | 31 - .../Qwen1_5-7B-Chat/configuration_qwen2.py | 144 - .../Qwen1_5-7B-Chat/modeling_qwen2.py | 1436 --- .../Qwen2-0_5B-Instruct/config.json | 31 - .../configuration_qwen2.py | 144 - .../Qwen2-0_5B-Instruct/modeling_qwen2.py | 1436 --- .../Qwen2-1_5B-Instruct/config.json | 31 - .../configuration_qwen2.py | 144 - .../Qwen2-1_5B-Instruct/modeling_qwen2.py | 1436 --- .../export/llm_models/Qwen2-1_5B/config.json | 31 - .../Qwen2-1_5B/configuration_qwen2.py | 144 - .../llm_models/Qwen2-1_5B/modeling_qwen2.py | 1434 --- .../llm_models/Qwen2-7B-Instruct/config.json | 31 - .../Qwen2-7B-Instruct/configuration_qwen2.py | 144 - .../Qwen2-7B-Instruct/modeling_qwen2.py | 1436 --- .../TinyLlama-1_1B-Chat/config.json | 30 - .../configuration_llama.py | 174 - .../TinyLlama-1_1B-Chat/modeling_llama.py | 1040 --- .../export/llm_models/Yi-6B-Chat/config.json | 29 - .../Yi-6B-Chat/configuration_llama.py | 174 - .../llm_models/Yi-6B-Chat/modeling_llama.py | 1040 --- .../llm_models/chatglm-6b/modeling_chatglm.py | 1441 --- .../chatglm2-6b/modeling_chatglm.py | 1193 --- .../chatglm3-6b/modeling_chatglm.py | 1293 --- .../codegeex2-6b/modeling_chatglm.py | 1092 --- .../deepseek-llm-7b-chat/config.json | 28 - .../configuration_llama.py | 174 - .../deepseek-llm-7b-chat/modeling_llama.py | 1040 --- .../glm-4-9b-chat/modeling_chatglm.py | 1238 --- .../internlm-chat-7b/modeling_internlm.py | 1046 --- .../export/llm_models/phi-2/modeling_phi.py | 989 -- transformers/llm/export/llmexport.py | 1705 ++++ 356 files changed, 17963 insertions(+), 51052 deletions(-) create mode 100755 package_scripts/ios/buildiOS_with_armv7.sh rename source/backend/arm82/asm/arm64/{low_memory => normal_memory}/MNNPackedMatMulFP16_int4.S (100%) rename source/backend/arm82/asm/arm64/{low_memory => normal_memory}/MNNPackedMatMulFP16_int8.S (100%) rename source/backend/arm82/asm/arm64/{low_memory => normal_memory}/MNNPackedMatMulRemainFP16_int4.S (100%) rename source/backend/arm82/asm/arm64/{low_memory => normal_memory}/MNNPackedMatMulRemainFP16_int8.S (100%) create mode 100644 source/backend/cpu/arm/arm32/MNNBGRAToBGRC8.S create mode 100644 source/backend/cpu/arm/arm32/MNNBGRAToGRAYFast.S create mode 100644 source/backend/cpu/arm/arm32/MNNBGRToBGR555Fast.S create mode 
100644 source/backend/cpu/arm/arm32/MNNBGRToBGR565Fast.S create mode 100644 source/backend/cpu/arm/arm32/MNNBGRToGRAYFast.S create mode 100644 source/backend/cpu/arm/arm32/MNNC3ToC4Fast.S create mode 100644 source/backend/cpu/arm/arm32/MNNC3ToXYZFast.S create mode 100644 source/backend/cpu/arm/arm32/MNNC3ToYUVFast.S create mode 100644 source/backend/cpu/arm/arm32/MNNGRAYToC3Fast.S create mode 100644 source/backend/cpu/arm/arm32/MNNGRAYToC4Fast.S create mode 100644 source/backend/cpu/arm/arm32/MNNRGBAToBGRAFast.S create mode 100644 source/backend/cpu/arm/arm32/MNNRGBAToBGRFast.S create mode 100644 source/backend/cpu/arm/arm32/MNNRGBAToGRAYFast.S create mode 100644 source/backend/cpu/arm/arm32/MNNRGBToBGR555Fast.S create mode 100644 source/backend/cpu/arm/arm32/MNNRGBToBGR565Fast.S create mode 100644 source/backend/cpu/arm/arm32/MNNRGBToBGRC8.S create mode 100644 source/backend/cpu/arm/arm32/MNNRGBToGRAYFast.S create mode 100644 source/backend/cpu/arm/arm64/MNNBGRAToBGR.S create mode 100644 source/backend/cpu/arm/arm64/MNNBGRAToGRAY.S create mode 100644 source/backend/cpu/arm/arm64/MNNBGRToBGR555.S create mode 100644 source/backend/cpu/arm/arm64/MNNBGRToBGR565.S create mode 100644 source/backend/cpu/arm/arm64/MNNBGRToGRAY.S create mode 100644 source/backend/cpu/arm/arm64/MNNC3ToC4Fast.S create mode 100644 source/backend/cpu/arm/arm64/MNNC3ToXYZFast.S create mode 100644 source/backend/cpu/arm/arm64/MNNC3ToYUVFast.S create mode 100644 source/backend/cpu/arm/arm64/MNNGRAYToC3Fast.S create mode 100644 source/backend/cpu/arm/arm64/MNNGRAYToC4Fast.S create mode 100644 source/backend/cpu/arm/arm64/MNNPackC2.S create mode 100644 source/backend/cpu/arm/arm64/MNNRGBAToBGRAFast.S create mode 100644 source/backend/cpu/arm/arm64/MNNRGBAToBGRFast.S create mode 100644 source/backend/cpu/arm/arm64/MNNRGBAToGRAYFast.S create mode 100644 source/backend/cpu/arm/arm64/MNNRGBToBGR.S create mode 100644 source/backend/cpu/arm/arm64/MNNRGBToBGR555.S create mode 100644 source/backend/cpu/arm/arm64/MNNRGBToBGR565.S create mode 100644 source/backend/cpu/arm/arm64/MNNRGBToGRAYFast.S create mode 100644 source/backend/cpu/arm/arm64/MNNSamplerC3BilinearOpt.S rename source/backend/cpu/arm/arm64/{low_memory => normal_memory}/MNNPackedMatMulRemain_int4.S (100%) rename source/backend/cpu/arm/arm64/{low_memory => normal_memory}/MNNPackedMatMulRemain_int8.S (100%) rename source/backend/cpu/arm/arm64/{low_memory => normal_memory}/MNNPackedMatMul_int4.S (100%) rename source/backend/cpu/arm/arm64/{low_memory => normal_memory}/MNNPackedMatMul_int8.S (100%) create mode 100644 source/backend/opencl/execution/cl/gemm_conv1x1_buf.cl delete mode 100644 source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl create mode 100644 test/op/AttentionTest.cpp create mode 100644 transformers/diffusion/export/convert_mnn.py create mode 100644 transformers/llm/engine/ios/README.md create mode 100644 transformers/llm/engine/ios/ios_app.jpg create mode 100644 transformers/llm/engine/ios/mnn-llm/icon.png create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.pbxproj create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/contents.xcworkspacedata create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AccentColor.colorset/Contents.json create mode 100644 
transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AppIcon.appiconset/Contents.json create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AppIcon.appiconset/icon.png create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/Contents.json create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/ContentView.swift create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.h create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/Preview Content/Preview Assets.xcassets/Contents.json create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn-llm-Bridging-Header.h create mode 100644 transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn_llmApp.swift create mode 100644 transformers/llm/engine/model/bench.txt delete mode 100644 transformers/llm/export/README_en.md delete mode 100644 transformers/llm/export/llm_export.py delete mode 100755 transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py delete mode 100755 transformers/llm/export/llm_models/Llama-2-7b-chat-ms/config.json delete mode 100644 transformers/llm/export/llm_models/Llama-2-7b-chat-ms/configuration_llama.py delete mode 100644 transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py delete mode 100755 transformers/llm/export/llm_models/Llama-3-8B-Instruct/config.json delete mode 100644 transformers/llm/export/llm_models/Llama-3-8B-Instruct/configuration_llama.py delete mode 100644 transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py delete mode 100644 transformers/llm/export/llm_models/MiniCPM-1.2b/config.json delete mode 100644 transformers/llm/export/llm_models/MiniCPM-1.2b/configuration_llama.py delete mode 100644 transformers/llm/export/llm_models/MiniCPM-1.2b/convert_minicpm_to_llama.py delete mode 100644 transformers/llm/export/llm_models/MiniCPM-1.2b/modeling_llama.py delete mode 100644 transformers/llm/export/llm_models/MiniCPM-2.4b/config.json delete mode 100644 transformers/llm/export/llm_models/MiniCPM-2.4b/configuration_llama.py delete mode 100644 transformers/llm/export/llm_models/MiniCPM-2.4b/modeling_llama.py delete mode 100755 transformers/llm/export/llm_models/Qwen-1_8B-Chat/modeling_qwen.py delete mode 100755 transformers/llm/export/llm_models/Qwen-1_8B/modeling_qwen.py delete mode 100644 transformers/llm/export/llm_models/Qwen-7B-Chat/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py delete mode 100755 transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py delete mode 100755 transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/Qwen1_5-4B-Chat/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-4B-Chat/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/Qwen1_5-7B-Chat/config.json 
delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-7B-Chat/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/Qwen2-1_5B/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json delete mode 100644 transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py delete mode 100644 transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py delete mode 100755 transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json delete mode 100644 transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py delete mode 100644 transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py delete mode 100755 transformers/llm/export/llm_models/Yi-6B-Chat/config.json delete mode 100644 transformers/llm/export/llm_models/Yi-6B-Chat/configuration_llama.py delete mode 100644 transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py delete mode 100644 transformers/llm/export/llm_models/chatglm-6b/modeling_chatglm.py delete mode 100644 transformers/llm/export/llm_models/chatglm2-6b/modeling_chatglm.py delete mode 100755 transformers/llm/export/llm_models/chatglm3-6b/modeling_chatglm.py delete mode 100755 transformers/llm/export/llm_models/codegeex2-6b/modeling_chatglm.py delete mode 100755 transformers/llm/export/llm_models/deepseek-llm-7b-chat/config.json delete mode 100644 transformers/llm/export/llm_models/deepseek-llm-7b-chat/configuration_llama.py delete mode 100644 transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py delete mode 100755 transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py delete mode 100755 transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py delete mode 100644 transformers/llm/export/llm_models/phi-2/modeling_phi.py create mode 100644 transformers/llm/export/llmexport.py diff --git a/3rd_party/OpenCLHeaders/CL/cl2.hpp b/3rd_party/OpenCLHeaders/CL/cl2.hpp index 4db4f7cf6..305e88f30 100644 --- a/3rd_party/OpenCLHeaders/CL/cl2.hpp +++ b/3rd_party/OpenCLHeaders/CL/cl2.hpp @@ -403,10 +403,6 @@ # pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") # define CL_HPP_USE_CL_DEVICE_FISSION #endif -#if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS) -# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") -# define CL_HPP_ENABLE_EXCEPTIONS -#endif #if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR) # pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. 
Define CL_HPP_NO_STD_VECTOR instead") # define CL_HPP_NO_STD_VECTOR diff --git a/CMakeLists.txt b/CMakeLists.txt index a893e0854..7b940476e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,7 @@ option(MNN_INTERNAL "Build with MNN internal features, such as model authenticat option(MNN_JNI "Build MNN Jni for java to use" OFF) option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF) +option(MNN_CPU_WEIGHT_DEQUANT_GEMM "Build MNN CPU weight dequant related gemm kernels." OFF) IF (OHOS) include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake) diff --git a/codegen/OpFuse.cpp b/codegen/OpFuse.cpp index 480825470..5e1ccbb91 100644 --- a/codegen/OpFuse.cpp +++ b/codegen/OpFuse.cpp @@ -275,7 +275,7 @@ bool codegen(std::vector& infos, std::vector cmdPlugin; + std::shared_ptr cmdPlugin; { auto sourceCode = fuseModule.codegen(); if(mapKernelSources.find(sourceCode) == mapKernelSources.end()) { diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md index 092c9d1ec..95f9d5760 100644 --- a/docs/compile/cmake.md +++ b/docs/compile/cmake.md @@ -80,6 +80,7 @@ MNN使用CMake构建项目,CMake中的宏定义列表如下: | MNN_OPENCV_BENCH | 构建MNN的OpenCV功能是否开启性能benchmark,默认为`OFF` | | MNN_VULKAN_IMAGE | 构建MNN的Vulkan后端时采用Image内存模式,以便支持FP16和部分移动端上GPU的加速,默认为`ON` | | MNN_LOW_MEMORY | 是否支持低内存模式,支持低内存模式使用权值量化模型并设置`low_memory`则会使用计算时反量化,默认为`OFF` | +| MNN_CPU_WEIGHT_DEQUANT_GEMM | 是否编译CPU权重反量化的矩阵乘Kernel, 如果打开该编译宏并且在CPU推理时设置MNN::BackendConfig::MemoryMode=Memory_Normal,就会使用权重反量化算子进行权重量化模型的推理,默认为`OFF` | | MNN_SUPPORT_RENDER | 是否支持图形渲染相关算子实现,默认为 `OFF` | | MNN_SUPPORT_TRANSFORMER_FUSE | 是否支持Fuse Transformer相关OP实现,默认为 `OFF` | | MNN_BUILD_LLM | 是否构建基于MNN的llm库和demo,默认为`OFF` | diff --git a/docs/contribute/backend.md b/docs/contribute/backend.md index b54f177f6..caa10ee2a 100644 --- a/docs/contribute/backend.md +++ b/docs/contribute/backend.md @@ -1,5 +1,7 @@ # 自定义后端 -Backend是MNN对计算设备的抽象。MNN当前已经支持CPU、Vulkan、OpenCL、Metal等Backend,**只在计算设备暂未支持时新增Backend**,新增Op,请参阅[新增Op文档](customize_op)。 +Runtime-Backend是MNN对计算设备的抽象。MNN当前已经支持CPU、Vulkan、OpenCL、Metal、CUDA等Backend,**只在计算设备暂未支持时新增Backend**,新增Op,请参阅[新增Op文档](op)。 + + ## 声明 所有新增Backend都需继承`Backend`类,并实现所有纯虚函数。 @@ -10,8 +12,10 @@ class XPUBackend final : public Backend { virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op) override; virtual void onExecuteBegin() const override; virtual void onExecuteEnd() const override; - virtual bool onAcquireBuffer(const Tensor* tensor, StorageType storageType) override; - virtual bool onReleaseBuffer(const Tensor* tensor, StorageType storageType) override; + virtual void onResizeBegin() override; + virtual ErrorCode onResizeEnd() override; + + virtual MemObj* onAcquire(const Tensor* tensor, StorageType storageType) override; virtual bool onClearBuffer() override; virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override; } @@ -91,7 +95,7 @@ static XPUCreatorRegister __reg(OpType_Pooling); ``` ## 内存管理 -Backend通过`onAcquireBuffer`为tensor分配内存,通过`onReleaseBuffer`为tensor释放内存。内存有三种存储模式:`STATIC`内存不复用,一般用于op常量存储;`DYNAMIC`内存可复用,一般用于变量存储;`DYNAMIC_SEPERATE`内存在pipeline间可复用,一般用于pipeline常量存储。`_onAcquireBuffer_`_和_`_onReleaseBuffer_`_中可以不实际分配/释放内存,只记录内存用量变更,在_`_onAllocateBuffer_`_调用时,再根据用量计算出优化方案,一次性完成分配/释放。_ +Backend通过`onAcquire`创建`MemObj`内存对象,定义其析构函数以便为tensor释放内存。内存有三种存储模式:`STATIC`内存不复用,一般用于op常量存储;`DYNAMIC`内存可复用,一般用于变量存储;`DYNAMIC_SEPERATE`内存在pipeline间可复用,一般用于pipeline常量存储。 ```cpp /** backend buffer storage type */ @@ -118,31 +122,13 
@@ enum StorageType { */ DYNAMIC_SEPERATE }; -/** - * @brief allocate buffer of tensor for given storage type. - * @param tensor buffer provider. - * @param storageType buffer storage type. - * @return success or not. - */ -virtual bool onAcquireBuffer(const Tensor* tensor, StorageType storageType) = 0; -/** - * @brief release buffer of tensor for given storage type. - * @param tensor buffer provider. - * @param storageType buffer storage type. - * @return success or not. - */ -virtual bool onReleaseBuffer(const Tensor* tensor, StorageType storageType) = 0; -``` - -在所有内存都分配完成后,backend会收到`onAllocateBuffer`回调: -```cpp -/** - * @brief callback after all buffers needed by backend ops were allocated. - * @return success or not. (result not used currently) - */ -virtual bool onAllocateBuffer() { - return true; -} + /** + * @brief allocate buffer of tensor for given storage type. + * @param tensor buffer provider. + * @param storageType buffer storage type. + * @return MemObj for release, if failed, return nullptr. + */ + virtual MemObj* onAcquire(const Tensor* tensor, StorageType storageType) = 0; ``` Backend在调用`onClearBuffer`时,需要释放所有`DYNAMIC`和`DYNAMIC_SEPERATE`存储模式的内存: @@ -189,17 +175,47 @@ virtual void onExecuteEnd() const = 0; ``` -## 注册Backend -最后,定义Backend Creator,注册方法中调用`MNNInsertExtraBackendCreator`就可以完成Backend的注册,这里的注册方法需要在BackendRegister.cpp中声明并调用: +## Runtime(运行时) +对于使用同一种后端,且存在先后顺序,不会同时运行的模型,MNN提供机制使其共享部分计算资源,比如线程池,内存池等等。 +这部分计算资源使用Runtime存储。而Backend则由Runtime创建 + +### 实现Runtime +Runtime主要实现如下接口: + +``` + virtual Backend* onCreate(const BackendConfig* config = nullptr, Backend* origin = nullptr) const = 0; + + /** + @brief reset runtime + */ + virtual void onReset(int numberThread, const BackendConfig* config, bool full) { + // Do nothing + } + + /** + @brief clear unuseful resource + @param level clear level: 0 - 100, bigger mean clear more, smaller mean cache more + */ + virtual void onGabageCollect(int level) = 0; + +``` + +- onCreate :创建 Backend +- onReset :重设默认配置 +- onGabageCollect :清理资源以节省内存 + + +### 注册Runtime +注册方法中调用`MNNInsertExtraRuntimeCreator`就可以完成Runtime的注册,这里的注册方法需要在Backend.cpp中声明并调用: ```cpp -class XPUBackendCreator : public BackendCreator { - virtual Backend *onCreate(const Backend::Info &info) const { - return new MetalBackend; +class XPURuntimeCreator : public RuntimeCreator { + virtual Runtime* onCreate(const Backend::Info &info) const { + return new XPURuntime; } }; -void registerCPUBackendCreator() { - MNNInsertExtraBackendCreator(MNN_FORWARD_CPU, new CPUBackendCreator); +void registerXPURuntimeCreator() { + MNNInsertExtraBackendCreator(MNN_FORWARD_XPU, new XPURuntimeCreator); }; ``` -使用cmake编译时,完成代码修改后,也需要相应修改CMakeLists.txt。 \ No newline at end of file +使用cmake编译时,完成代码修改后,也需要相应修改CMakeLists.txt。 diff --git a/docs/contribute/op.md b/docs/contribute/op.md index 059a84d25..7a28397de 100644 --- a/docs/contribute/op.md +++ b/docs/contribute/op.md @@ -1,6 +1,14 @@ # 自定义算子 ## 概述 -在添加自定义算子前,请参阅[算子列表](../en/ops),避免不必要的重复。 +在添加自定义算子前,请查看算子列表,避免不必要的重复。 + +```bash +./MNNConvert -f CAFFE --OP +./MNNConvert -f TF --OP +./MNNConvert -f ONNX --OP +./MNNConvert -f TORCH --OP +``` + ### MNN 算子转换与实现结构 MNN 的算子转换与实现如下图, - 模型转换包括以下步骤,二选一: diff --git a/docs/faq.md b/docs/faq.md index c9b6344c0..db7241f12 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -250,7 +250,7 @@ OpenCL / Vulkan 采用静态变量自注册的方式往 MNN 主库注册后端. 
## 性能相关 -### 使用 GPU 时,调用 copyToHostTensor / copyFromHostTensor 非常慢 +### 使用 GPU 时,调用 copyToHostTensor / readMap 非常慢 GPU 后端调用 copy 的时间包含两个部分 - 异构数据拷贝 @@ -258,7 +258,7 @@ GPU 后端调用 copy 的时间包含两个部分 对 GPU 后端而言,在数据被要求对用户可见(比如复制 output tensor 数据出来)之前,是允许异步执行的。 在数据被用户要求可见之时,会等待相应的异步操作完成。 -因此有可能 复制 output tensor 的过程包括了等待 GPU 算子异步执行完成,导致缓慢。 +因此有可能 复制 output tensor 的过程包括了等待 GPU 算子异步执行完成,导致看上去缓慢。 ### GPU 为什么比 CPU 跑得慢? 有如下原因: diff --git a/docs/index.rst b/docs/index.rst index 827a85235..174a53cf3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -72,7 +72,7 @@ .. toctree:: :maxdepth: 1 - :caption: 测试工具 + :caption: 工具集 :name: tools tools/convert diff --git a/docs/inference/module.md b/docs/inference/module.md index 7ec90a8a4..3fea3589a 100644 --- a/docs/inference/module.md +++ b/docs/inference/module.md @@ -5,19 +5,25 @@ - 模型推理与`Session`的区别是不需要用户显式resize,支持控制流,所以当模型中有`if`或`while`时必须使用`Module`推理 ### 相关数据结构 - `Module` Module接口的核心类,表示一个模型的虚类;实际加载模型时会创建其子类 -- `Executor` 包含若干个`RuntimeManager`,提供内存管理接口,每个`Executor`必须在单线程环境下运行。默认提供全局 `Executor`,需要并发执行时,可自行创建。 -- `ExecutorScope` 用于在子线程中绑定`Executor`,多线程并发必需 -- `VARP` 作为`Module`的输入输出,也是[Expr API](expr.md)中的基础数据结构 +- `Executor` 提供内存管理和后端资源管理能力,每个`Executor`必须在单线程环境下运行。同一个`Executor`可以用于多个顺序执行的`Module` +- `ExecutorScope` 用于在子线程中绑定`Executor`,多线程并发必需。默认在创建`Module`时使用全局 `Executor`,如果有多个Module在不同线程并发执行时,需要各自创建`Executor`,并用`ExecutorScope`绑定。 +- `VARP` 是`Module`的输入输出,也是[Expr API](expr.md)中的基础数据结构 ## 工作流程 -配置Executor(可选) -> 创建 RuntimeManager(可选) -> 创建Module -> 创建输入VARP -> 使用Module::forwad推理 -> 使用输出VARP -> 销毁Module -### (可选)配置Executor -`Executor`给用户提供接口来配置推理后端、线程数等属性,以及做性能统计、算子执行的回调函数、内存回收等功能。 提供一个全局的Exector对象,用户不用创建或持有对象即可直接使用。 +创建和配置Executor -> 创建 RuntimeManager(可选) -> 创建Module -> 创建输入VARP -> 使用Module::forwad推理 -> 使用输出VARP -> 销毁Module -> 销毁Executor +### 创建和配置Executor +`Executor`给用户提供接口来配置推理后端、线程数等属性,以及做性能统计、算子执行的回调函数、内存回收等功能。 推荐针对自身模块创建单独的Executor ,若使用全局的Exector对象,对于多个模块在不同线程运行时可能会发生冲突。 ```cpp -// 配置默认全局Exector -MNN::BackendConfig backend_config; // default backend config +// 创建Exector +MNN::BackendConfig backendConfig; // default backend config +std::shared_ptr executor = MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, backendConfig, 1); + // 设置使用4线程+CPU -MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, backend_config, 4); +executor->setGlobalExecutorConfig(MNN_FORWARD_CPU, backend_config, 4); + +// 绑定Executor,在创建/销毁/使用Module或进行表达式计算之前都需要绑定 +MNN::Express::ExecutorScope _s(executor); + ``` ### (可选)创建 RuntimeManager @@ -39,6 +45,68 @@ std::shared_ptr rtmgr(MNN::Express::Exec rtmgr->setCache(".cachefile"); ``` +RuntimeManager 可以设置 hint , mode , cache, externalpath ,以支持扩展功能。 + +``` +void setCache(std::string cacheName); +void updateCache(); +void setMode(Interpreter::SessionMode mode); +void setHint(Interpreter::HintMode mode, int value); +void setExternalPath(std::string path, int type); +bool getInfo(Interpreter::SessionInfoCode code, void* ptr); +``` + +#### cache 设置 +对于GPU后端(Metal/OpenCL等),可以设置缓存文件路径,存储AutoTuning结果和Program编译结果,以加速第二次之后的Module load 过程。 + +``` + std::shared_ptr rtmgr(Executor::RuntimeManager::createRuntimeManager(config)); + rtmgr->setCache(cacheFileName); + + std::shared_ptr module(Module::load(inputNames, outputNames, modelName.c_str(), rtmgr, mdConfig)); + /*... 
Make Inputs*/ + auto outputs = module->onForward(inputs); + + // Update cache file + rtmgr->updateCache(); +``` + +#### mode 设置 +可以通过设置mode开启/关闭一些功能,示例: + +``` +// 创建出来的 Module 支持插入回调函数 +rtmgr->setMode(Interpreter::Session_Debug); +``` + +并非所有枚举都适用 Module 的创建,有效值如下: + +- Interpreter::SessionMode::Session_Debug : 支持逐算子调试 +- Interpreter::SessionMode::Session_Release : 关闭逐算子调试功能,可以轻微提升性能【默认选项】 +- Interpreter::SessionMode::Session_Backend_Fix : 固定使用用户设置的后端【默认选项】 +- Interpreter::SessionMode::Session_Backend_Auto : MNN根据用户倾向,预估load Module耗时,如果耗时较短则使用用户设置的后端,否则使用CPU + + +#### hint 设置 +通过 hint 设置,可以在后端支持的情况下设置相应属性,有效值如下: + +- Interpreter::HintMode::WINOGRAD_MEMORY_LEVEL :使用 Winograd 算法优化卷积时,内存占用倾向,默认为 3 ,若希望降低内存占用可设为 0 +- Interpreter::HintMode::GEOMETRY_COMPUTE_MASK :几何计算相关优化开关,1为区域合并,2为复合区域合并,4为使用loop算子,8为支持几何计算重计算,需要多个功能开启时把对应值叠加。默认为功能全开。 +- Interpreter::HintMode::DYNAMIC_QUANT_OPTIONS :动态量化选项,1为 Per Batch,2为Per Tensor 。默认为2。 +- Interpreter::HintMode::CPU_LITTLECORE_DECREASE_RATE :对于 Android 设备存在大中小核的情况,大核算力到中核算力的衰减比例。默认为50(中核算力为大核的50%) + + +#### ExternalPath +在设备可能出现内存不足时,可以通过 setExternalPath 指定路径,让MNN把部分内存用mmap分配。这样操作系统可在内存不足时会将其转换为读写文件,避免内存不足程序闪退。示例: + +``` +runtime_manager_->setExternalPath("tmp", MNN::Interpreter::EXTERNAL_WEIGHT_DIR); +runtime_manager_->setExternalPath("tmp", MNN::Interpreter::EXTERNAL_FEATUREMAP_DIR); +``` + +- MNN::Interpreter::EXTERNAL_WEIGHT_DIR : 权重重排后的内存转换为文件存储 +- MNN::Interpreter::EXTERNAL_FEATUREMAP_DIR : 中间内存转换为文件存储 + ### 创建Module `Module`可以通过指定模型,输入输出的名称,配置文件创建 ```cpp diff --git a/docs/start/overall.md b/docs/start/overall.md index 02203b20e..de13f21e0 100644 --- a/docs/start/overall.md +++ b/docs/start/overall.md @@ -6,6 +6,6 @@ ### 训练 在训练框架上,根据训练数据训练出模型的阶段。虽然当前MNN也提供了[训练模型的能力](../train/expr.md),但主要用于端侧训练或模型调优。在数据量较大时,依然建议使用成熟的训练框架,如TensorFlow、PyTorch等。除了自行训练外,也可以直接利用开源的预训练模型。 ### 转换 -将其他训练框架模型转换为MNN模型的阶段。MNN当前支持Tensorflow(Lite)、Caffe、ONNX和TorchScript的模型转换。模型转换工具可以参考[编译文档](../compile/tools.html#id2)和[使用说明](../tools/convert.md)。支持转换的算子,可以参考[算子列表文档](../tools/convert.html#id7);在遇到不支持的算子时,可以尝试[自定义算子](../contribute/op.md),或在Github上给我们[提交issue](https://github.com/alibaba/MNN/issues/74)。此外,[模型打印工具](../tools/convert.html#id8)可以用于输出模型结构,辅助调试。除模型转换外,MNN也提供了[模型量化工具](../tools/quant.md),可以对浮点模型进行量化压缩。 +将其他训练框架模型转换为MNN模型的阶段。MNN当前支持Tensorflow(Lite)、Caffe、ONNX和TorchScript的模型转换。模型转换工具可以参考[使用说明](../tools/convert.md)。支持转换的算子,可以参考[算子列表文档](../tools/convert.html#id7);在遇到不支持的算子时,可以尝试[自定义算子](../contribute/op.md),或在Github上给我们[提交issue](https://github.com/alibaba/MNN/issues/74)。此外,[模型打印工具](../tools/convert.html#id8)可以用于输出模型结构,辅助调试。除模型转换外,MNN也提供了[模型量化工具](../tools/quant.md),可以对浮点模型进行量化压缩。 ### 推理 -在端侧加载MNN模型进行推理的阶段。端侧运行库的编译请参考各平台的编译文档:[iOS](../compile/engine.html#ios)、[Android](../compile/engine.html#android)、[Linux/macOS/Ubuntu](../compile/engine.html#linux-macos)、[Windows](../compile/engine.html#windows)。我们提供了[API接口文档](https://github.com/alibaba/MNN/tree/master/doc/API),也详细说明了[会话创建](../inference/session.html#id1)、[数据输入](../inference/session.html#id8)、[执行推理](../inference/session.html#id17)、[数据输出](../inference/session.html#id21)相关的接口和参数。`demo/exec`下提供了使用示例,如图像识别 `demo/exec/pictureRecognition.cpp` ,图像实例分割(人像分割)`demo/exec/segment.cpp`,[更多demo](demo.md)。此外,[测试工具](../tools/test.md)和[benchmark工具](../tools/benchmark.md)也可以用于问题定位。 \ No newline at end of file 
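The Module workflow documented in module.md above (create an Executor, bind it with ExecutorScope, optionally configure a RuntimeManager, then load and run the Module) is spread across several fragments; the sketch below strings them together into one minimal flow. It is illustrative only: the model path `model.mnn`, the tensor names, the 1x3x224x224 input shape and the particular mode/hint values are placeholder assumptions, while the calls themselves (`Executor::newExecutor`, `ExecutorScope`, `RuntimeManager::createRuntimeManager`, `setCache`/`setMode`/`setHint`, `Module::load`, `onForward`, `updateCache`) follow the snippets quoted in module.md.

```cpp
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/expr/Module.hpp>

using namespace MNN::Express;

int main() {
    // 1. Dedicated Executor (CPU, 4 threads) instead of the global one
    MNN::BackendConfig backendConfig;
    std::shared_ptr<Executor> executor = Executor::newExecutor(MNN_FORWARD_CPU, backendConfig, 4);
    ExecutorScope scope(executor); // bind before creating / running the Module

    // 2. Optional RuntimeManager with cache / mode / hint settings
    MNN::ScheduleConfig sConfig;
    sConfig.type = MNN_FORWARD_CPU;
    std::shared_ptr<Executor::RuntimeManager> rtmgr(
        Executor::RuntimeManager::createRuntimeManager(sConfig));
    rtmgr->setCache(".cachefile");                              // tuning / program cache for GPU backends
    rtmgr->setMode(MNN::Interpreter::Session_Debug);            // enable per-op callbacks
    rtmgr->setHint(MNN::Interpreter::WINOGRAD_MEMORY_LEVEL, 0); // prefer lower memory for Winograd

    // 3. Load the Module (names and path are placeholders)
    std::vector<std::string> inputNames  = {"input"};
    std::vector<std::string> outputNames = {"output"};
    Module::Config mConfig;
    std::shared_ptr<Module> net(Module::load(inputNames, outputNames, "model.mnn", rtmgr, &mConfig));

    // 4. Build an input VARP, fill it, and run inference
    auto input = _Input({1, 3, 224, 224}, NCHW);
    auto inputPtr = input->writeMap<float>();
    ::memset(inputPtr, 0, 1 * 3 * 224 * 224 * sizeof(float)); // placeholder data
    auto outputs = net->onForward({input});
    auto outputPtr = outputs[0]->readMap<float>();
    (void)outputPtr;

    // 5. Persist any updated tuning cache before tearing everything down
    rtmgr->updateCache();
    return 0;
}
```

As the module.md text above notes, if several Modules run concurrently in different threads, each thread should create its own Executor and bind it with its own ExecutorScope rather than sharing the global Executor.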
+在端侧加载MNN模型进行推理的阶段。端侧运行库的编译请参考各平台的编译文档:[iOS](../compile/engine.html#ios)、[Android](../compile/engine.html#android)、[Linux/macOS/Ubuntu](../compile/engine.html#linux-macos)、[Windows](../compile/engine.html#windows)。我们提供了[API接口文档](https://github.com/alibaba/MNN/tree/master/doc/API),也详细说明了[会话创建](../inference/session.html#id1)、[数据输入](../inference/session.html#id8)、[执行推理](../inference/session.html#id17)、[数据输出](../inference/session.html#id21)相关的接口和参数。`demo/exec`下提供了使用示例,如图像识别 `demo/exec/pictureRecognition.cpp` ,图像实例分割(人像分割)`demo/exec/segment.cpp`,[更多demo](demo.md)。此外,[测试工具](../tools/test.md)和[benchmark工具](../tools/benchmark.md)也可以用于问题定位。 diff --git a/docs/tools/convert.md b/docs/tools/convert.md index fdc707bc1..b815405bf 100644 --- a/docs/tools/convert.md +++ b/docs/tools/convert.md @@ -1,5 +1,4 @@ # 模型转换工具 -[从源码编译](../compile/tools.html#id2) ## 参数说明 ```bash Usage: diff --git a/docs/tools/quant.md b/docs/tools/quant.md index c2a26d1d5..1a66b6e1b 100644 --- a/docs/tools/quant.md +++ b/docs/tools/quant.md @@ -1,7 +1,7 @@ # 单输入模型离线量化工具 `./quantized.out origin.mnn quan.mnn imageInputConfig.json` -通用(任意输入个数、维度、类型)模型离线量化请看[说明](https://mnn-docs.readthedocs.io/en/latest/tools/compress.html#id10) +MNN quantized.out工具已支持通用(任意输入个数、维度、类型)模型离线量化, 但这里的多输入模型仅仅支持非图片输入类模型。 MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查看[文档](https://mnn-docs.readthedocs.io/en/latest/tools/compress.html)选择使用 @@ -38,6 +38,10 @@ MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查 | MAX_ABS | 使用权值的绝对值的最大值进行对称量化 | | ADMM | 使用ADMM方法进行权值量化 | +## 多输入模型的参数设置的特别说明(MNN现阶段仅支持输入数据类型是非图片的多输入模型) +| input_type | `str` | 输入数据的类型,"sequence" | +| path | `str` | 存放校正特征量化系数的输入数据目录 |,例如该目录下包含2个输入数据集input_0和input_1,子目录input_0和input_1中包含模型的输入数据和一个input.json文件。input_0和input_1分别是两个输入输出信息文件夹,可使用 testMNNFromOnnx.py 等脚本生成,参考模型转换的正确性校验部分。 + ## 量化模型的使用 和浮点模型同样使用方法,输入输出仍然为浮点类型 ## 参考资料 diff --git a/docs/tools/test.md b/docs/tools/test.md index 532877f9e..02c2d3df0 100644 --- a/docs/tools/test.md +++ b/docs/tools/test.md @@ -1,5 +1,5 @@ # 测试工具 -[从源码编译](../compile/tools.html#id4)使用cmake编译时,build目录下的产物也包含测试使用的工具集,下面逐项说明。 +使用cmake编译时,默认打开 MNN_BUILD_TOOLS 编译宏,对应build目录下的产物也包含测试使用的工具集,下面逐项说明。 ## GetMNNInfo ### 功能 @@ -95,6 +95,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms - 128 : 使用文件夹下面的 input.mnn 和 output.mnn 做为输入和对比输出,对于数据量较大的情况宜用此方案 - 512 : 开启使用Winograd算法计算卷积时的内存优化,开启后模型的运行时内存会降低,但可能导致性能损失。 - 1024: 使用动态量化推理时,对输入数据分batch量化以提高模型的推理精度 +- 2048: 使用mmap方式,使用文件存储中间内存。存储文件的目录为当前目录/tmp,需要先建tmp文件夹 ### 示例 @@ -262,19 +263,10 @@ stopOp.c_str()=s in main, 278 Correct ! Run second pass Correct ! 
``` -### 在Android中使用 -先编译相关的库和可执行文件,然后push到Android手机上,用adb执行命令,参考`project/android/testCommon.sh` -```bash -cd project/android -mkdir build_64 -cd build_64 && ../build_64.sh -../updateTest.sh -../testCommon.sh ./backendTest.out temp.mnn 3 0.15 1 -``` ## getPerformance ### 功能 -获取当前设备的CPU性能,打印出每个CPU核心的频率;在Android设备上还会打印该设备CPU的浮点计算能力(GFLOPS) +获取当前设备的CPU性能和内存访问性能,打印出每个CPU核心的频率;在Android设备上还会打印该设备CPU的浮点计算能力(GFLOPS) *各核心频率仅在Linux/Android环境中有效,计算能力仅在Android中有效* ### 参数 @@ -475,6 +467,7 @@ Matrix: ### 示例 ```bash $ ./fuseTest user.spirv user.json +``` ## GpuInterTest.out ### 功能 @@ -488,3 +481,22 @@ GPU 内存输入测试用例 - `forwardType:int` 执行推理的计算设备,有效值为:0(CPU)、1(Metal)、2(CUDA)、3(OpenCL)、6(OpenGL),7(Vulkan) ,9 (TensorRT),可选,默认为`0` - `numberThread:int` GPU的线程数,可选,默认为`1` - `precision_memory:int` 测试精度与内存模式,precision_memory % 16 为精度,有效输入为:0(Normal), 1(High), 2(Low), 3(Low_BF16),可选,默认为`2` ; precision_memory / 16 为内存设置,默认为 0 (memory_normal) 。例如测试 memory 为 2(low) ,precision 为 1 (high) 时,设置 precision_memory = 9 (2 * 4 + 1) + + +## 在Android中使用测试工具 +- project/android/updateTest.sh 可以把编译好的库和可执行文件 push 到Android手机的/data/local/tmp/MNN 目录 +- project/android/testCommon.sh 可以在 /data/local/tmp/MNN 目录下执行可执行程序 + +其他的资源文件需要自行使用 adb push ,将其放到手机的 /data/local/tmp/MNN 目录下,比如 adb push temp.mnn /data/local/tmp/MNN/temp.mnn + +如下例子是在Android设备上使用 backendTest.out ,其中 temp.mnn 路径为 /data/local/tmp/MNN/temp.mnn + +```bash +cd project/android +mkdir build_64 +cd build_64 && ../build_64.sh +../updateTest.sh +../testCommon.sh ./backendTest.out temp.mnn 3 0.15 1 +``` + + diff --git a/docs/transformers/diffusion.md b/docs/transformers/diffusion.md index 70e64766b..5c6d341fb 100644 --- a/docs/transformers/diffusion.md +++ b/docs/transformers/diffusion.md @@ -17,8 +17,8 @@ https://huggingface.co/IDEA-CCNL/Taiyi-Stable-Diffusion-1B-Chinese-v0.1/tree/mai ## 模型转换 ### 将Huggingface的Stable Diffusion模型 转为onnx模型 ```sh -cd mnn_path/transformers/diffusion/ -python export/onnx_export.py \ +cd mnn_path/transformers/diffusion/export +python onnx_export.py \ --model_path hf_sd_load_path \ --output_path onnx_save_path ``` @@ -30,20 +30,19 @@ conda activate ldm 在conda环境中执行模型转换脚本 ### 将onnx模型转为mnn模型 -新建diffusion mnn模型文件夹,将转好的mnn文件放在该文件夹下。 -1. 实现encoder从onnx模型 -> mnn模型 -``` -./MNNConvert -f ONNX --modelFile onnx_save_path/text_encoder/model.onnx --MNNModel mnn_save_path/text_encoder.mnn --weightQuantBits 8 --bizCode biz -``` -2. 实现denoiser unet从onnx模型 -> mnn模型 +新建diffusion mnn模型文件夹 mnn_save_path ,将转好的mnn文件放在该文件夹下。 + +执行脚本 ``` -./MNNConvert -f ONNX --modelFile onnx_save_path/unet/model.onnx --MNNModel mnn_save_path/unet.mnn --transformerFuse --weightQuantBits 8 --bizCode biz -注意:对于非OpenCL后端推理,需要去掉--transformerFuse。 +python3 convert_mnn.py ../onnx ~/alicnn/AliNNPrivate/build/diffusion "--weightQuantBits=8" ``` -3. 实现decoder从onnx模型 -> mnn模型 + +若希望在OpenCL后端上进一步加速,可加上--transformerFuse: ``` -./MNNConvert -f ONNX --modelFile onnx_save_path/vae_decoder/model.onnx --keepInputFormat --MNNModel mnn_save_path/vae_decoder.mnn --weightQuantBits 8 --bizCode biz +# 适用OpenCL 后端推理 +python3 convert_mnn.py onnx_path mnn_save_path "--weightQuantBits=8 --transformerFuse" ``` + ## 编译Diffusion Demo ### Linux/MAC/Windows上 ``` diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md index 5e77ab0cb..0d00de862 100644 --- a/docs/transformers/llm.md +++ b/docs/transformers/llm.md @@ -6,68 +6,59 @@ ## 模型导出 -`llm_export`是一个llm模型导出工具,能够将llm模型导出为onnx和mnn模型。 +`llmexport`是一个llm模型导出工具,能够将llm模型导出为onnx和mnn模型。 ### 用法 1. 
将需要导出的LLM项目clone到本地,如:Qwen2-0.5B-Instruct ```sh git clone https://www.modelscope.cn/qwen/Qwen2-0.5B-Instruct.git ``` -3. 执行`llm_export.py`导出模型 +3. 执行`llmexport.py`导出模型 ```sh cd ./transformers/llm/export # 导出模型,tokenizer和embedding,并导出对应的mnn模型 -python llm_export.py \ - --type Qwen2-0_5B-Instruct \ +python llmexport.py \ --path /path/to/Qwen2-0.5B-Instruct \ - --export \ - --export_token \ - --export_embed --embed_bin \ - --export_mnn + --export mnn ``` 4. 导出产物 导出产物为: -1. `embeddings_bf16.bin`: 模型的embedding权重二进制文件,推理时使用; -2. `llm_config.json`: 模型的配置信息,推理时使用; -3. `llm.onnx`: 模型的onnx文件,推理时不使用; -4. `tokenizer.txt`: 模型的tokenzier文件,推理时使用; -5. `llm.mnn`: 模型的mnn文件,推理时使用; -6. `llm.mnn.weight`: 模型的mnn权重,推理时使用; +1. `config.json`: 模型运行时的配置,可手动修改; +2. `embeddings_bf16.bin`: 模型的embedding权重二进制文件,推理时使用; +3. `llm.mnn`: 模型的mnn文件,推理时使用; +4. `llm.mnn.json`: mnn模型对应的json文件,apply_lora或者gptq量化权重时使用; +5. `llm.mnn.weight`: 模型的mnn权重,推理时使用; +6. `llm.onnx`: 模型的onnx文件,不包含权重,推理时不使用; +7. `llm_config.json`: 模型的配置信息,推理时使用; +8. `tokenizer.txt`: 模型的tokenzier文件,推理时使用; 目录结构如下所示: ``` . -├── onnx -| ├── embeddings_bf16.bin -| ├── llm_config.json -| ├── llm.onnx -| └── tokenizer.txt -└── mnn +└── model + ├── config.json + ├── embeddings_bf16.bin ├── llm.mnn - └── llm.mnn.weight + ├── llm.mnn.json + ├── llm.mnn.weight + ├── llm.onnx + ├── llm_config.json + └── tokenizer.txt ``` ### 功能 -- 支持将模型完整导出为一个onnx模型,使用`--export` -- 支持将模型分段导出为多个模型,使用`--export_split` -- 支持导出模型的词表到一个文本文件,每行代表一个token;其中token使用base64编码;使用`--export_verbose` -- 支持导出模型的Embedding层为一个onnx模型,使用`--export_embed`,同时支持bf16格式,使用`--embed_bf16` -- 支持分层导出模型的block,使用`--export_blocks`导出全部层;使用`--export_block $id`导出指定层 -- 支持导出模型的lm_head层为一个onnx模型,使用`--export_lm` -- 支持导出多模态模型的visual模型为一个onnx模型,使用`--export_visual` +- 支持将模型为onnx或mnn模型,使用`--export onnx`或`--export mnn` - 支持对模型进行对话测试,使用`--test $query`会返回llm的回复内容 -- 支持在导出onnx模型后使用onnxruntime对结果一致性进行校验,使用`--export_test` -- 支持将tokenizer导出为文本文件,使用`--export_token` -- 支持将导出的onnx模型转换为mnn模型,默认转换为非对称4bit量化,使用`--export_mnn` -- 指定导出路径使用`--onnx_path`和`--mnn_path` - 默认会使用onnx-slim对onnx模型进行优化,跳过该步骤使用`--skip_slim` - 支持合并lora权重后导出,指定lora权重的目录使用`--lora_path` +- 制定量化bit数使用`--quant_bit`;量化的block大小使用`--quant_block` +- 使用`--lm_quant_bit`来制定lm_head层权重的量化bit数,不指定则使用`--quant_bit`的量化bit数 +- 支持使用自己编译的`MNNConvert`,使用`--mnnconvert` ### 参数 ``` -usage: llm_export.py [-h] --path PATH - [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora}] - [--lora_path LORA_PATH] [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] - [--export_embed] [--export_visual] [--export_lm] [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bin] [--embed_bf16] [--skip_slim] +usage: llmexport.py [-h] --path PATH [--type TYPE] [--lora_path LORA_PATH] [--dst_path DST_PATH] [--test TEST] [--export EXPORT] + [--skip_slim] [--quant_bit QUANT_BIT] [--quant_block QUANT_BLOCK] [--lm_quant_bit LM_QUANT_BIT] + [--mnnconvert MNNCONVERT] llm_exporter @@ -77,33 +68,22 @@ options: Can be either: - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO] - A path to a *directory* clone from repo like `../chatglm-6b`. 
- --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Qwen2-1_5B-Instruct,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora} - type(`str`, *optional*): + --type TYPE type(`str`, *optional*): The pretrain llm model type. --lora_path LORA_PATH lora path, defaut is `None` mean not apply lora. - --onnx_path ONNX_PATH - export onnx model path, defaut is `./onnx`. - --mnn_path MNN_PATH export mnn model path, defaut is `./mnn`. - --export_mnn Whether or not to export mnn model after onnx. - --export_verbose Whether or not to export onnx with verbose. - --export_test Whether or not to export onnx with test using onnxruntime. + --dst_path DST_PATH export onnx/mnn model to path, defaut is `./model`. --test TEST test model inference with query `TEST`. - --export export model to an `onnx` model. - --export_split export model split to some `onnx` models: - - embedding model. - - block models. - - lm_head model. - --export_token export llm tokenizer to a txt file. - --export_embed export llm embedding to an `onnx` model. - --export_visual export llm visual model to an `onnx` model. - --export_lm export llm lm_head to an `onnx` model. - --export_block EXPORT_BLOCK - export llm block [id] to an `onnx` model. - --export_blocks export llm all blocks to `onnx` models. - --embed_bin export embedding weight as bin file with dtype `bfloat16` - --embed_bf16 using `bfloat16` replace `float32` in embedding. + --export EXPORT export model to an onnx/mnn model. --skip_slim Whether or not to skip onnx-slim. + --quant_bit QUANT_BIT + mnn quant bit, 4 or 8, default is 4. + --quant_block QUANT_BLOCK + mnn quant block, default is 0 mean channle-wise. + --lm_quant_bit LM_QUANT_BIT + mnn lm_head quant bit, 4 or 8, default is `quant_bit`. + --mnnconvert MNNCONVERT + local mnnconvert path, if invalid, using pymnn. 
``` ## 模型推理 @@ -111,6 +91,37 @@ options: ### 编译 [从源码编译](../compile/other.html#id4) +在原有编译过程中增加必需编译宏即可: -DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true + +- mac / linux / windows + +以 mac / linux 为例 : +``` +make build +cd build +cmake ../ -DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true +make -j16 +``` + +x86架构额外加 MNN_AVX512 的宏: +``` +make build +cd build +cmake ../ -DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_AVX512=true +make -j16 +``` + +- Android:额外增加 MNN_ARM82 的宏 +``` +cd project/android +mkdir build_64 +../build_64.sh "-DMNN_LOW_MEMORY=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true -DMNN_BUILD_LLM=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_ARM82=true" +``` + +- iOS: 参考 transformers/llm/engine/ios/README.md +``` +sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_BUILD_LLM=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true" +``` ### 使用 #### 运行时配置 @@ -144,11 +155,16 @@ options: - 推理配置 - max_new_tokens: 生成时最大token数,默认为`512` - reuse_kv: 多轮对话时是否复用之前对话的`kv cache`,默认为`false` - - quant_kv: 存储`kv cache`时是否量化,可选为:`0, 1, 2, 3`,默认为`0`,含义如下: + - quant_qkv: CPU attention 算子中`query, key, value`是否量化,可选为:`0, 1, 2, 3, 4`,默认为`0`,含义如下: - 0: key和value都不量化 - 1: 使用非对称8bit量化存储key - - 2: 使用fp8格式寸处value - - 3: 使用非对称8bit量化存储key,使用fp8格式寸处value + - 2: 使用fp8格式量化存储value + - 3: 使用非对称8bit量化存储key,使用fp8格式量化存储value + - 4: 量化kv的同时使用非对称8bit量化query,并使用int8矩阵乘计算Q*K + - use_mmap: 是否使用mmap方式,在内存不足时将权重写入磁盘,避免溢出,默认为false,手机上建议设成true + - kvcache_mmap: 是否使用mmap方式,在内存不足时将在KV Cache 写入磁盘,避免溢出,默认为false + - tmp_path: 启用 mmap 相关功能时,写入磁盘的缓存目录 + - iOS 上可用如下语句创建临时目录并设置:`NSString *tempDirectory = NSTemporaryDirectory();llm->set_config("{\"tmp_path\":\"" + std::string([tempDirectory UTF8String]) + "\"}")` - 硬件配置 - backend_type: 推理使用硬件后端类型,默认为:`"cpu"` - thread_num: CPU推理使用硬件线程数,默认为:`4`; OpenCL推理时使用`68` @@ -266,4 +282,4 @@ options: thread1.join(); thread2.join(); } - ``` \ No newline at end of file + ``` diff --git a/express/Executor.cpp b/express/Executor.cpp index 437d72df6..5f6a6dd48 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -154,9 +154,8 @@ std::shared_ptr Executor::getGlobalExecutor() { RuntimeHint hint; hint.memoryAllocatorType = 0;// Defer bn->setRuntimeHint(hint); - static std::shared_ptr executorStatic; - executorStatic.reset(new Executor(bn, MNN_FORWARD_CPU, 1)); - gExecutor = &executorStatic; + gExecutor = new std::shared_ptr; + gExecutor->reset(new Executor(bn, MNN_FORWARD_CPU, 1)); }); return *gExecutor; } diff --git a/express/module/Module.cpp b/express/module/Module.cpp index 4ba49c27a..a0976bd67 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -330,11 +330,17 @@ Module* Module::load(const std::vector& inputs, const std::vectorgetInside()->mExternalFile.empty()) { // Set Default externalFile rtMgr->setExternalFile(std::string(fileName) + ".weight"); + needReset = true; } - return loadInternal(inputs, outputs, buffer.get(), buffer.size(), rtMgr, config); + auto res = loadInternal(inputs, outputs, buffer.get(), buffer.size(), rtMgr, config); + if (needReset) { + rtMgr->setExternalFile(""); + } + return res; } Module* Module::load(const std::vector& inputs, const std::vector& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr _rtMgr, const Module::Config* config) { diff --git a/express/module/StaticModule.cpp 
b/express/module/StaticModule.cpp index 986185534..31a07c632 100644 --- a/express/module/StaticModule.cpp +++ b/express/module/StaticModule.cpp @@ -33,7 +33,7 @@ static const StaticModule* getStaticModule(const Module* m) { } static std::vector> preRearrangeWeights( // NOLINT - Schedule::ScheduleInfo& scheduleInfo, Backend* backend, Backend* backupBackend, const Module* base = nullptr) { + Schedule::ScheduleInfo& scheduleInfo, Backend* firstbackend, Backend* backupBackend, const Module* base = nullptr) { std::map> base_executions; if (base != nullptr) { // has base module @@ -59,6 +59,10 @@ static std::vector> preRearrangeWeights( // NOLIN auto op = pipelineInfo[i].op; std::unique_ptr op_table(op->UnPack()); std::shared_ptr exe; + Backend* backend = firstbackend; + if (info.type == Schedule::CONSTANT) { + backend = backupBackend; + } switch (op->type()) { case MNN::OpType_DepthwiseConvInt8: case MNN::OpType_ConvInt8: @@ -304,20 +308,8 @@ StaticModule::StaticModule(std::vector inputs, std::map, DataType>> exeCache; MNN_ASSERT(1 == scheduleInfo.pipelineInfo.size()); auto& bnCache = scheduleInfo.pipelineInfo[0].first; - bnCache.cache.first.reset(rt.first[bnCache.info.type]->onCreate(bnCache.info.user)); - if (bnCache.cache.first->type() == MNN_FORWARD_CPU) { - bnCache.cache.second = bnCache.cache.first; - } else { - // Use Multi-thread if user has set numberthread > 1 - BackendConfig defaultConfig; - defaultConfig.flags = 4; - auto cpurt = rt.first.find(MNN_FORWARD_CPU); - if (cpurt != rt.first.end()) { - bnCache.cache.second.reset(cpurt->second->onCreate(&defaultConfig)); - } else { - bnCache.cache.second.reset(rt.second->onCreate(&defaultConfig)); - } - } + // Create Backend for prearrange + Session::createPipelineBackend(scheduleInfo.pipelineInfo[0], rt); if (config.rearrange) { mResource->mBuffer = preRearrangeWeights(scheduleInfo, bnCache.cache.first.get(), bnCache.cache.second.get(), config.base); } else { diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index bac8fb341..edeceb296 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -224,11 +224,12 @@ class MNN_PUBLIC Interpreter { // Default is 50 CPU_LITTLECORE_DECREASE_RATE = 6, - // 0: Do not quantize kvcache, just store float - // 1: Only quantize key cache, use int8 asymmetric quantization - // 2: Only quantize value cache, use fp8 quantization - // 3: quantize both key and value cache as described above - KVCACHE_QUANT_OPTIONS = 7, + // 0: Do not quantize + // 1: Only quantize key, use int8 asymmetric quantization + // 2: Only quantize value, use fp8 quantization + // 3: quantize both key and value + // 4: quantize query, key and value, and use gemm int8 kernel to compute K*V + QKV_QUANT_OPTIONS = 7, // size limit of kvcache in memory (for a single layer) // if the size of kvcache exceeds the limit, it will be moved to disk @@ -238,6 +239,12 @@ class MNN_PUBLIC Interpreter { enum ExternalPathType { // Path of the kvcache directory EXTERNAL_PATH_KVCACHE_DIR = 0, + + // Mid Buffer Cache File + EXTERNAL_FEATUREMAP_DIR = 1, + + // Weight Buffer Cache File + EXTERNAL_WEIGHT_DIR = 2, // Other types ... 
}; diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h index 215939a99..8a0af32de 100644 --- a/include/MNN/MNNDefine.h +++ b/include/MNN/MNNDefine.h @@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \ #define STR(x) STR_IMP(x) #define MNN_VERSION_MAJOR 2 #define MNN_VERSION_MINOR 9 -#define MNN_VERSION_PATCH 4 +#define MNN_VERSION_PATCH 5 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH) #endif /* MNNDefine_h */ diff --git a/package_scripts/ios/buildiOS.sh b/package_scripts/ios/buildiOS.sh index 3722f1f61..0f0942d31 100755 --- a/package_scripts/ios/buildiOS.sh +++ b/package_scripts/ios/buildiOS.sh @@ -12,31 +12,12 @@ cd Static rm -rf ios_64 mkdir ios_64 cd ios_64 -cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="arm64" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_ARM82=true -DMNN_BUILD_SHARED_LIBS=false $1 +cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="arm64" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_ARM82=true -DMNN_BUILD_SHARED_LIBS=false -DMNN_USE_THREAD_POOL=OFF $1 echo "Building AArch64" make MNN -j16 echo "End Building AArch64" cd ../ -rm -rf ios_32 -mkdir ios_32 -cd ios_32 -cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="armv7;armv7s" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_BUILD_SHARED_LIBS=false $1 -echo "Building AArch32" -make MNN -j16 -echo "End Building AArch32" -cd ../ - -find ios_32 -name "MNN*framework" -find ios_64 -name "MNN*framework" - -mv ios_32/MNN.framework/MNN ios_32/MNN.framework/MNN_32 +mv ios_64/MNN.framework MNN.framework -echo "Creating Fat Binary" -lipo -create ios_32/MNN.framework/MNN_32 ios_64/MNN.framework/MNN -output ios_32/MNN.framework/MNN -rm ios_32/MNN.framework/MNN_32 -echo "Patching Framework Headers" -rm -rf ./MNN.framework -cp -R ios_32/MNN.framework ./MNN.framework -rm -rf ios_32 rm -rf ios_64 diff --git a/package_scripts/ios/buildiOS_with_armv7.sh b/package_scripts/ios/buildiOS_with_armv7.sh new file mode 100755 index 000000000..ea5851791 --- /dev/null +++ b/package_scripts/ios/buildiOS_with_armv7.sh @@ -0,0 +1,42 @@ +#!/bin/sh +echo "Change directory to MNN_SOURCE_ROOT/project/ios before running this script" +echo "Current PWD: ${PWD}" + +rm -rf MNN-iOS-CPU-GPU +mkdir MNN-iOS-CPU-GPU +cd MNN-iOS-CPU-GPU +# Static Begin +mkdir Static +cd Static + +rm -rf ios_64 +mkdir ios_64 +cd ios_64 +cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="arm64" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_ARM82=true -DMNN_BUILD_SHARED_LIBS=false -DMNN_USE_THREAD_POOL=OFF $1 +echo "Building AArch64" +make MNN -j16 +echo "End Building AArch64" +cd ../ + +rm -rf ios_32 +mkdir ios_32 +cd ios_32 +cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../../../cmake/ios.toolchain.cmake -DMNN_METAL=ON -DARCHS="armv7;armv7s" -DENABLE_BITCODE=0 -DMNN_AAPL_FMWK=1 -DMNN_SEP_BUILD=0 -DMNN_BUILD_SHARED_LIBS=false -DMNN_USE_THREAD_POOL=OFF $1 +echo "Building AArch32" +make MNN -j16 +echo "End Building AArch32" +cd ../ + +find ios_32 -name "MNN*framework" +find ios_64 -name "MNN*framework" + +mv ios_32/MNN.framework/MNN ios_32/MNN.framework/MNN_32 + +echo "Creating Fat Binary" +lipo -create 
ios_32/MNN.framework/MNN_32 ios_64/MNN.framework/MNN -output ios_32/MNN.framework/MNN +rm ios_32/MNN.framework/MNN_32 +echo "Patching Framework Headers" +rm -rf ./MNN.framework +cp -R ios_32/MNN.framework ./MNN.framework +rm -rf ios_32 +rm -rf ios_64 diff --git a/package_scripts/mac/buildFrameWork.sh b/package_scripts/mac/buildFrameWork.sh index e0810e07e..7f955ed89 100755 --- a/package_scripts/mac/buildFrameWork.sh +++ b/package_scripts/mac/buildFrameWork.sh @@ -18,7 +18,7 @@ cd Static # ARM mkdir mac_a64 cd mac_a64 -cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 -DMNN_BUILD_SHARED_LIBS=OFF $1 +cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 -DMNN_BUILD_SHARED_LIBS=OFF $1 echo "Building ARM64" make MNN -j16 echo "End Building ARM64" @@ -27,7 +27,7 @@ cd ../ # X86 mkdir mac_x64 cd mac_x64 -cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 -DMNN_BUILD_SHARED_LIBS=OFF $1 +cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 -DMNN_BUILD_SHARED_LIBS=OFF $1 echo "Building x86" make MNN -j16 echo "End Building x86" @@ -52,7 +52,7 @@ cd Dynamic # ARM mkdir mac_a64 cd mac_a64 -cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 $1 +cmake ../../../ -DMNN_USE_SSE=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="arm64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DMNN_ARM82=ON -DCMAKE_OSX_ARCHITECTURES=arm64 $1 echo "Building ARM64" make MNN -j16 echo "End Building ARM64" @@ -61,7 +61,7 @@ cd ../ # X86 mkdir mac_x64 cd mac_x64 -cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 $1 +cmake ../../../ -DCMAKE_BUILD_TYPE=Release -DMNN_OPENCL=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON -DARCHS="x86_64" -DMNN_AAPL_FMWK=ON -DMNN_SEP_BUILD=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64 $1 echo "Building x86" make MNN -j16 echo "End Building x86" diff --git a/project/android/build_32.sh b/project/android/build_32.sh index e83655009..24f0eb8cc 100755 --- a/project/android/build_32.sh +++ b/project/android/build_32.sh @@ -4,7 +4,6 @@ cmake ../../../ \ -DCMAKE_BUILD_TYPE=Release \ -DANDROID_ABI="armeabi-v7a" \ -DANDROID_STL=c++_static \ --DCMAKE_BUILD_TYPE=Release \ -DANDROID_NATIVE_API_LEVEL=android-14 \ -DANDROID_TOOLCHAIN=clang \ -DMNN_USE_LOGCAT=false \ diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index f576703bf..535f50d27 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -771,6 +771,25 @@ C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C4F906B127688C3A0026B847 /* NMSModule.hpp */; }; C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4F906B227688C3A0026B847 /* NMSModule.cpp */; }; 
C4FB6CB22769DF0800963B07 /* GeometryCumSum.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */; }; + CE072A132C91AEE700F190FD /* MNNBGRToBGR555.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A032C91AEE700F190FD /* MNNBGRToBGR555.S */; }; + CE072A142C91AEE700F190FD /* MNNBGRAToGRAY.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A042C91AEE700F190FD /* MNNBGRAToGRAY.S */; }; + CE072A152C91AEE700F190FD /* MNNRGBAToGRAYFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A052C91AEE700F190FD /* MNNRGBAToGRAYFast.S */; }; + CE072A162C91AEE700F190FD /* MNNBGRAToBGR.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A062C91AEE700F190FD /* MNNBGRAToBGR.S */; }; + CE072A172C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A072C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S */; }; + CE072A182C91AEE700F190FD /* MNNGRAYToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A082C91AEE700F190FD /* MNNGRAYToC4Fast.S */; }; + CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A092C91AEE700F190FD /* MNNBGRToGRAY.S */; }; + CE072A1A2C91AEE700F190FD /* MNNRGBToGRAYFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0A2C91AEE700F190FD /* MNNRGBToGRAYFast.S */; }; + CE072A1B2C91AEE700F190FD /* MNNBGRToBGR565.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0B2C91AEE700F190FD /* MNNBGRToBGR565.S */; }; + CE072A1C2C91AEE700F190FD /* MNNRGBAToBGRFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0C2C91AEE700F190FD /* MNNRGBAToBGRFast.S */; }; + CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S */; }; + CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0E2C91AEE700F190FD /* MNNRGBToBGR555.S */; }; + CE072A1F2C91AEE700F190FD /* MNNRGBToBGR.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A0F2C91AEE700F190FD /* MNNRGBToBGR.S */; }; + CE072A202C91AEE700F190FD /* MNNGRAYToC3Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A102C91AEE700F190FD /* MNNGRAYToC3Fast.S */; }; + CE072A212C91AEE700F190FD /* MNNRGBToBGR565.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A112C91AEE700F190FD /* MNNRGBToBGR565.S */; }; + CE072A222C91AEE700F190FD /* MNNPackC2.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A122C91AEE700F190FD /* MNNPackC2.S */; }; + CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; }; + CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; }; + CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; }; CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; }; CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; }; CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; }; @@ -805,6 +824,8 @@ CEE9B95B2A3AA4D4006438F2 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9572A3AA4D4006438F2 /* 
MNNBilinearLineC8.S */; }; CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */; }; CEE9B95D2A3AA4D4006438F2 /* MNNCubicSampleC16.S in Sources */ = {isa = PBXBuildFile; fileRef = CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */; }; + CEEDB5542C7475A100FED0DC /* MNNFileUtils.h in Headers */ = {isa = PBXBuildFile; fileRef = CEEDB5522C7475A100FED0DC /* MNNFileUtils.h */; }; + CEEDB5552C7475A100FED0DC /* MNNFileUtils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEEDB5532C7475A100FED0DC /* MNNFileUtils.cpp */; }; EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; }; EB45C776244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */ = {isa = PBXBuildFile; fileRef = EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */; }; EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */; }; @@ -1607,6 +1628,25 @@ C4F906B127688C3A0026B847 /* NMSModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = NMSModule.hpp; sourceTree = ""; }; C4F906B227688C3A0026B847 /* NMSModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NMSModule.cpp; sourceTree = ""; }; C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryCumSum.cpp; sourceTree = ""; }; + CE072A032C91AEE700F190FD /* MNNBGRToBGR555.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRToBGR555.S; path = arm/arm64/MNNBGRToBGR555.S; sourceTree = ""; }; + CE072A042C91AEE700F190FD /* MNNBGRAToGRAY.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRAToGRAY.S; path = arm/arm64/MNNBGRAToGRAY.S; sourceTree = ""; }; + CE072A052C91AEE700F190FD /* MNNRGBAToGRAYFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBAToGRAYFast.S; path = arm/arm64/MNNRGBAToGRAYFast.S; sourceTree = ""; }; + CE072A062C91AEE700F190FD /* MNNBGRAToBGR.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRAToBGR.S; path = arm/arm64/MNNBGRAToBGR.S; sourceTree = ""; }; + CE072A072C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNSamplerC3BilinearOpt.S; path = arm/arm64/MNNSamplerC3BilinearOpt.S; sourceTree = ""; }; + CE072A082C91AEE700F190FD /* MNNGRAYToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGRAYToC4Fast.S; path = arm/arm64/MNNGRAYToC4Fast.S; sourceTree = ""; }; + CE072A092C91AEE700F190FD /* MNNBGRToGRAY.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNBGRToGRAY.S; path = arm/arm64/MNNBGRToGRAY.S; sourceTree = ""; }; + CE072A0A2C91AEE700F190FD /* MNNRGBToGRAYFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToGRAYFast.S; path = arm/arm64/MNNRGBToGRAYFast.S; sourceTree = ""; }; + CE072A0B2C91AEE700F190FD /* MNNBGRToBGR565.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name 
= MNNBGRToBGR565.S; path = arm/arm64/MNNBGRToBGR565.S; sourceTree = ""; }; + CE072A0C2C91AEE700F190FD /* MNNRGBAToBGRFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBAToBGRFast.S; path = arm/arm64/MNNRGBAToBGRFast.S; sourceTree = ""; }; + CE072A0D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBAToBGRAFast.S; path = arm/arm64/MNNRGBAToBGRAFast.S; sourceTree = ""; }; + CE072A0E2C91AEE700F190FD /* MNNRGBToBGR555.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToBGR555.S; path = arm/arm64/MNNRGBToBGR555.S; sourceTree = ""; }; + CE072A0F2C91AEE700F190FD /* MNNRGBToBGR.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToBGR.S; path = arm/arm64/MNNRGBToBGR.S; sourceTree = ""; }; + CE072A102C91AEE700F190FD /* MNNGRAYToC3Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNGRAYToC3Fast.S; path = arm/arm64/MNNGRAYToC3Fast.S; sourceTree = ""; }; + CE072A112C91AEE700F190FD /* MNNRGBToBGR565.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNRGBToBGR565.S; path = arm/arm64/MNNRGBToBGR565.S; sourceTree = ""; }; + CE072A122C91AEE700F190FD /* MNNPackC2.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackC2.S; path = arm/arm64/MNNPackC2.S; sourceTree = ""; }; + CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = ""; }; + CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = ""; }; + CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = ""; }; CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = ""; }; CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = ""; }; CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = ""; }; @@ -1643,6 +1683,8 @@ CEE9B9572A3AA4D4006438F2 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = ""; }; CEE9B9582A3AA4D4006438F2 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = ""; }; CEE9B9592A3AA4D4006438F2 /* MNNCubicSampleC16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNCubicSampleC16.S; sourceTree = ""; }; + CEEDB5522C7475A100FED0DC /* MNNFileUtils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNFileUtils.h; sourceTree = ""; }; + CEEDB5532C7475A100FED0DC /* MNNFileUtils.cpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MNNFileUtils.cpp; sourceTree = ""; }; EB45C773244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = ""; }; EB45C775244D7C6600E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S; sourceTree = ""; }; EB8D2ABD246A4975009948D1 /* Arm82OpRegister.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82OpRegister.cpp; path = ../arm82/Arm82OpRegister.cpp; sourceTree = ""; }; @@ -1878,6 +1920,8 @@ 488873AC215B639D0079B12E /* core */ = { isa = PBXGroup; children = ( + CEEDB5532C7475A100FED0DC /* MNNFileUtils.cpp */, + CEEDB5522C7475A100FED0DC /* MNNFileUtils.h */, 48C84B9B250F722B00EE7666 /* Command.hpp */, 4819FB1524C138DF0050BD09 /* GeometryConvUtils.cpp */, 4819FB1324C138DF0050BD09 /* GeometryConvUtils.hpp */, @@ -1921,6 +1965,25 @@ 48887410215B639D0079B12E /* cpu */ = { isa = PBXGroup; children = ( + CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */, + CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */, + CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */, + CE072A062C91AEE700F190FD /* MNNBGRAToBGR.S */, + CE072A042C91AEE700F190FD /* MNNBGRAToGRAY.S */, + CE072A032C91AEE700F190FD /* MNNBGRToBGR555.S */, + CE072A0B2C91AEE700F190FD /* MNNBGRToBGR565.S */, + CE072A092C91AEE700F190FD /* MNNBGRToGRAY.S */, + CE072A102C91AEE700F190FD /* MNNGRAYToC3Fast.S */, + CE072A082C91AEE700F190FD /* MNNGRAYToC4Fast.S */, + CE072A122C91AEE700F190FD /* MNNPackC2.S */, + CE072A0D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S */, + CE072A0C2C91AEE700F190FD /* MNNRGBAToBGRFast.S */, + CE072A052C91AEE700F190FD /* MNNRGBAToGRAYFast.S */, + CE072A0F2C91AEE700F190FD /* MNNRGBToBGR.S */, + CE072A0E2C91AEE700F190FD /* MNNRGBToBGR555.S */, + CE072A112C91AEE700F190FD /* MNNRGBToBGR565.S */, + CE072A0A2C91AEE700F190FD /* MNNRGBToGRAYFast.S */, + CE072A072C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S */, CEE4566A2BC0E23D00F062C1 /* CPUExternalConst.cpp */, 95278CE62B9F0999009E9B29 /* CPUDynamicQuant.cpp */, 95278CE52B9F0999009E9B29 /* CPUDynamicQuant.hpp */, @@ -2969,6 +3032,7 @@ 489D7A982550FDC900AD896A /* MNNMetalContext.h in Headers */, 952298B82B4D4CC80043978B /* coreMLLayerNorm.hpp in Headers */, 92FF029323AA0B5A00AC97F6 /* CPURange.hpp in Headers */, + CEEDB5542C7475A100FED0DC /* MNNFileUtils.h in Headers */, 4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */, 4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */, 48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */, @@ -3260,6 +3324,8 @@ 950B29002A0C9B4D0002F454 /* MNNScaleAndAddBiasInt8.S in Sources */, 92FF04BD23AA0BFB00AC97F6 /* Execution.cpp in Sources */, 92FF030A23AA0B5A00AC97F6 /* MNNLineDepthWiseInt8AddBiasScaleUnit.S in Sources */, + CE072A212C91AEE700F190FD /* MNNRGBToBGR565.S in Sources */, + CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */, 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */, 48FA474623AA127B00172C3B /* NeuralNetWorkOp.cpp in Sources */, 4D9A936E26255BDA00F9B43C /* CoreMLArgMax.cpp in Sources */, @@ -3270,6 +3336,7 @@ 48747D63245D9E33000B9709 /* GeometryPermute.cpp in Sources */, 92FF032C23AA0B5A00AC97F6 /* MNNWinogradMatrixProductRight.S in Sources */, 48BB6EF625220AA80056E195 /* MNNTranspose32Bit4x4.S in Sources */, + 
CE072A1C2C91AEE700F190FD /* MNNRGBAToBGRFast.S in Sources */, CEE9B95C2A3AA4D4006438F2 /* MNNBilinearSampleC8.S in Sources */, 48BB6EF025220A930056E195 /* MNNTranspose32Bit4x4.S in Sources */, 92FF031223AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */, @@ -3296,6 +3363,7 @@ 4D9A935F26255BDA00F9B43C /* NeuralNetwork.pb-c.c in Sources */, 4D0C80E32862FC4100C7CAD6 /* CoreMLOPRegister.cpp in Sources */, 92FF02BE23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */, + CE072A1A2C91AEE700F190FD /* MNNRGBToGRAYFast.S in Sources */, 4A224A0B27D0C2D9000A9260 /* ConvolutionPackFreeWinograd.cpp in Sources */, 48608B52250632EC00CB1D71 /* GeometryComputerUtils.cpp in Sources */, 489D7A8A2550FDC900AD896A /* MetalConvolutionDepthwise.mm in Sources */, @@ -3330,6 +3398,7 @@ 92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */, 92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */, 92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, + CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */, EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */, 481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */, 92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, @@ -3353,6 +3422,7 @@ 48747D6F245D9E33000B9709 /* GeometryConcat.cpp in Sources */, 4819FB3224C1396A0050BD09 /* GeometryReduce.cpp in Sources */, 950B28EF29F627F70002F454 /* MNNBinaryMaxInt8.S in Sources */, + CE072A132C91AEE700F190FD /* MNNBGRToBGR555.S in Sources */, 92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */, 92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */, 92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */, @@ -3395,6 +3465,7 @@ 48958783268EBA7C00EA01A7 /* ShapeSegmentMean.cpp in Sources */, 48747D61245D9E33000B9709 /* ConvertUtils.cpp in Sources */, 92FF043B23AA0B7100AC97F6 /* ShapeDetectionPostProcess.cpp in Sources */, + CE072A1B2C91AEE700F190FD /* MNNBGRToBGR565.S in Sources */, 48417FF124D13BF50056D9A7 /* GeometryELU.cpp in Sources */, 48C84B9A250F720C00EE7666 /* CPULayerNorm.cpp in Sources */, 4DF87C4A2887D3560003E2D4 /* calib3d.cpp in Sources */, @@ -3449,6 +3520,7 @@ 92FF034223AA0B5A00AC97F6 /* CPUReduction.cpp in Sources */, 92FF02CF23AA0B5A00AC97F6 /* MNNMinFloat.S in Sources */, C4F906B0276886040026B847 /* GeometryTopK.cpp in Sources */, + CEEDB5552C7475A100FED0DC /* MNNFileUtils.cpp in Sources */, 48CA2F572681844C003A1796 /* MNNUnpackC8FP16.S in Sources */, 92FF030E23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */, 4837147225A599EC004DBDED /* Arm82Binary.cpp in Sources */, @@ -3473,6 +3545,7 @@ 4D9A936726255BDA00F9B43C /* CoreMLReduction.cpp in Sources */, 48F5881324DEA3F000C484A2 /* GeometryConv3D.cpp in Sources */, 4882C8BA241A22B800DAC168 /* OpCommonUtils.cpp in Sources */, + CE072A202C91AEE700F190FD /* MNNGRAYToC3Fast.S in Sources */, 92FF02B523AA0B5A00AC97F6 /* CPUTopKV2.cpp in Sources */, 92FF02BD23AA0B5A00AC97F6 /* MNNMatrixProd.S in Sources */, 489D7A872550FDC900AD896A /* MetalOPRegister.mm in Sources */, @@ -3536,17 +3609,21 @@ 4D759B2C25FF89EE0037B0B6 /* GeometryShape.cpp in Sources */, 11A01A07258785EA00745FA7 /* MNNVectorTop1Float.S in Sources */, 48747D6E245D9E33000B9709 /* GeometrySlice.cpp in Sources */, + CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */, CECF8C7D299CAD9400D3875B /* md5.c in Sources */, 92FF041923AA0B7100AC97F6 /* ShapeQuantizedMaxPool.cpp in Sources */, 92FF038A23AA0B5A00AC97F6 /* CPURange.cpp in Sources */, + CE072A182C91AEE700F190FD /* MNNGRAYToC4Fast.S in Sources */, CE125CC92A52BF6B003698C9 /* 
MNNBilinearLineC8.S in Sources */, 92FF03A123AA0B5A00AC97F6 /* Int8FunctionsOpt.cpp in Sources */, + CE072A222C91AEE700F190FD /* MNNPackC2.S in Sources */, 92FF026523AA0B5A00AC97F6 /* CPUQuantizedAvgPool.cpp in Sources */, 92FF029423AA0B5A00AC97F6 /* CPUMatMul.cpp in Sources */, 48747D62245D9E33000B9709 /* GeometryOPRegister.cpp in Sources */, 4838EA8B2611C1310027232C /* ShapeGridSample.cpp in Sources */, 92FF03A323AA0B5A00AC97F6 /* ConvOpt.cpp in Sources */, 92FF02CD23AA0B5A00AC97F6 /* MNNNV21ToRGBUnit.S in Sources */, + CE072A172C91AEE700F190FD /* MNNSamplerC3BilinearOpt.S in Sources */, 92FF029A23AA0B5A00AC97F6 /* CPUQuantizedMaxPool.cpp in Sources */, 48F5881124DEA3F000C484A2 /* GeometryPooling3D.cpp in Sources */, 92FF042423AA0B7100AC97F6 /* ShapeROIPooling.cpp in Sources */, @@ -3569,11 +3646,13 @@ 92FF02B123AA0B5A00AC97F6 /* CPUBackend.cpp in Sources */, 4D9A936226255BDA00F9B43C /* FeatureTypes.pb-c.c in Sources */, 486E1A9924F5078D00C16006 /* CPURandomUniform.cpp in Sources */, + CE072A1F2C91AEE700F190FD /* MNNRGBToBGR.S in Sources */, 92FF02C823AA0B5A00AC97F6 /* MNNNV21ToBGRUnit.S in Sources */, 92FF045C23AA0B7100AC97F6 /* ShapeBroadcastTo.cpp in Sources */, 48747D49245D9D24000B9709 /* RuntimeFactory.cpp in Sources */, 92FF02AE23AA0B5A00AC97F6 /* CPUProposal.cpp in Sources */, 92FF042723AA0B7100AC97F6 /* ShapeMatMul.cpp in Sources */, + CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */, 92FF042823AA0B7100AC97F6 /* ShapeInterp.cpp in Sources */, 92FF02D623AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */, 48FB9DCA24A848D0008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */, @@ -3610,6 +3689,7 @@ CECF8C64299CAD8400D3875B /* LogHelper.mm in Sources */, 48FA474523AA127B00172C3B /* Executor.cpp in Sources */, 92FF02EA23AA0B5A00AC97F6 /* MNNGemmInt8AddBiasScale_16x4_Unit.S in Sources */, + CE072A162C91AEE700F190FD /* MNNBGRAToBGR.S in Sources */, 48A8A61A21D101DE00C2B9A7 /* Matrix_CV.cpp in Sources */, 4DDD8E102B1D70C1005065D1 /* MNNTranspose16Bit8x8.S in Sources */, 489D7A8C2550FDC900AD896A /* MetalDeconvolution.mm in Sources */, @@ -3659,6 +3739,7 @@ 48F9E54C2493511200E46522 /* MNNPackedMatMul.S in Sources */, C4D4824327BA67DE0021C2B9 /* GeometryDet.cpp in Sources */, 92FF026F23AA0B5A00AC97F6 /* CPUInt8ToFloat.cpp in Sources */, + CE072A142C91AEE700F190FD /* MNNBGRAToGRAY.S in Sources */, 92FF037E23AA0B5A00AC97F6 /* CPUDetectionPostProcess.cpp in Sources */, 4D4CF4682760946500A36D9F /* geometric.cpp in Sources */, 92FF045023AA0B7100AC97F6 /* ShapeCropAndResize.cpp in Sources */, @@ -3671,6 +3752,7 @@ 92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */, CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */, CECF8C78299CAD9400D3875B /* log_util_imp.cpp in Sources */, + CE072A152C91AEE700F190FD /* MNNRGBAToGRAYFast.S in Sources */, 92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */, 952298B22B4D39050043978B /* MetalLoop.mm in Sources */, 48925F372744AC2A00919B37 /* ShapeROIAlign.cpp in Sources */, @@ -3691,6 +3773,7 @@ 92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */, 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */, EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */, + CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */, 48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */, 48608B51250632EC00CB1D71 /* GeometryComputer.cpp in Sources */, 92FF02FF23AA0B5A00AC97F6 /* MNNFloat2Int8.S in Sources */, @@ -3720,6 +3803,7 @@ 92FF03AD23AA0B5A00AC97F6 /* 
ConvolutionDepthwise3x3.cpp in Sources */, 92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */, 4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */, + CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */, 489D7A762550FDC800AD896A /* MetalReduction.mm in Sources */, 92FF032023AA0B5A00AC97F6 /* MNNMatrixSub.S in Sources */, C43C81FF251894BD00A0FF84 /* ThreadPool.cpp in Sources */, @@ -4101,7 +4185,7 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = Q48UX93J22; + DEVELOPMENT_TEAM = 6G7464HHUS; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -4188,7 +4272,7 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage; CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = Q48UX93J22; + DEVELOPMENT_TEAM = 6G7464HHUS; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; HEADER_SEARCH_PATHS = ( diff --git a/pymnn/test/model_test.py b/pymnn/test/model_test.py index 1da4e85f0..011cedb71 100644 --- a/pymnn/test/model_test.py +++ b/pymnn/test/model_test.py @@ -80,18 +80,21 @@ def MNNDataType2NumpyDataType(data_type): else: return np.float32 -def createTensor(tensor, file=''): +def createTensor(tensor, file='', empty=False): shape = tensor.getShape() data_type = tensor.getDataType() dtype = MNNDataType2NumpyDataType(data_type) if file == '': - data = np.ones(shape, dtype=dtype) + if empty: + data = np.zeros(shape, dtype=dtype) + else: + data = np.ones(shape, dtype=dtype) else: data = loadtxt(file, shape, dtype) - return MNN.Tensor(shape, tensor.getDataType(), data, tensor.getDimensionType()) + return MNN.Tensor(shape, tensor.getDataType(), data.copy(), tensor.getDimensionType()) def compareTensor(tensor, file, tolerance=5e-2): - outputNumpyData = tensor.getNumpyData() + outputNumpyData = tensor.getNumpyData().copy() expectNumpyData = loadtxt(file, tensor.getShape()) max_abs_dif = np.abs(outputNumpyData - expectNumpyData).max() max_exp_val = np.abs(expectNumpyData).max() @@ -117,6 +120,11 @@ def modelTest(modelPath, givenName, expectName): net = MNN.Interpreter(modelPath) session = net.createSession() allInput = net.getSessionInputAll(session) + # zero for all inputs + for name in allInput: + inputTensor = allInput[name] + inputHost = createTensor(inputTensor, givenName, True) + inputTensor.copyFrom(inputHost) # input inputTensor = net.getSessionInput(session) inputHost = createTensor(inputTensor, givenName) diff --git a/source/backend/arm82/Arm82Backend.cpp b/source/backend/arm82/Arm82Backend.cpp index 7b13b852b..377243388 100644 --- a/source/backend/arm82/Arm82Backend.cpp +++ b/source/backend/arm82/Arm82Backend.cpp @@ -118,6 +118,7 @@ void Arm82Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor CPUBackend::onCopyBuffer(srcTensor, dstTensor); return; } + _resetDynamicMemory(); auto source = TensorUtils::getDescribe(srcTensor)->dimensionFormat; auto dest = TensorUtils::getDescribe(dstTensor)->dimensionFormat; auto srcType = MNN_FORWARD_CPU; diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp index 2e4e9dc6b..92749c426 100644 --- a/source/backend/arm82/Arm82Functions.cpp +++ b/source/backend/arm82/Arm82Functions.cpp @@ -35,12 +35,14 @@ void MNNPackedMatMulFP16(float* C, const float* A, const float* B, const size_t* // parameter: [aStride, l, h, cStride, bExtraStride] void MNNPackedMatMulRemainFP16(float* C, const float* A, const float* B, size_t eSize, const size_t* 
parameter, const float* postParameters, const float* bias, const float* k, const float* b); -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM void MNNPackedMatMulFP16_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); void MNNPackedMatMulRemainFP16_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); void MNNPackedMatMulFP16_int8(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); void MNNPackedMatMulRemainFP16_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); +#endif +#ifdef MNN_LOW_MEMORY void MNNAbsMaxFP16(const float* source, float* absmax, size_t src_depth_quad, size_t realSize, int pack); void MNNQuantScaleFP16(float* sum, float* absmax, float* quant_scale, float* dequant_scale, size_t thread, size_t batch); void MNNDynamicQuantFP16(const float* src, int8_t* dst, const float* scale, size_t src_depth_quad, size_t realSize, int pack); @@ -48,8 +50,6 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size #endif #if defined(__aarch64__) void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad); -void MNNSumByAxisLForMatmul_A_ARM86(float* dest, int8_t* source, const float* dequantScale, ssize_t realDstCount, SumByAxisParams sumParams); -void MNNSumByAxisLForMatmul_A_ARM82(float* dest, int8_t* source, const float* dequantScale, ssize_t realDstCount, SumByAxisParams sumParams); #endif void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow); @@ -735,29 +735,25 @@ bool Arm82Functions::init() { FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul, MNNPackedMatMulFP16); FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain, MNNPackedMatMulRemainFP16); #if defined(__aarch64__) -#ifdef MNN_LOW_MEMORY + gInstance->supportFp16arith = origin->supportFp16arith; + gInstance->supportSDot = origin->supportSDot; + gInstance->supportI8mm = origin->supportI8mm; +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM // Weight Dequant Gemm Kernels FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul_int4, MNNPackedMatMulFP16_int4); FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain_int4, MNNPackedMatMulRemainFP16_int4); FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMul_int8, MNNPackedMatMulFP16_int8); FUNC_PTR_ASSIGN(gInstance->MNNPackedMatMulRemain_int8, MNNPackedMatMulRemainFP16_int8); +#endif +#ifdef MNN_LOW_MEMORY // Dynamic Qaunt Helper Functions FUNC_PTR_ASSIGN(gInstance->MNNAbsMax, MNNAbsMaxFP16); FUNC_PTR_ASSIGN(gInstance->MNNQuantScale, MNNQuantScaleFP16); FUNC_PTR_ASSIGN(gInstance->MNNDynamicQuant, MNNDynamicQuantFP16); FUNC_PTR_ASSIGN(gInstance->MNNQuantSum, MNNQuantSumFP16); FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue); - // Dynamic Quant Gemm Kernels. 
- gInstance->supportFp16arith = origin->supportFp16arith; - gInstance->supportSDot = origin->supportSDot; - gInstance->supportI8mm = origin->supportI8mm; #endif - if (gInstance->supportSDot) { - FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, MNNSumByAxisLForMatmul_A_ARM82); - } - if (gInstance->supportI8mm) { - FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, MNNSumByAxisLForMatmul_A_ARM86); - } + FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A); #endif FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A); FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode); diff --git a/source/backend/arm82/CMakeLists.txt b/source/backend/arm82/CMakeLists.txt index cc9fc0ab7..afbe55dbb 100644 --- a/source/backend/arm82/CMakeLists.txt +++ b/source/backend/arm82/CMakeLists.txt @@ -10,10 +10,17 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") if (MNN_LOW_MEMORY) file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*) endif() + if (MNN_CPU_WEIGHT_DEQUANT_GEMM) + file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/normal_memory/*) + endif() add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM}) if (MNN_LOW_MEMORY) target_compile_options(MNN_Arm82 PRIVATE -DMNN_LOW_MEMORY) endif() + + if (MNN_CPU_WEIGHT_DEQUANT_GEMM) + target_compile_options(MNN_Arm82 PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM) + endif() target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -DENABLE_ARMV82) else() # Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuanInput_ARM82.S b/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuanInput_ARM82.S index 22919922f..0812f45c4 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuanInput_ARM82.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuanInput_ARM82.S @@ -90,21 +90,31 @@ Note: Only used in dynamic quant,so do not need compare min max! */ asm_function DynamicQuanInput_ARM82 -//void DynamicQuanInput_ARM82(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, size_t zeroPoint); -//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint +//void DynamicQuanInput_ARM82(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, float* zeroPoint, ssize_t quanParamVec); +//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint, x7:quanParamVec stp d14, d15, [sp, #-64]! 
stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] ld1 {v29.s}[0], [x3] // Load scale -// copy zero point -dup v30.4s, w6 -fcvtn v31.4h, v29.4s -scvtf v30.4s, v30.4s +ld1 {v30.s}[0], [x6] // Load zero + +and x8, x7, #1 // if load vector scale +and x9, x7, #2 // if load vector zero +cbz x8, LOAD_VECTOR_ZERO +ld1 {v29.4s}, [x3] // scale + +LOAD_VECTOR_ZERO: +cbz x9, START +ld1 {v30.4s}, [x6] // zero + +START: +// copy zero point +fcvtn v31.4h, v29.4s // fp16 scale +fcvtn v30.4h, v30.4s // fp16 zero dup v31.8h, v31.h[0] -fcvtn v30.4h, v30.4s dup v30.8h, v30.h[0] FL28: diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantAndReorder_ARM82.S b/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantAndReorder_ARM82.S index 44e3568f1..5a8381765 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantAndReorder_ARM82.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantAndReorder_ARM82.S @@ -1,5 +1,5 @@ // -// DynamicQuanInput_ARM82.S +// DynamicQuanInputAndReorder_ARM82.S // MNN // // Created by MNN on 2019/01/22. @@ -101,15 +101,12 @@ stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] ld1 {v29.s}[0], [x3] // Load scale -// copy zero point -dup v30.4s, w6 +ld1 {v30.s}[0], [x6] // Load zero point fcvtn v31.4h, v29.4s -scvtf v30.4s, v30.4s - +fcvtn v30.4h, v30.4s add x13, x8, x8 dup v31.8h, v31.h[0] -fcvtn v30.4h, v30.4s dup v30.8h, v30.h[0] mov x9, x1 // first N*4 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S index 143ec060a..ad9313244 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S @@ -115,19 +115,16 @@ ldr x27, [x6, #64] // blockNum mov x21, #16 // sizeof(float16_t) * PACK mul x27, x27, x3 Start: -lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT +lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT mov x22, #48 // src_steps -add x24, x15, x15 ldr x27, [x6, #80] // extra scale TILE_12: cmp x7, #12 blt TILE_8 L8LoopDz_TILE_12: - // ld1 {v0.4s, v1.4s}, [x9], #32 // bias mov x11, x1 mov x13, x3 - // Init 0 SET_BIAS v8, v9, v10, v11 SET_BIAS v12, v13, v14, v15 SET_BIAS v16, v17, v18, v19 @@ -137,13 +134,13 @@ L8LoopDz_TILE_12: mov x28, x2 L8LoopSz_TILE_12: - ld1 {v3.16b}, [x2], x15 // weight + ld1 {v3.16b, v4.16b}, [x2], #32 // weight ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - ld1 {v4.16b}, [x2], #16 + .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0] .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] @@ -156,7 +153,7 @@ L8LoopDz_TILE_12: .inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1] .inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2] .inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3] - sub x2, x2, x15 + .inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0] .inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1] .inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2] @@ -169,9 +166,7 @@ L8LoopDz_TILE_12: bne L8LoopSz_TILE_12 L8LoopSzEnd_TILE_12: - //add x2, x2, x15 - //add x24, x15, x15 - add x2, x28, x24 + add x2, x28, x15 sub x5, x5, #1 L8Tile12Quan: @@ -217,8 +212,6 @@ L8LoopDz_TILE_12: MLA_WEIGHTZERO v18, v4, v5, 2 
// tile:10, oc:0-3 MLA_WEIGHTZERO v19, v4, v5, 3 // tile:11, oc:0-3 - //ld1r {v0.4s}, [x23] // f32 min - //ld1r {v1.4s}, [x24] // f32 max MLA_WEIGHTZERO v20, v2, v6, 0 // tile:0, oc:4-7 MLA_WEIGHTZERO v21, v2, v6, 1 // tile:1, oc:4-7 MLA_WEIGHTZERO v22, v2, v6, 2 // tile:2, oc:4-7 @@ -297,8 +290,6 @@ L8LoopDz_TILE_12: blt End TILE_8: - //ld1r {v26.4s}, [x23] // f32 min - //ld1r {v27.4s}, [x24] // f32 max cmp x7, #8 blt TILE_4 mov x10, x0 @@ -319,18 +310,18 @@ L8LoopDz_TILE_8: SET_BIAS v20, v21, v22, v23 mov x28, x12 L8LoopSz_TILE_8: - ld1 {v3.16b}, [x12], x15 // weight + ld1 {v3.16b, v4.16b}, [x12], #32 // weight ld1 {v0.16b, v1.16b}, [x11], x22 // src .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - ld1 {v4.16b}, [x12], #16 + .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0] .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3] - sub x12, x12, x15 + .inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0] .inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1] .inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2] @@ -343,9 +334,7 @@ L8LoopDz_TILE_8: bne L8LoopSz_TILE_8 L8LoopSzEnd_TILE_8: - //add x12, x12, x15 - //add x24, x15, x15 - add x12, x28, x24 + add x12, x28, x15 sub x14, x14, #1 L8Tile8Quan: @@ -468,15 +457,13 @@ L8LoopDz_TILE_4: mov x28, x12 L8LoopSz_TILE_4: - ld1 {v3.16b}, [x12], x15 // weight + ld1 {v3.16b, v4.16b}, [x12], #32 // weight ld1 {v0.16b}, [x11], x22 // src - ld1 {v4.16b}, [x12], #16 // weight .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] subs x13, x13, #1 - sub x12, x12, x15 .inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0] .inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1] .inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2] @@ -484,9 +471,7 @@ L8LoopDz_TILE_4: bne L8LoopSz_TILE_4 L8LoopSzEnd_TILE_4: - //add x12, x12, x15 - //add x24, x15, x15 - add x12, x28, x24 + add x12, x28, x15 sub x14, x14, #1 L8Tile4Quan: @@ -571,23 +556,17 @@ L8LoopDz_TILE_1: movi v8.16b, #0 movi v9.16b, #0 - //mov v8.16b, v0.16b - //mov v9.16b, v1.16b mov x28, x12 L8LoopSz_TILE_1: - ld1 {v3.16b}, [x12], x15 // weight + ld1 {v3.16b, v4.16b}, [x12], #32 // weight ld1 {v0.s}[0], [x11], x22 // src - ld1 {v4.16b}, [x12], #16 // weight .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] subs x13, x13, #1 - sub x12, x12, x15 .inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0] bne L8LoopSz_TILE_1 L8LoopSzEnd_TILE_1: - //add x12, x12, x15 - //add x24, x15, x15 - add x12, x28, x24 + add x12, x28, x15 sub x14, x14, #1 L8Tile1Quan: @@ -630,11 +609,7 @@ L8LoopDz_TILE_1: sub x23, x23, #2 fmax v0.8h, v24.8h, v0.8h fmin v0.8h, v25.8h, v0.8h - // st1 {v8.4s}, [x10], x4 - // st1 {v9.4s}, [x10], x4 - //fcvtn v0.4h, v8.4s - //fcvtn2 v0.8h, v9.4s TILE1_STORE: st1 {v0.8h}, [x10], x4 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S index 5d92ae056..dd893b292 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S @@ -114,16 +114,14 @@ ldr x27, [x6, #64] // blockNum mov 
x21, #16 // sizeof(float16_t) * PACK mul x27, x27, x3 Start: -lsl x15, x27, #3 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) +lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) mov x22, #48 // src_steps -add x24, x15, x15 ldr x27, [x6, #80] // extra scale TILE_12: cmp x7, #12 blt TILE_8 L8LoopDz_TILE_12: - // ld1 {v0.4s, v1.4s}, [x9], #32 // bias mov x11, x1 mov x13, x3 movi v7.16b, #15 @@ -138,13 +136,11 @@ L8LoopDz_TILE_12: mov x28, x2 L8LoopSz_TILE_12: - ld1 {v3.d}[0], [x2], x15 // weight - ld1 {v4.d}[0], [x2], #8 + ld1 {v5.16b}, [x2], #16 // weight ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 + and v4.16b, v5.16b, v7.16b .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] @@ -155,10 +151,6 @@ L8LoopDz_TILE_12: .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3] - // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b .inst 0x4f82e070 // sdot v16.4s, v3.16b, v2.4b[0] .inst 0x4fa2e071 // sdot v17.4s, v3.16b, v2.4b[1] @@ -168,7 +160,7 @@ L8LoopDz_TILE_12: .inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1] .inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2] .inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3] - sub x2, x2, x15 + .inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0] .inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1] .inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2] @@ -181,7 +173,7 @@ L8LoopDz_TILE_12: bne L8LoopSz_TILE_12 L8LoopSzEnd_TILE_12: - add x2, x28, x24 + add x2, x28, x15 sub x5, x5, #1 L8Tile12Quan: @@ -227,8 +219,6 @@ L8LoopDz_TILE_12: MLA_WEIGHTZERO v18, v4, v5, 2 // tile:10, oc:0-3 MLA_WEIGHTZERO v19, v4, v5, 3 // tile:11, oc:0-3 - //ld1r {v0.4s}, [x23] // f32 min - //ld1r {v1.4s}, [x24] // f32 max MLA_WEIGHTZERO v20, v2, v6, 0 // tile:0, oc:4-7 MLA_WEIGHTZERO v21, v2, v6, 1 // tile:1, oc:4-7 MLA_WEIGHTZERO v22, v2, v6, 2 // tile:2, oc:4-7 @@ -304,7 +294,7 @@ L8LoopDz_TILE_12: L8Tile12LoopCheck: cmp x5, #1 bge L8LoopDz_TILE_12 - blt End + b End TILE_8: cmp x7, #8 @@ -327,27 +317,24 @@ L8LoopDz_TILE_8: SET_BIAS v20, v21, v22, v23 mov x28, x12 L8LoopSz_TILE_8: - ld1 {v3.d}[0], [x12], x15 // weight - ld1 {v4.d}[0], [x12], #8 + ld1 {v5.16b}, [x12], #16 // weight ld1 {v0.16b, v1.16b}, [x11], x22 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 + and v4.16b, v5.16b, v7.16b + //zip1 v3.16b, v5.16b, v6.16b + //zip2 v4.16b, v5.16b, v6.16b .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b + .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0] .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3] - sub x12, x12, x15 + .inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0] .inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1] .inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2] @@ -360,7 +347,7 @@ L8LoopDz_TILE_8: bne L8LoopSz_TILE_8 L8LoopSzEnd_TILE_8: - add x12, x28, x24 + add x12, x28, x15 sub x14, x14, #1 L8Tile8Quan: @@ 
-446,10 +433,6 @@ L8LoopDz_TILE_8: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x10], #64 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x4 - //st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64 - //st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4 - //st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10], #64 - //st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x10], x4 add x4, x4, #64 L8Tile8LoopCheck: @@ -483,24 +466,20 @@ L8LoopDz_TILE_4: mov x28, x12 L8LoopSz_TILE_4: - ld1 {v3.d}[0], [x12], x15 // weight + ld1 {v5.16b}, [x12], #16 // weight ld1 {v0.16b}, [x11], x22 // src - ld1 {v4.d}[0], [x12], #8 // weight // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 + and v4.16b, v5.16b, v7.16b + //zip1 v3.16b, v5.16b, v6.16b + //zip2 v4.16b, v5.16b, v6.16b .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b + subs x13, x13, #1 - sub x12, x12, x15 .inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0] .inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1] .inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2] @@ -508,7 +487,7 @@ L8LoopDz_TILE_4: bne L8LoopSz_TILE_4 L8LoopSzEnd_TILE_4: - add x12, x28, x24 + add x12, x28, x15 sub x14, x14, #1 L8Tile4Quan: @@ -593,29 +572,61 @@ L8LoopDz_TILE_1: movi v8.16b, #0 movi v9.16b, #0 + mov x28, x12 - L8LoopSz_TILE_1: - ld1 {v3.d}[0], [x12], x15 // weight + cmp x13, #4 + blt L8LoopSz_TILE_1_lu1 + + L8LoopSz_TILE_1_lu4: + ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x12], #64 // weight: hu=0,1,2,3,pack=0~7 ld1 {v0.s}[0], [x11], x22 // src - ld1 {v4.d}[0], [x12], #8 // weight - // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ld1 {v0.s}[1], [x11], x22 + ld1 {v0.s}[2], [x11], x22 + ld1 {v0.s}[3], [x11], x22 - .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] + sub x13, x13, #4 + // int4->int8 + ushr v12.16b, v3.16b, #4 + and v22.16b, v3.16b, v7.16b + + ushr v15.16b, v4.16b, #4 + and v23.16b, v4.16b, v7.16b + + ushr v18.16b, v5.16b, #4 + and v24.16b, v5.16b, v7.16b + + ushr v21.16b, v6.16b, #4 + and v25.16b, v6.16b, v7.16b + + cmp x13, #4 + //sub x12, x12, x15 + .inst 0x4f80e188 // sdot v8.4s, v12.16b, v0.4b[0] + .inst 0x4f80e2c9 // sdot v9.4s, v22.16b, v0.4b[0] + .inst 0x4fa0e1e8 // sdot v8.4s, v15.16b, v0.4b[1] + .inst 0x4fa0e2e9 // sdot v9.4s, v23.16b, v0.4b[1] + .inst 0x4f80ea48 // sdot v8.4s, v18.16b, v0.4b[2] + .inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2] + .inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3] + .inst 0x4fa0eb29 // sdot v9.4s, v25.16b, v0.4b[3] + bge L8LoopSz_TILE_1_lu4 + + cbz x13, L8LoopSzEnd_TILE_1 + + L8LoopSz_TILE_1_lu1: + ld1 {v4.16b}, [x12], #16 // weight + ld1 {v0.s}[0], [x11], x22 // src + //ld1 {v4.d}[0], [x12], #8 // weight subs x13, x13, #1 // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b - sub x12, x12, x15 + ushr v3.16b, v4.16b, #4 + and v12.16b, v4.16b, v7.16b - .inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0] - bne L8LoopSz_TILE_1 + .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] + .inst 0x4f80e189 // sdot v9.4s, v12.16b, v0.4b[0] + bne L8LoopSz_TILE_1_lu1 L8LoopSzEnd_TILE_1: - add x12, x28, x24 + add x12, x28, x15 sub x14, x14, #1 L8Tile1Quan: @@ -658,8 +669,6 @@ L8LoopDz_TILE_1: sub x23, x23, #2 fmax v0.8h, v24.8h, v0.8h fmin v0.8h, v25.8h, v0.8h - // st1 {v8.4s}, 
[x10], x4 - // st1 {v9.4s}, [x10], x4 TILE1_STORE: st1 {v0.8h}, [x10], x4 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S index 7022af3a1..f6f6625d7 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S @@ -150,6 +150,7 @@ LoopDz_TILE_10: mov x12, x2 // weight mov x13, x3 // src_depth_quad mov x10, x0 // tag dst address + movi v2.16b, #15 SET_0_5 v12, v16, v20, v24, v28 // oc:0,1,0,1 SET_0_5 v13, v17, v21, v25, v29 // oc:2,3,2,3 @@ -158,7 +159,6 @@ LoopDz_TILE_10: LoopSz_TILE_10: ld1 {v0.16b, v1.16b}, [x12], #32 // weight - movi v2.16b, #15 ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x11], #64 // src: E0-E9 ld1 {v7.16b}, [x11], #16 // int4->int8 @@ -763,50 +763,88 @@ TILE_1: mov x20, x9 // bias mov x6, x28 // weightQuanBias LoopDz_TILE_1: - //ld1 {v7.4s, v8.4s}, [x20], #32 // bias mov x11, x1 // src mov x12, x25 // weight mov x13, x3 // src_depth_quad mov x10, x26 - //dup v16.2d, v7.d[0] // oc:0,1,0,1 - //dup v17.2d, v7.d[1] // oc:2,3,2,3 - //dup v18.2d, v8.d[0] // oc:4,5,4,5 - //dup v19.2d, v8.d[1] // oc:6,7,6,7 movi v16.4s, #0 // oc:0,1,0,1 movi v17.4s, #0 // oc:2,3,2,3 movi v18.4s, #0 // oc:4,5,4,5 movi v19.4s, #0 // oc:6,7,6,7 - //movi v22.4s, #0 // oc:0,1,0,1 - //movi v23.4s, #0 // oc:2,3,2,3 - //movi v24.4s, #0 // oc:4,5,4,5 - //movi v25.4s, #0 // oc:6,7,6,7 + cmp x13, #4 + blt LoopSz1_TILE_1_lu1 +LoopSz1_TILE_1_lu4: + ld1 {v5.16b, v6.16b, v7.16b, v8.16b}, [x12], #64 // weight + ld1 {v9.16b, v10.16b, v11.16b, v12.16b}, [x12], #64 + ld1 {v0.8b}, [x11], x22 // src + ld1 {v1.8b}, [x11], x22 + ld1 {v2.8b}, [x11], x22 + ld1 {v3.8b}, [x11], x22 + + // int4->int8 + ushr v4.16b, v5.16b, #4 + ushr v14.16b, v6.16b, #4 + and v13.16b, v5.16b, v30.16b + and v15.16b, v6.16b, v30.16b + + ushr v20.16b, v7.16b, #4 + ushr v21.16b, v8.16b, #4 + and v22.16b, v7.16b, v30.16b + and v23.16b, v8.16b, v30.16b + + ushr v24.16b, v9.16b, #4 + ushr v25.16b, v10.16b, #4 + and v26.16b, v9.16b, v30.16b + and v27.16b, v10.16b, v30.16b + + ushr v5.16b, v11.16b, #4 + ushr v6.16b, v12.16b, #4 + and v7.16b, v11.16b, v30.16b + and v8.16b, v12.16b, v30.16b + + sub x13, x13, #4 + + .inst 0x4e84a410 // smmla v16.4s, v0.16b, v4.16b + .inst 0x4e8ea411 // smmla v17.4s, v0.16b, v14.16b + .inst 0x4e8da412 // smmla v18.4s, v0.16b, v13.16b + .inst 0x4e8fa413 // smmla v19.4s, v0.16b, v15.16b + + .inst 0x4e94a430 // smmla v16.4s, v1.16b, v20.16b + .inst 0x4e95a431 // smmla v17.4s, v1.16b, v21.16b + .inst 0x4e96a432 // smmla v18.4s, v1.16b, v22.16b + .inst 0x4e97a433 // smmla v19.4s, v1.16b, v23.16b + cmp x13, #4 + .inst 0x4e98a450 // smmla v16.4s, v2.16b, v24.16b + .inst 0x4e99a451 // smmla v17.4s, v2.16b, v25.16b + .inst 0x4e9aa452 // smmla v18.4s, v2.16b, v26.16b + .inst 0x4e9ba453 // smmla v19.4s, v2.16b, v27.16b + + .inst 0x4e85a470 // smmla v16.4s, v3.16b, v5.16b + .inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b + .inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b + .inst 0x4e88a473 // smmla v19.4s, v3.16b, v8.16b + + bge LoopSz1_TILE_1_lu4 + cbz x13, LoopSzEnd_TILE_1 -LoopSz1_TILE_1: - // src : 1 x [1 x 8] : v2 - // weight : 2 x [2 x 8] : v0-1 - // dst : 1 x 2 x [2] : v30-v31 +LoopSz1_TILE_1_lu1: ld1 {v13.16b, v14.16b}, [x12], #32 // weight - ld1 {v2.8b}, [x11], x22 // src + ld1 {v2.8b}, [x11], x22 // src // int4->int8 ushr v0.16b, v13.16b, #4 and 
v3.16b, v13.16b, v30.16b ushr v1.16b, v14.16b, #4 and v4.16b, v14.16b, v30.16b + subs x13, x13, #1 .inst 0x4e80a450 // smmla v16.4s, v2.16b, v0.16b .inst 0x4e81a451 // smmla v17.4s, v2.16b, v1.16b .inst 0x4e83a452 // smmla v18.4s, v2.16b, v3.16b .inst 0x4e84a453 // smmla v19.4s, v2.16b, v4.16b - subs x13, x13, #1 - bne LoopSz1_TILE_1 - - LoopSz_TILE_1_ADD: - //add v16.4s, v16.4s, v22.4s - //add v17.4s, v17.4s, v23.4s - //add v18.4s, v18.4s, v24.4s - //add v19.4s, v19.4s, v25.4s + + bne LoopSz1_TILE_1_lu1 LoopSzEnd_TILE_1: add x25, x25, x15 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S b/source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int4.S similarity index 100% rename from source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int4.S rename to source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int4.S diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S b/source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int8.S similarity index 100% rename from source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulFP16_int8.S rename to source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulFP16_int8.S diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S b/source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int4.S similarity index 100% rename from source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int4.S rename to source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int4.S diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S b/source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int8.S similarity index 100% rename from source/backend/arm82/asm/arm64/low_memory/MNNPackedMatMulRemainFP16_int8.S rename to source/backend/arm82/asm/arm64/normal_memory/MNNPackedMatMulRemainFP16_int8.S diff --git a/source/backend/coreml/backend/CoreMLBackend.cpp b/source/backend/coreml/backend/CoreMLBackend.cpp index 0a8ee125e..8342e68dd 100644 --- a/source/backend/coreml/backend/CoreMLBackend.cpp +++ b/source/backend/coreml/backend/CoreMLBackend.cpp @@ -300,7 +300,7 @@ namespace MNN { CoreMLRuntime::~CoreMLRuntime() {} - Backend* CoreMLRuntime::onCreate(const BackendConfig* config) const { + Backend* CoreMLRuntime::onCreate(const BackendConfig* config, Backend* origin) const { return new CoreMLBackend(this); } diff --git a/source/backend/coreml/backend/CoreMLBackend.hpp b/source/backend/coreml/backend/CoreMLBackend.hpp index 121b18192..b9136690b 100644 --- a/source/backend/coreml/backend/CoreMLBackend.hpp +++ b/source/backend/coreml/backend/CoreMLBackend.hpp @@ -26,7 +26,7 @@ namespace MNN { CoreMLRuntime(const Backend::Info& info); virtual ~CoreMLRuntime(); virtual CompilerType onGetCompilerType() const override; - virtual Backend* onCreate(const BackendConfig* conf) const override; + virtual Backend* onCreate(const BackendConfig* conf, Backend* origin) const override; virtual void onGabageCollect(int level) override; virtual std::pair onGetCache() override { return std::make_pair(mCacheBuffer, mCacheSize); diff --git a/source/backend/cpu/CMakeLists.txt b/source/backend/cpu/CMakeLists.txt index 41426c66c..e37ae3e55 100644 --- a/source/backend/cpu/CMakeLists.txt +++ b/source/backend/cpu/CMakeLists.txt @@ -24,6 +24,10 @@ if(MNN_LOW_MEMORY) target_compile_options(MNNCPU PRIVATE -DMNN_LOW_MEMORY) endif() +if(MNN_CPU_WEIGHT_DEQUANT_GEMM) + target_compile_options(MNNCPU 
PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM) +endif() + # X86_64 AVX/SSE if (MNN_USE_SSE) include(${CMAKE_CURRENT_LIST_DIR}/x86_x64/CMakeLists.txt) diff --git a/source/backend/cpu/CPUAttention.cpp b/source/backend/cpu/CPUAttention.cpp index 8a5a89ec3..7f4c6ff44 100644 --- a/source/backend/cpu/CPUAttention.cpp +++ b/source/backend/cpu/CPUAttention.cpp @@ -30,22 +30,51 @@ namespace MNN { template -static void pack_query(Tensor* query, char* pack_q, int mNumHead, int mHeadDim, int eP, int seq_len, int h, float q_scale) { - T * query_src = query->host(); - T * query_dst = reinterpret_cast(pack_q); - for (int i = 0; i < seq_len; i++) { - int out_index = i / eP; - int in_index = i % eP; - for (int j = 0; j < mHeadDim; j++) { - query_dst[out_index * mHeadDim * eP + j * eP + in_index] = query_src[i * mNumHead * mHeadDim + h * mHeadDim + j] * q_scale; +void CPUAttention::pack_query(Tensor* query, char* pack_q, char* sum_q, int seq_len, int h, float q_scale) { + if (mUseGemmInt8) { // Shape of Query: numhead, [seqlen/eP8, headdim/lP8, eP8, lP8] + mMinQ[h] = query->host()[h * mHeadDim]; + mMaxQ[h] = query->host()[h * mHeadDim]; + for (int i = 0; i < seq_len; i++) { + T * query_src = query->host() + i * mNumHead * mHeadDim + h * mHeadDim; + for (int j = 0; j < mHeadDim; j++) { + mMinQ[h] = ALIMIN(mMinQ[h], query_src[j]); + mMaxQ[h] = ALIMAX(mMaxQ[h], query_src[j]); + } + } + mQueryScale[h] = (mMaxQ[h] - mMinQ[h]) / 255.0f; + mQueryZeroPoint[h] = -255.0f * mMinQ[h] / (mMaxQ[h] - mMinQ[h]) - 128.0; + for (int i = 0; i < seq_len; i++) { + T * query_src = query->host() + i * mNumHead * mHeadDim + h * mHeadDim; + float sumQ = 0; + int out_index = i / eP8; + int in_index = i % eP8; + for (int j = 0; j < mHeadDim; j++) { + int a = j / lP8; + int b = j % lP8; + int quant_res = (int)roundf(query_src[j] / mQueryScale[h] + mQueryZeroPoint[h]); + sumQ += quant_res; + *((int8_t*)pack_q + out_index * UP_DIV(mHeadDim, lP8) * eP8 * lP8 + a * eP8 * lP8 + in_index * lP8 + b) = quant_res; + } + *((float*)sum_q + out_index * eP8 + in_index) = sumQ * mQueryScale[h]; + } + } + else { + T * query_src = query->host(); + T * query_dst = reinterpret_cast(pack_q); + for (int i = 0; i < seq_len; i++) { + int out_index = i / eP; + int in_index = i % eP; + for (int j = 0; j < mHeadDim; j++) { + query_dst[out_index * mHeadDim * eP + j * eP + in_index] = query_src[i * mNumHead * mHeadDim + h * mHeadDim + j] * q_scale; + } } } } template -static void unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len, int unit) { +void CPUAttention::unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len) { float * dst = unpack_qk_dst; - T * src = (T *)(pack_qk_src); + T * src = (T *)(pack_qk_src); // [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len] for (int i = 0; i < seq_len; i++) { for (int j = 0; j < kv_seq_len; j++) { @@ -119,6 +148,11 @@ ErrorCode CPUAttention::onResize(const std::vector& inputs, const std:: mThreadNum = ((CPUBackend *)backend())->threadNumber(); unit = core->pack; bytes = core->bytes; + int qkvQuantOptions = static_cast(backend())->getRuntime()->hint().qkvQuantOption; + mUseGemmInt8 = (qkvQuantOptions == 4); + if (mUseGemmInt8) { + static_cast(backend())->int8Functions()->MNNGetGemmUnit(&hP8, &lP8, &eP8); + } auto query = inputs[0]; auto key = inputs[1]; int seq_len = query->shape()[1]; @@ -126,12 +160,28 @@ ErrorCode CPUAttention::onResize(const std::vector& inputs, const std:: mHeadDim = query->shape()[3]; mKvNumHead = key->shape()[2]; 
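/* Editor's sketch (not part of the patch): the new pack_query() path above quantizes each
 * query head asymmetrically to 8 bits before the int8 GEMM, using
 *     scale     = (max - min) / 255
 *     zeroPoint = -255 * min / (max - min) - 128
 * so that round(x / scale + zeroPoint) maps min to -128 and max to 127. The per-row sum of the
 * quantized values (times scale) is also stored; the diff later passes it as srcKernelSum to the
 * int8 GEMM post-processing. A minimal standalone illustration with hypothetical names, assuming
 * (as the patch does) that the row is not constant:
 */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

static void quantizeRowAsymmetric(const float* src, int n, std::vector<int8_t>& dst,
                                  float& scale, float& zeroPoint, float& scaledSum) {
    float minV = src[0], maxV = src[0];
    for (int i = 1; i < n; ++i) {
        minV = std::min(minV, src[i]);
        maxV = std::max(maxV, src[i]);
    }
    if (maxV == minV) {
        maxV = minV + 1e-6f; // guard added for this sketch only; pack_query assumes max > min
    }
    scale = (maxV - minV) / 255.0f;                       // width of one quantization step
    zeroPoint = -255.0f * minV / (maxV - minV) - 128.0f;  // shifts the range onto [-128, 127]
    dst.resize(n);
    float sumQ = 0.0f;
    for (int i = 0; i < n; ++i) {
        int q = (int)std::round(src[i] / scale + zeroPoint);
        dst[i] = (int8_t)std::max(-128, std::min(127, q));
        sumQ += q;
    }
    scaledSum = sumQ * scale; // mirrors what pack_query stores into sum_q for the GEMM bias terms
}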
mKVCacheManager->onResize(mKvNumHead, mHeadDim); - mPackQ.reset(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP), mHeadDim, eP})); - mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit})); - backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC); - backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC); - backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC); - backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC); + if (mUseGemmInt8) { + mPackQ.reset(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP8), UP_DIV(mHeadDim, lP8), eP8 * lP8})); + mSumQ.reset(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP8), eP8})); + mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit})); + backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC); + backend()->onAcquireBuffer(mSumQ.get(), Backend::DYNAMIC); + backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mSumQ.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC); + mMinQ.resize(mNumHead); + mMaxQ.resize(mNumHead); + mQueryScale.resize(mNumHead); + mQueryZeroPoint.resize(mNumHead); + } else { + mPackQ.reset(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP), mHeadDim, eP})); + mPackQKV.reset(Tensor::createDevice({mThreadNum, UP_DIV(mHeadDim, unit), seq_len, unit})); + backend()->onAcquireBuffer(mPackQ.get(), Backend::DYNAMIC); + backend()->onAcquireBuffer(mPackQKV.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mPackQ.get(), Backend::DYNAMIC); + backend()->onReleaseBuffer(mPackQKV.get(), Backend::DYNAMIC); + } return NO_ERROR; } @@ -179,12 +229,12 @@ ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std: // Temporary tensors for intermediate results std::shared_ptr packQK(Tensor::createDevice({mThreadNum, UP_DIV(kv_seq_len, unit), seq_len, unit})); std::shared_ptr unpackQK(Tensor::createDevice({mThreadNum, seq_len, kv_seq_len})); - std::shared_ptr softmaxQK(Tensor::createDevice({mThreadNum, seq_len, kv_seq_len})); + std::shared_ptr softmMaxQ(Tensor::createDevice({mThreadNum, seq_len, kv_seq_len})); std::shared_ptr newPackQK(Tensor::createDevice({mThreadNum, UP_DIV(seq_len, eP), kv_seq_len, eP})); std::shared_ptr dequantV(Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), kv_seq_len, hP})); backend()->onAcquireBuffer(packQK.get(), Backend::STATIC); backend()->onAcquireBuffer(unpackQK.get(), Backend::STATIC); - backend()->onAcquireBuffer(softmaxQK.get(), Backend::STATIC); + backend()->onAcquireBuffer(softmMaxQ.get(), Backend::STATIC); backend()->onAcquireBuffer(newPackQK.get(), Backend::STATIC); if (quant_value) { backend()->onAcquireBuffer(dequantV.get(), Backend::STATIC); @@ -194,48 +244,100 @@ ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std: std::function mCompute = [=](int tId) { auto pack_q = mPackQ->host() + tId * UP_DIV(seq_len, eP) * mHeadDim * eP * bytes; auto pack_qk = packQK->host() + tId * UP_DIV(kv_seq_len, unit) * seq_len * unit * bytes; + char * sum_q = nullptr; auto unpack_qk = unpackQK->host() + tId * seq_len * kv_seq_len; - auto softmax_qk = softmaxQK->host() + tId * seq_len * kv_seq_len; + auto softmax_qk = softmMaxQ->host() + tId * seq_len * kv_seq_len; auto new_pack_qk = newPackQK->host() + tId * UP_DIV(seq_len, eP) * kv_seq_len * eP * bytes; auto pack_qkv = mPackQKV->host() + tId * UP_DIV(mHeadDim, unit) * seq_len * unit * bytes; auto QxK = 
quant_key ? core->MNNPackedMatMul_int8 : core->MNNPackedMatMul; auto QxK_remain = quant_key ? core->MNNPackedMatMulRemain_int8 : core->MNNPackedMatMulRemain; int head_index = tId * tileCount; + if (mUseGemmInt8) { + pack_q = mPackQ->host() + tId * UP_DIV(seq_len, eP8) * UP_DIV(mHeadDim, lP8) * eP8 * lP8; + sum_q = mSumQ->host() + tId * UP_DIV(seq_len, eP8) * eP8 * 4; + } for (int h = head_index; h < head_index + tileCount && h < mNumHead; h++) { int kv_h = h / group_size; char * key_addr = mKVCacheManager->addrOfKey(kv_h); - char * scale_addr = quant_key ? mKVCacheManager->addrOfScale(kv_h) : nullptr; - char * zero_point_addr = quant_key ? mKVCacheManager->addrOfZeroPoint(kv_h) : nullptr; - char * value_addr = quant_value ? dequantV->host() + kv_h * UP_DIV(mHeadDim, hP) * kv_seq_len * hP * bytes : mKVCacheManager->addrOfValue(kv_h); + char * scale_addr = mKVCacheManager->addrOfScale(kv_h); + char * zero_point_addr = mKVCacheManager->addrOfZeroPoint(kv_h); + char * key_sum_addr = mKVCacheManager->addrOfKeySum(kv_h); + char * value_addr = quant_value ? (dequantV->host() + kv_h * UP_DIV(mHeadDim, hP) * kv_seq_len * hP * bytes) : mKVCacheManager->addrOfValue(kv_h); if (bytes == 2) { - pack_query(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale); + pack_query(query, pack_q, sum_q, seq_len, h, q_scale); } else { - pack_query(query, pack_q, mNumHead, mHeadDim, eP, seq_len, h, q_scale); + pack_query(query, pack_q, sum_q, seq_len, h, q_scale); } // query @ key - int loop_e = seq_len / eP; - int remain = seq_len % eP; - size_t shapeParameters[7] = {(size_t)eP * bytes, (size_t)mHeadDim, (size_t)kv_seq_len, (size_t)seq_len * unit * bytes, 0, 0, 0}; - for (int i = 0 ; i < loop_e; i++) { - QxK((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_addr, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr); + if (mUseGemmInt8) { + auto GemmInt8Kernel = static_cast(backend())->int8Functions()->Int8GemmKernel; + if (bytes == 2 && unit == 8) { + GemmInt8Kernel = static_cast(backend())->int8Functions()->MNNGemmInt8AddBiasScale_Unit_FP16; + } + std::vector postScale(ROUND_UP(kv_seq_len, hP8), 0.0f); + for (int i = 0; i < kv_seq_len; i++) { + postScale[i] = ((float*)scale_addr)[i] * mQueryScale[h] * q_scale; + } + std::vector weightQuantBias(ROUND_UP(kv_seq_len, hP8), 0.0f); + for (int i = 0; i < kv_seq_len; i++) { + weightQuantBias[i] = -((float*)scale_addr)[i] * ((float*)zero_point_addr)[i] * q_scale; + } + std::vector biasFloat(ROUND_UP(kv_seq_len, hP8), 0.0f); + for (int i = 0; i < kv_seq_len; i++) { + biasFloat[i] = -mQueryScale[h] * mQueryZeroPoint[h] * ((float*)key_sum_addr)[i] * q_scale; + } + QuanPostTreatParameters post; + post.bias = nullptr; + post.biasFloat = biasFloat.data(); + post.blockNum = 1; + post.extraBias = nullptr; + post.extraScale = nullptr; + post.fp32minmax = nullptr; + post.scale = postScale.data(); + post.useInt8 = false; + post.weightQuanBias = weightQuantBias.data(); + int N = UP_DIV(seq_len, eP8); + for (int i = 0; i < N; i++) { + int realcount = ALIMIN(eP8, seq_len - i * eP8); + post.srcKernelSum = (float*)((char*)sum_q + i * eP8 * 4); + GemmInt8Kernel( + (int8_t*)pack_qk + i * eP8 * unit * bytes, + (int8_t*)pack_q + i * ROUND_UP(mHeadDim, lP8) * eP8, + (int8_t*)key_addr, + UP_DIV(mHeadDim, lP8), + seq_len * unit * bytes, + UP_DIV(kv_seq_len, unit), + &post, + realcount + ); + } + } + else { + int loop_e = seq_len / eP; + int remain = seq_len % eP; + size_t shapeParameters[7] = {(size_t)eP * 
bytes, (size_t)mHeadDim, (size_t)kv_seq_len, (size_t)seq_len * unit * bytes, 0, 0, 0}; + for (int i = 0 ; i < loop_e; i++) { + QxK((float*)(pack_qk + (i * eP * unit) * bytes), (float*)(pack_q + (i * mHeadDim * eP) * bytes), (float*)key_addr, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr); + } + QxK_remain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_addr, remain, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr); } - QxK_remain((float*)(pack_qk + (loop_e * eP * unit) * bytes), (float*)(pack_q + (loop_e * mHeadDim * eP) * bytes), (float*)key_addr, remain, shapeParameters, nullptr, nullptr, (float*)scale_addr, (float*)zero_point_addr); // qk: [kv_seq_len/unit, seq_len, unit] -> [seq_len, kv_seq_len] -> [seq_len/eP, kv_seq_len, eP] if(bytes == 2) { - unpack_QK(unpack_qk, pack_qk, seq_len, kv_seq_len, unit); + unpack_QK(unpack_qk, pack_qk, seq_len, kv_seq_len); mask_QK(unpack_qk, seq_len, kv_seq_len, mScale, std::numeric_limits::lowest(), mask->host(), float_mask); softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len); pack_QK(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP); } else { - unpack_QK(unpack_qk, pack_qk, seq_len, kv_seq_len, unit); + unpack_QK(unpack_qk, pack_qk, seq_len, kv_seq_len); mask_QK(unpack_qk, seq_len, kv_seq_len, mScale, std::numeric_limits::lowest(), mask->host(), float_mask); softmax_QK(softmax_qk, unpack_qk, seq_len, kv_seq_len); pack_QK(new_pack_qk, softmax_qk, seq_len, kv_seq_len, eP); } // qk @ v - shapeParameters[1] = kv_seq_len; - shapeParameters[2] = mHeadDim; + size_t shapeParameters[7] = {(size_t)eP * bytes, (size_t)kv_seq_len, (size_t)mHeadDim, (size_t)seq_len * unit * bytes, 0, 0, 0}; shapeParameters[5] = quant_value ? 
0 : (max_len - kv_seq_len) * hP * bytes; + int loop_e = seq_len / eP; + int remain = seq_len % eP; for (int i = 0 ; i < loop_e; i++) { core->MNNPackedMatMul((float*)(pack_qkv + (i * eP * unit) * bytes), (float*)(new_pack_qk + (i * kv_seq_len * eP) * bytes), (float*)value_addr, shapeParameters, nullptr, nullptr, nullptr, nullptr); } @@ -257,7 +359,7 @@ ErrorCode CPUAttention::onExecute(const std::vector& inputs, const std: backend()->onReleaseBuffer(packQK.get(), Backend::STATIC); backend()->onReleaseBuffer(unpackQK.get(), Backend::STATIC); - backend()->onReleaseBuffer(softmaxQK.get(), Backend::STATIC); + backend()->onReleaseBuffer(softmMaxQ.get(), Backend::STATIC); backend()->onReleaseBuffer(newPackQK.get(), Backend::STATIC); if (quant_value){ backend()->onReleaseBuffer(dequantV.get(), Backend::STATIC); @@ -277,10 +379,13 @@ bool CPUAttention::onClone(Backend* bn, const Op* op, Execution** dst) { CPUAttention::CPUAttention(Backend *backend, bool kv_cache) : Execution(backend), mKVCache(kv_cache) { if (mKVCache) { + mPackQ.reset(Tensor::createDevice({1, 1, 1, 1})); + mPackQKV.reset(Tensor::createDevice({1, 1, 1, 1})); MNN::KVCacheManager::KVCacheConfig kvconfig; - int kvcacheQuantOptions = static_cast(backend)->getRuntime()->hint().kvcacheQuantOption; - kvconfig.mQuantKey = (kvcacheQuantOptions & 1); - kvconfig.mQuantValue = ((kvcacheQuantOptions >> 1) & 1); + int qkvQuantOptions = static_cast(backend)->getRuntime()->hint().qkvQuantOption; + kvconfig.mUseInt8Kernel = (qkvQuantOptions == 4); + kvconfig.mQuantKey = (qkvQuantOptions == 4) || (qkvQuantOptions & 1); + kvconfig.mQuantValue = (qkvQuantOptions == 4) || ((qkvQuantOptions >> 1) & 1); kvconfig.mKVCacheDir = static_cast(backend)->getRuntime()->hint().kvcacheDirPath; kvconfig.mKVCacheSizeLimit = static_cast(backend)->getRuntime()->hint().kvcacheSizeLimit; kvconfig.mExpandChunk = 64; @@ -305,4 +410,4 @@ REGISTER_CPU_OP_CREATOR_TRANSFORMER(CPUAttentionCreator, OpType_Attention); } // namespace MNN -#endif // MNN_SUPPORT_TRANSFORMER_FUSE \ No newline at end of file +#endif // MNN_SUPPORT_TRANSFORMER_FUSE diff --git a/source/backend/cpu/CPUAttention.hpp b/source/backend/cpu/CPUAttention.hpp index 4aba816f3..a05b68712 100644 --- a/source/backend/cpu/CPUAttention.hpp +++ b/source/backend/cpu/CPUAttention.hpp @@ -29,12 +29,17 @@ class CPUAttention : public Execution { bool mIsPrefill = true; bool mIsFirstPrefill = true; bool mKVCache = true; + bool mUseGemmInt8 = false; int bytes = 4; int mThreadNum = 1;; - int eP, lP, hP, unit; + int eP, lP, hP, unit; // float matmul packing + int eP8, lP8, hP8; // GemmInt8 packing int mNumHead, mKvNumHead, mHeadDim; - std::shared_ptr mPackQ, mPackQKV; + std::shared_ptr mPackQ, mPackQKV, mSumQ; std::shared_ptr mKVCacheManager = nullptr; + std::vector mMinQ, mMaxQ, mQueryScale, mQueryZeroPoint; + template void pack_query(Tensor* query, char* pack_q, char* sum_q, int seq_len, int h, float q_scale); + template void unpack_QK(float * unpack_qk_dst, char * pack_qk_src, int seq_len, int kv_seq_len); }; } // namespace MNN diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index 99156a447..dd3401dcf 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -37,6 +37,7 @@ #include "x86_x64/AVX2Backend.hpp" #endif +#define MNN_CPU_MAX_BUFFER_INDEX 2 #define MNN_CPU_CHECK_NAN 1 #define MNN_CPU_USE_DEFAULT_BACKEND 4 namespace MNN { @@ -208,7 +209,12 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful } 
CPURuntime::CPURuntime(const Backend::Info& info) { - mStaticAllocator.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createDefault())); + auto rawAlloc = BufferAllocator::Allocator::createDefault(); + mStaticAllocator.reset(new EagerBufferAllocator(rawAlloc)); + mDynamic.resize(MNN_CPU_MAX_BUFFER_INDEX); + for (auto& buf : mDynamic) { + buf.root = rawAlloc; + } mThreadNumber = info.numThread; mPower = BackendConfig::Power_Normal; mMemory = BackendConfig::Memory_Normal; @@ -231,17 +237,49 @@ CPURuntime:: ~ CPURuntime() { } float CPURuntime::onGetMemoryInMB() { auto staticMemoryInMB = mStaticAllocator->totalSize() / 1024.0f / 1024.0f; - return staticMemoryInMB; + float dynamicMemoryInMB = 0.0f; + for (auto& buf : mDynamic) { + dynamicMemoryInMB += buf.currentSize / 1024.0f / 1024.0f; + } + return staticMemoryInMB + dynamicMemoryInMB; } bool CPURuntime::onCheckInfo(Backend::Info& info) const { info.numThread = mThreadNumber; return true; } +SingleBufferWithAllocator* CPURuntime::buffer(int index) const { + if (mDynamicMmap.empty()) { + return mDynamic.data() + index; + } + return mDynamicMmap.data() + index; +} -Backend* CPURuntime::onCreate(const BackendConfig* config) const { +Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) const { + if (hint().midMemoryPath.size() > 0) { + if (mDynamicMmap.empty()) { + // Only support set featuremap dir once + mDynamicMmap.resize(2); + auto mmapMem = BufferAllocator::Allocator::createMmap(hint().midMemoryPath.c_str(), "dynamic"); + for (auto& buf : mDynamicMmap) { + buf.root = mmapMem; + } + } + } + if (hint().weightMemoryPath.size() > 0) { + if (nullptr == mStaticAllocatorCache.get()) { + // Only support set weightmap dir once + mStaticAllocatorCache = mStaticAllocator; + auto mmapMem = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "static"); + mStaticAllocator.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024)); + } + } auto precision = mPrecision; auto memory = mMemory; size_t flags = mFlags; + if (nullptr != origin) { + auto cpuBn = static_cast(origin); + mSharedDmaInfo = cpuBn->mDmaInfo; + } _resetGroupCompute(); if (nullptr != config) { precision = config->precision; @@ -251,30 +289,36 @@ Backend* CPURuntime::onCreate(const BackendConfig* config) const { #ifdef LOG_VERBOSE MNN_PRINT("cpu backend was created by runtime:%p\n", this); #endif - + CPUBackend* res = nullptr; + do { #ifdef MNN_USE_ARMV82 - auto core = MNNGetCoreFunctions(); - if (core->supportFp16arith && precision == BackendConfig::Precision_Low) { - return new Arm82Backend(this, memory); - } + auto core = MNNGetCoreFunctions(); + if (core->supportFp16arith && precision == BackendConfig::Precision_Low) { + res = new Arm82Backend(this, memory); + break; + } #endif #ifdef MNN_SUPPORT_BF16 - if (precision == BackendConfig::Precision_Low_BF16 && BF16Functions::get()) { - auto res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU_EXTENSION, 0); - res->mCoreFunctions = BF16Functions::get(); - return res; - } + if (precision == BackendConfig::Precision_Low_BF16 && BF16Functions::get()) { + res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU_EXTENSION, 0); + res->mCoreFunctions = BF16Functions::get(); + break; + } #endif - if (flags == MNN_CPU_USE_DEFAULT_BACKEND) { - return new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, 0); - } + if (flags == MNN_CPU_USE_DEFAULT_BACKEND) { + res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, 0); + break; + } #ifdef MNN_USE_SSE - if 
(AVX2Backend::isValid()) { - return new AVX2Backend(this, memory, flags); - } + if (AVX2Backend::isValid()) { + res = new AVX2Backend(this, memory, flags); + break; + } #endif - - return new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, flags); + res = new CPUBackend(this, precision, memory, MNN_FORWARD_CPU, flags); + } while (false); + mSharedDmaInfo = nullptr; + return res; } int CPURuntime::onGetRuntimeStatus(RuntimeStatus statusEnum) const { @@ -298,6 +342,11 @@ int CPURuntime::onGetRuntimeStatus(RuntimeStatus statusEnum) const { void CPURuntime::onGabageCollect(int level) { mStaticAllocator->release(false); + if (level >= 100) { + for (auto& buf : mDynamic) { + buf.release(); + } + } } @@ -339,25 +388,34 @@ bool CPUBackend::addCreator(OpType t, Creator* c) { map->insert(std::make_pair(t, c)); return true; } - +BufferAllocator* CPURuntime::createDynamicBufferAlloctor(int index) const { + if (hint().memoryAllocatorType == Runtime::Allocator_Defer) { + return new DeferBufferAllocator(buffer(index)); + } + if (nullptr != mStaticAllocatorCache.get()) { + return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocatorCache.get())); + } + return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get())); +} CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type, size_t flags) : Backend(type) { #ifdef LOG_VERBOSE MNN_PRINT("cpu backend create\n"); #endif mMemory = memory; mRuntime = const_cast(runtime); - std::shared_ptr defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get())); - if (mRuntime->hint().memoryAllocatorType == Runtime::Allocator_Defer) { - mDynamicAllocator.reset(new DeferBufferAllocator(defaultAlloc)); + auto dynamicAlloc = mRuntime->mSharedDmaInfo; + if (nullptr == dynamicAlloc.get()) { + mDmaInfo.reset(new CPURuntime::DynamicAllocator); + mDmaInfo->mDynamicAllocator.reset(mRuntime->createDynamicBufferAlloctor(0)); + mDmaInfo->mCurrentDynamicAllocator = mDmaInfo->mDynamicAllocator.get(); } else { - mDynamicAllocator.reset(new EagerBufferAllocator(defaultAlloc)); + mDmaInfo = dynamicAlloc; } - mCurrentDynamicAllocator = mDynamicAllocator.get(); mStaticAllocator = runtime->mStaticAllocator; mPrecisionMode = precision; mCoreFunctions = MNNGetCoreFunctions(); mInt8CoreFunctions = MNNGetInt8CoreFunctions(); - mCacheGroup.resize(2); + mCacheGroup.resize(MNN_CPU_MAX_BUFFER_INDEX); for (int i=0; imDynamicAllocator->apply(); + if (nullptr != mDmaInfo->mDynamicAllocatorBackup.get()) { + mDmaInfo->mDynamicAllocatorBackup->apply(); + } +} void CPUBackend::onExecuteBegin() const { + _resetDynamicMemory(); mRuntime->onConcurrencyBegin(); } @@ -377,23 +442,20 @@ void CPUBackend::onExecuteEnd() const { } void CPUBackend::onResizeBegin() { - mCurrentDynamicAllocator->reset(); + mDmaInfo->mCurrentDynamicAllocator->reset(); } bool CPUBackend::onSelectDynamicAllocator(int index, int maxIndex) { if (maxIndex > 2) { return false; } - if (maxIndex == 2 && mDynamicAllocatorBackup.get() == nullptr) { - if (mRuntime->hint().memoryAllocatorType == Runtime::Allocator_Defer) { - mDynamicAllocatorBackup.reset(new DeferBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get()))); - } else { - mDynamicAllocatorBackup.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticAllocator.get()))); - } + if (maxIndex == 2 && mDmaInfo->mDynamicAllocatorBackup.get() == nullptr) { + 
mDmaInfo->mDynamicAllocatorBackup.reset(mRuntime->createDynamicBufferAlloctor(1)); } if (1 == index) { - mCurrentDynamicAllocator = mDynamicAllocatorBackup.get(); + mDmaInfo->mCurrentDynamicAllocator = mDmaInfo->mDynamicAllocatorBackup.get(); } else { - mCurrentDynamicAllocator = mDynamicAllocator.get(); + mRuntime->buffer(0)->release(); + mDmaInfo->mCurrentDynamicAllocator = mDmaInfo->mDynamicAllocator.get(); } mCache = mCacheGroup[index].get(); return true; @@ -401,7 +463,11 @@ bool CPUBackend::onSelectDynamicAllocator(int index, int maxIndex) { ErrorCode CPUBackend::onResizeEnd() { getCache()->release(); - return mCurrentDynamicAllocator->compute(); + auto code = mDmaInfo->mCurrentDynamicAllocator->compute(); + if (NO_ERROR != code) { + return code; + } + return NO_ERROR; } Backend::MemObj* CPUBackend::allocBuffer(size_t size, Tensor* dest, StorageType storageType) { @@ -431,11 +497,11 @@ Backend::MemObj* CPUBackend::allocBuffer(size_t size, Tensor* dest, StorageType break; } case DYNAMIC: { - chunk = mCurrentDynamicAllocator->alloc(size, false); + chunk = mDmaInfo->mCurrentDynamicAllocator->alloc(size, false); break; } case DYNAMIC_SEPERATE: { - chunk = mCurrentDynamicAllocator->alloc(size, true); + chunk = mDmaInfo->mCurrentDynamicAllocator->alloc(size, true); break; } default: @@ -453,7 +519,7 @@ Backend::MemObj* CPUBackend::allocBuffer(size_t size, Tensor* dest, StorageType if (storageType == STATIC) { res = new CPUMemObj(mStaticAllocator.get(), chunk, size); } else { - res = new CPUMemObj(mCurrentDynamicAllocator, chunk, size); + res = new CPUMemObj(mDmaInfo->mCurrentDynamicAllocator, chunk, size); chunk.attach(dest); } if (chunk.ptr()) { @@ -591,8 +657,11 @@ const Runtime* CPUBackend::getRuntime() { } bool CPUBackend::onClearBuffer() { + if (nullptr != mRuntime->mStaticAllocatorCache.get()) { + mStaticAllocator = mRuntime->mStaticAllocatorCache; + } mCache->reset(); - mCurrentDynamicAllocator->release(true); + mDmaInfo->mCurrentDynamicAllocator->release(true); return true; } @@ -606,9 +675,9 @@ std::pair CPUBackend::multiThreadDivide(int size) const { return std::make_pair(sizeDivide, scheduleNumber); } void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { + _resetDynamicMemory(); auto& srcBuffer = srcTensor->buffer(); auto& dstBuffer = dstTensor->buffer(); - if (srcBuffer.dimensions != dstBuffer.dimensions ) { if (srcBuffer.dim[srcBuffer.dimensions - 1].extent != 1 && dstBuffer.dim[dstBuffer.dimensions - 1].extent != 1) { MNN_ERROR("srcBuffer dimension not equal to dstBuffer, can't copy buffer\n"); diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp index 1286df907..b4c9843d0 100644 --- a/source/backend/cpu/CPUBackend.hpp +++ b/source/backend/cpu/CPUBackend.hpp @@ -20,11 +20,16 @@ namespace MNN { class CPURuntime : public Runtime { public: + struct DynamicAllocator { + std::shared_ptr mDynamicAllocator; + std::shared_ptr mDynamicAllocatorBackup; + BufferAllocator* mCurrentDynamicAllocator = nullptr; + }; friend class CPUBackend; CPURuntime(const Backend::Info& info); virtual ~ CPURuntime(); int onGetRuntimeStatus(RuntimeStatus statusEnum) const override; - virtual Backend* onCreate(const BackendConfig* config) const override; + virtual Backend* onCreate(const BackendConfig* config, Backend* origin) const override; virtual void onReset(int numberThread, const BackendConfig* config, bool full) override; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; @@ -43,10 +48,13 @@ 
class CPURuntime : public Runtime { return mThreadOpen; } #endif + SingleBufferWithAllocator* buffer(int index) const; + BufferAllocator* createDynamicBufferAlloctor(int index) const; + private: void _bindCPUCore() const; void _resetThreadPool(); - std::shared_ptr mStaticAllocator; + mutable std::shared_ptr mStaticAllocator; int mThreadNumber; #ifdef MNN_USE_THREAD_POOL mutable int mTaskIndex = -1; @@ -64,6 +72,10 @@ class CPURuntime : public Runtime { static Backend*(*gExtraCreate)(const Runtime* runtime); size_t mFlags = 0; mutable int mCurrentTID = 0; + mutable std::vector mDynamic; + mutable std::vector mDynamicMmap; + mutable std::shared_ptr mSharedDmaInfo; + mutable std::shared_ptr mStaticAllocatorCache; }; struct CoreFunctions; struct CoreInt8Functions; @@ -122,6 +134,7 @@ class CPUBackend : public Backend { const CoreInt8Functions* int8Functions() const { return mInt8CoreFunctions; } + void _resetDynamicMemory() const; public: class Creator { public: @@ -141,7 +154,7 @@ class CPUBackend : public Backend { #endif BufferAllocator* getBufferAllocator(bool defer_allocator = true) const { - return mCurrentDynamicAllocator; + return mDmaInfo->mCurrentDynamicAllocator; } BackendConfig::MemoryMode memoryMode() const { @@ -164,22 +177,19 @@ class CPUBackend : public Backend { static DataType getDataType(const Tensor* tensor); friend class CPURuntime; - protected: MemObj* allocBuffer(size_t size, Tensor* dest, StorageType storageType); CoreFunctions* mCoreFunctions; CoreInt8Functions* mInt8CoreFunctions; private: + std::shared_ptr mDmaInfo; std::shared_ptr mStaticAllocator; - std::shared_ptr mDynamicAllocator; - std::shared_ptr mDynamicAllocatorBackup; CPURuntime* mRuntime; BackendConfig::PrecisionMode mPrecisionMode; BackendConfig::MemoryMode mMemory; static std::map* gCreator; CPUResizeCache* mCache; std::vector> mCacheGroup; - BufferAllocator* mCurrentDynamicAllocator = nullptr; }; /** execution cast wrapper. insert tensor cast dynamic. */ class CastWrapExecution : public Execution { diff --git a/source/backend/cpu/CPUCast.cpp b/source/backend/cpu/CPUCast.cpp index ad989f0f3..1bc72dbb1 100644 --- a/source/backend/cpu/CPUCast.cpp +++ b/source/backend/cpu/CPUCast.cpp @@ -21,13 +21,12 @@ ErrorCode CPUCastCreator::cast(const void* inputRaw, void* outputRaw, ConvertTyp int remain = number % pack; if (type == FlOAT_TO_INT8) { scale = (scale == 0.f ? 
0.f : 1.f / scale); - std::vector scales(pack, scale); - bn->int8Functions()->MNNFloat2Int8((float*)(inputRaw), (int8_t*)(outputRaw), c4Size, scales.data(), min, max, zero); + bn->int8Functions()->MNNFloat2Int8((float*)(inputRaw), (int8_t*)(outputRaw), c4Size, &scale, min, max, &zero, 0); if (remain > 0) { std::vector tempSrc(pack); std::vector tempDst(pack); ::memcpy(tempSrc.data(), (float*)(inputRaw) + c4Size * pack, remain * sizeof(float)); - bn->int8Functions()->MNNFloat2Int8(tempSrc.data(), tempDst.data(), 1, scales.data(), min, max, zero); + bn->int8Functions()->MNNFloat2Int8(tempSrc.data(), tempDst.data(), 1, &scale, min, max, &zero, 0); ::memcpy(static_cast(outputRaw) + c4Size * pack, tempDst.data(), remain * sizeof(int8_t)); } return NO_ERROR; diff --git a/source/backend/cpu/CPUConvolution.cpp b/source/backend/cpu/CPUConvolution.cpp index 109b4cc6a..eb34aa9c2 100644 --- a/source/backend/cpu/CPUConvolution.cpp +++ b/source/backend/cpu/CPUConvolution.cpp @@ -117,7 +117,6 @@ void CPUConvolution::MutableResourceInt8::updateInputOutputScale(std::vectormOutputCount; const int kernelNum = static_cast(mResource->mInt8WeightKernelSum.size()); auto biasData = mResource->mOriginBias->host(); auto alphaData = mResource->mOriginScale->host(); @@ -189,7 +188,6 @@ std::shared_ptr CPUConvolution::makeResourceInt8(B const int8_t* weightSrc = nullptr; int weightSize = 0; std::shared_ptr quanCommon; - resource->mOutputCount = outputCount; if (!ConvolutionCommon::getConvInt8Parameters(op, quanCommon, backend, weightSrc, weightSize, scalePtr, biasPtr, betaPtr)) { return nullptr; } @@ -254,174 +252,6 @@ std::shared_ptr CPUConvolution::makeResourceInt8(B return resource; } -void CPUConvolution::makeResource(Backend* backend, std::shared_ptr resource, const MNN::Op *op, std::shared_ptr resourceInt8) { - /* Used to compute weight quant scale and bias and weightKernelSum of type float. 
*/ - auto conv2d = op->main_as_Convolution2D(); - bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr); - MNN_ASSERT(quanBuffer || resourceInt8); - resource->backend = backend; - auto core = static_cast(backend)->functions(); - // common parameters - int outputCount = conv2d->common()->outputCount(); - int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY(); - int ocUp4 = ROUND_UP(outputCount, core->pack); - int8_t* weightOrigin; - - // Save weight quant scale and bias: wf=scale*wi+bias - resource->mDequantize.mScaleBias.reset(Tensor::createDevice({2 * ocUp4 * core->bytes})); - auto success = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC); - if (!success) { - MNN_ERROR("Alloc denquant scaleBias memory error\n"); - return; - } - auto alphaPtr = resource->mDequantize.mScaleBias->host(); - auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + ocUp4 * core->bytes); - ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes); - - std::shared_ptr quantCommon; - // Load quant scale and bias - if (quanBuffer) { - quantCommon = ConvolutionCommon::load(op, backend, false, true); - weightOrigin = quantCommon->weight.get(); // weight before reorder - - int h = quantCommon->alpha.size(); - if (core->bytes == 2) { - if (quantCommon->asymmetric) { - std::unique_ptr tmp(new int16_t[h]); - core->MNNFp32ToLowp(quantCommon->alpha.get(), tmp.get(), h); - for (int i=0; i< h/2; ++i) { - reinterpret_cast(alphaPtr)[i] = tmp[2 * i + 1]; - reinterpret_cast(biasPtr)[i] = tmp[2 * i]; - } - } else { - core->MNNFp32ToLowp(quantCommon->alpha.get(), reinterpret_cast(alphaPtr), h); - } - } else { - if (quantCommon->asymmetric) { - h = h / 2; - for (int i=0; ialpha.get()[2 * i + 1]; - biasPtr[i] = quantCommon->alpha.get()[2 * i]; - } - } else { - for (int i=0; ialpha.get()[i]; - biasPtr[i] = 0.f; - } - } - } - } else { - weightOrigin = resourceInt8->mWeightInt8->host(); - auto wZero = resourceInt8->mWeightQuantZero->host(); // has packed to outputUp4 - auto wScale = resourceInt8->mOriginScale->host(); - int h = ocUp4; - if (core->bytes == 2) { - std::unique_ptr tmp(new int16_t[h]); - core->MNNFp32ToLowp(wScale, tmp.get(), h); - for (int i=0; i< h; ++i) { - reinterpret_cast(alphaPtr)[i] = tmp[i]; - reinterpret_cast(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i]; - } - } else { - for (int i=0; i< h; ++i) { - alphaPtr[i] = wScale[i]; - biasPtr[i] = (-1.f) * wZero[i] * wScale[i]; - } - } - } - - // Compute float weightKernelSum - resource->mWeightKernelSum.reset(Tensor::createDevice({ocUp4 * 4})); - success = resource->backend->onAcquireBuffer(resource->mWeightKernelSum.get(), Backend::STATIC); - if (!success) { - MNN_ERROR("Alloc denquant mWeightKernelSum memory error\n"); - return; - } - auto weightKernelSum = resource->mWeightKernelSum->host(); - for (int i = 0; i < outputCount; ++i) { - int sum = 0; - for (int j = 0; j < LSize; ++j) { - sum = sum + static_cast(weightOrigin[j + i * LSize]); - } - if(core->bytes == 2) { - auto scale = reinterpret_cast(alphaPtr)[i]; - auto bias = reinterpret_cast(biasPtr)[i]; - weightKernelSum[i] = static_cast(sum) * scale + LSize * bias; - } else { - auto scale = alphaPtr[i]; - auto bias = biasPtr[i]; - weightKernelSum[i] = static_cast(sum) * scale + LSize * bias; - } - } -} - -void CPUConvolution::makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr resourceInt8) { - /* Used to compute weight quant scale and bias and weightKernelSum of 
type float. */ - bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr); - MNN_ASSERT(quanBuffer || resourceInt8); - auto core = static_cast(backend)->functions(); - // common parameters - int outputCount = conv2d->common()->outputCount(); - int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY(); - int ocUp4 = ROUND_UP(outputCount, core->pack); - int8_t* weightOrigin; - - // Save weight quant scale and bias: wf=scale*wi+bias - std::shared_ptr scaleBias(Tensor::createDevice({2 * ocUp4 * core->bytes})); - auto success = backend->onAcquireBuffer(scaleBias.get(), Backend::STATIC); - if (!success) { - MNN_ERROR("Alloc dequant scaleBias memory error\n"); - return; - } - auto alphaPtr = scaleBias->host(); - auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + ocUp4 * core->bytes); - ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes); - - // Load quant scale and bias - weightOrigin = resourceInt8->mWeightInt8->host(); - auto wZero = resourceInt8->mWeightQuantZero->host(); // has packed to outputUp4 - auto wScale = resourceInt8->mOriginScale->host(); - int h = ocUp4; - if (core->bytes == 2) { - std::unique_ptr tmp(new int16_t[h]); - core->MNNFp32ToLowp(wScale, tmp.get(), h); - for (int i=0; i< h; ++i) { - reinterpret_cast(alphaPtr)[i] = tmp[i]; - reinterpret_cast(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i]; - } - } else { - for (int i=0; i< h; ++i) { - alphaPtr[i] = wScale[i]; - biasPtr[i] = (-1.f) * wZero[i] * wScale[i]; - } - } - resourceInt8->mOriginScale = scaleBias; - - // Compute float weightKernelSum - resourceInt8->mWeightKernelSum.reset(Tensor::createDevice({ocUp4 * 4})); - success = backend->onAcquireBuffer(resourceInt8->mWeightKernelSum.get(), Backend::STATIC); - if (!success) { - MNN_ERROR("Alloc dequant mWeightKernelSum memory error\n"); - return; - } - auto weightKernelSum = resourceInt8->mWeightKernelSum->host(); - for (int i = 0; i < outputCount; ++i) { - int sum = 0; - for (int j = 0; j < LSize; ++j) { - sum = sum + static_cast(weightOrigin[j + i * LSize]); - } - if(core->bytes == 2) { - auto scale = reinterpret_cast(alphaPtr)[i]; - auto bias = reinterpret_cast(biasPtr)[i]; - weightKernelSum[i] = static_cast(sum) * scale + LSize * bias; - } else { - auto scale = alphaPtr[i]; - auto bias = biasPtr[i]; - weightKernelSum[i] = static_cast(sum) * scale + LSize * bias; - } - } -} - CPUConvolution::CPUConvolution(const Convolution2DCommon *convOp, Backend *b) : MNN::Execution(b), mCommon(convOp) { // Do nothing } diff --git a/source/backend/cpu/CPUConvolution.hpp b/source/backend/cpu/CPUConvolution.hpp index a34f68aad..8975f5963 100644 --- a/source/backend/cpu/CPUConvolution.hpp +++ b/source/backend/cpu/CPUConvolution.hpp @@ -69,12 +69,8 @@ class CPUConvolution : public Execution { bool mRelu; int mActBits; // quant bits - int mOutputCount; bool mUseConvQuan = true; bool mWeightAsymmetricQuant = true; -#ifdef MNN_USE_SSE - std::vector offsets; -#endif // Origin Attributes from net float mInputScale = 0.0f; float mOutputScale = 0.0f; @@ -82,6 +78,7 @@ class CPUConvolution : public Execution { int32_t mOutputZeroPoint; int8_t mClampMin; int8_t mClampMax; + bool mDynamicQuant = false; }; struct MutableResourceInt8 { MutableResourceInt8(std::shared_ptr res, Backend* backend); @@ -100,8 +97,6 @@ class CPUConvolution : public Execution { bool mValid; }; static std::shared_ptr makeResourceInt8(Backend *backend, const MNN::Op *op, int pack=4); - static void makeResource(Backend* backend, 
std::shared_ptr resource, const MNN::Op *op, std::shared_ptr resourceInt8 = nullptr); - static void makeResourceNew(Backend* backend, const Convolution2D* conv2d, std::shared_ptr resourceInt8); CPUConvolution(const Convolution2DCommon *convOp, Backend *b); virtual ~CPUConvolution() = default; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp index 0364ad58e..6a75b3c61 100644 --- a/source/backend/cpu/CPUDeconvolution.cpp +++ b/source/backend/cpu/CPUDeconvolution.cpp @@ -346,7 +346,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c } mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY, - strideX, threadNumber, src_width, src_height, plane, input, biasTensor, this, core, gcore, batch, outi8, scales, + strideX, threadNumber, src_width, src_height, plane, input, biasTensor, this, core, gcore, batch, outi8, scale, minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) { auto colBufferPtr = mTempOutput->host(); auto biasPtr = biasTensor->host(); @@ -391,7 +391,9 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c } core->MNNAxByClampBroadcastUnit((float*)dstZ, (float*)dstZ, (const float*)((uint8_t*)biasPtr + unitBytes * z), src_height * src_width * batch, 0, 0, 1, mPostParameters.data()); if (outi8) { - gcore->MNNFloat2Int8((float*)dstZ, (int8_t*)(outputPtr + z * float2Int8_step * core->pack), float2Int8_step, scales.data(), minValue, maxValue, zeroPoint); + float scaleOne = scale; + float zeroOne = zeroPoint; + gcore->MNNFloat2Int8((float*)dstZ, (int8_t*)(outputPtr + z * float2Int8_step * core->pack), float2Int8_step, &scaleOne, minValue, maxValue, &zeroOne, 0); } } }, threadNumber)); diff --git a/source/backend/cpu/CPUDynamicQuant.cpp b/source/backend/cpu/CPUDynamicQuant.cpp index 508d58627..ac4dd40c8 100644 --- a/source/backend/cpu/CPUDynamicQuant.cpp +++ b/source/backend/cpu/CPUDynamicQuant.cpp @@ -46,7 +46,7 @@ ErrorCode CPUDynamicQuant::onExecute(const std::vector &inputs, int pack = core->pack; std::vector qsVec(pack, quantScale); int sizeDiv = UP_DIV(size, pack); - int8core->MNNFloat2Int8(inputPtr, outputPtr, sizeDiv, qsVec.data(), -128, 127, (ssize_t)zeroPoint); + int8core->MNNFloat2Int8(inputPtr, outputPtr, sizeDiv, &quantScale, -128, 127, &zeroPoint, 0); float* scale = outputs[1]->host(); float* zeros = outputs[2]->host(); *scale = dequantScale; diff --git a/source/backend/cpu/CPUFloatToInt8.cpp b/source/backend/cpu/CPUFloatToInt8.cpp index 9a9329e5e..7770377c6 100644 --- a/source/backend/cpu/CPUFloatToInt8.cpp +++ b/source/backend/cpu/CPUFloatToInt8.cpp @@ -36,7 +36,7 @@ CPUFloatToInt8::CPUFloatToInt8(Backend* backend, const MNN::Op* param) : Executi memcpy(mScales->host(), scale->tensorScale()->data(), scaleLen * sizeof(float)); } - mZeroPoint = scale->zeroPoint(); + mZeroPoint = static_cast(scale->zeroPoint()); mClampMin = scale->clampMin(); mClampMax = scale->clampMax(); } @@ -78,7 +78,7 @@ ErrorCode CPUFloatToInt8::onExecute(const std::vector& inputs, const st const auto srcChannelPtr = inputDataPtr + tId * oc4Stride * pack; const auto scaleChannelPtr = scaleDataPtr + z * pack; auto dstChannlePtr = outputDataPtr + tId * oc4Stride * pack; - int8F->MNNFloat2Int8(srcChannelPtr, dstChannlePtr, oc4Stride, scaleChannelPtr, mClampMin, mClampMax, mZeroPoint); + int8F->MNNFloat2Int8(srcChannelPtr, dstChannlePtr, oc4Stride, 
scaleChannelPtr, mClampMin, mClampMax, &mZeroPoint, 1); } MNN_CONCURRENCY_END(); return NO_ERROR; diff --git a/source/backend/cpu/CPUFloatToInt8.hpp b/source/backend/cpu/CPUFloatToInt8.hpp index 7d26a90db..82ca68efe 100644 --- a/source/backend/cpu/CPUFloatToInt8.hpp +++ b/source/backend/cpu/CPUFloatToInt8.hpp @@ -22,7 +22,7 @@ class CPUFloatToInt8 : public Execution { private: std::shared_ptr mScales; - int8_t mZeroPoint; + float mZeroPoint; int8_t mClampMin; int8_t mClampMax; int mClipBits; diff --git a/source/backend/cpu/CPUImageProcess.cpp b/source/backend/cpu/CPUImageProcess.cpp index 078291c72..37d56b1b8 100644 --- a/source/backend/cpu/CPUImageProcess.cpp +++ b/source/backend/cpu/CPUImageProcess.cpp @@ -15,7 +15,6 @@ #include namespace MNN { -#define CACHE_SIZE 256 ErrorCode CPUImageProcess::onResize(const std::vector &inputs, const std::vector &outputs) { auto input = inputs[0]; diff --git a/source/backend/cpu/CPUProposal.cpp b/source/backend/cpu/CPUProposal.cpp index 84e67cdcb..6cc5ff4a4 100644 --- a/source/backend/cpu/CPUProposal.cpp +++ b/source/backend/cpu/CPUProposal.cpp @@ -16,12 +16,17 @@ #include namespace MNN { -CPUProposal::CPUProposal(Backend *backend, const Proposal *proposal) : Execution(backend), mProposal(proposal) { - auto ratioCount = mProposal->ratios()->float32s()->size(); - auto numScale = mProposal->scales()->float32s()->size(); +CPUProposal::CPUProposal(Backend *backend, const Proposal *proposal) : Execution(backend) { + auto ratioCount = proposal->ratios()->float32s()->size(); + auto numScale = proposal->scales()->float32s()->size(); mAnchors.reset(4 * ratioCount * numScale); + mCache.featStride = proposal->featStride(); + mCache.preNmsTopN = proposal->preNmsTopN(); + mCache.nmsThreshold = proposal->nmsThreshold(); + mCache.afterNmsTopN = proposal->afterNmsTopN(); + mCache.minSize = proposal->minSize(); - auto baseSize = mProposal->baseSize(); + auto baseSize = proposal->baseSize(); const auto cx = baseSize * 0.5f; const auto cy = baseSize * 0.5f; auto ratios = proposal->ratios()->float32s()->data(); @@ -117,11 +122,11 @@ ErrorCode CPUProposal::onExecute(const std::vector &inputs, const std: auto score = inputs[0]; auto boxes = inputs[1]; auto imInfo = inputs[2]; - auto featStride = mProposal->featStride(); - auto preNmsTopN = mProposal->preNmsTopN(); - auto nmsThreshold = mProposal->nmsThreshold(); - auto afterNmsTopN = mProposal->afterNmsTopN(); - auto minSize = mProposal->minSize(); + auto featStride = mCache.featStride; + auto preNmsTopN = mCache.preNmsTopN; + auto nmsThreshold = mCache.nmsThreshold; + auto afterNmsTopN = mCache.afterNmsTopN; + auto minSize = mCache.minSize; float* tmpScorePtr = (float*)mScoreBuffer.ptr(); // download diff --git a/source/backend/cpu/CPUProposal.hpp b/source/backend/cpu/CPUProposal.hpp index f002deb3c..8da27db4a 100644 --- a/source/backend/cpu/CPUProposal.hpp +++ b/source/backend/cpu/CPUProposal.hpp @@ -24,8 +24,15 @@ class CPUProposal : public Execution { virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + struct ProposalCache { + int32_t featStride; + int32_t preNmsTopN; + int32_t minSize; + int32_t afterNmsTopN; + float nmsThreshold; + }; private: - const Proposal *mProposal; + ProposalCache mCache; AutoStorage mAnchors; MemChunk mScoreBuffer; }; diff --git a/source/backend/cpu/KVCacheManager.cpp b/source/backend/cpu/KVCacheManager.cpp index 7804d3dd5..5fd8c1d37 100644 --- 
a/source/backend/cpu/KVCacheManager.cpp +++ b/source/backend/cpu/KVCacheManager.cpp @@ -13,7 +13,7 @@ namespace MNN { -// @brief Translate an address to a hex number string +// Translate an address to a hex number string static inline std::string addrToHex(void *addr) { std::string result = ""; uint64_t n = (uint64_t)addr; @@ -106,11 +106,27 @@ void KVCacheManager::unmapKVCache(size_t keySize, size_t valueSize) */ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) { /*=================================== Key ===================================*/ - if (mConfig.mQuantKey) { + if (mConfig.mUseInt8Kernel) { + auto new_key = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), UP_DIV(mHeadDim, lP8), hP8 * lP8}); + mBackend->onAcquireBuffer(new_key, Backend::STATIC); + for (int h = 0; h < mKvNumHead; h++) { + memcpy( + new_key->host() + h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8, + mPastKey->host() + h * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8, + UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8 + ); + } + mPastKey.reset(new_key); + } + else if (mConfig.mQuantKey) { auto new_key = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}); mBackend->onAcquireBuffer(new_key, Backend::STATIC); for (int h = 0; h < mKvNumHead; h++) { - memcpy(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP); + memcpy( + new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, + mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, + UP_DIV(oldMaxLength, hP) * mHeadDim * hP + ); } mPastKey.reset(new_key); } @@ -118,7 +134,11 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) { auto new_key = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP}); mBackend->onAcquireBuffer(new_key, Backend::STATIC); for (int h = 0; h < mKvNumHead; h++) { - memcpy(new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes); + memcpy( + new_key->host() + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, + mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, + UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes + ); } mPastKey.reset(new_key); } @@ -128,7 +148,11 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) { mBackend->onAcquireBuffer(new_value, Backend::STATIC); for (int h = 0; h < mKvNumHead; h++) { for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - memcpy(new_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP); + memcpy( + new_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, + mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, + oldMaxLength * hP + ); } } mPastValue.reset(new_value); @@ -138,7 +162,11 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) { mBackend->onAcquireBuffer(new_value, Backend::STATIC); for (int h = 0; h < mKvNumHead; h++) { for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - memcpy(new_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes); + memcpy( + new_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * 
mBytes, + mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, + oldMaxLength * hP * mBytes + ); } } mPastValue.reset(new_value); @@ -151,16 +179,35 @@ void KVCacheManager::expandKVCacheInMem(int oldMaxLength) { */ void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) { /*=================================== Key ===================================*/ + if (mConfig.mUseInt8Kernel) { + for (int h = 0; h < mKvNumHead; h++) { + memcpy( + mMapKeyAddr + h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8, + mPastKey->host() + h * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8, + UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8 + ); + } + mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC); + mPastKey.reset(); + } if (mConfig.mQuantKey) { for (int h = 0; h < mKvNumHead; h++) { - memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP); + memcpy( + mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, + mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, + UP_DIV(oldMaxLength, hP) * mHeadDim * hP + ); } mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC); mPastKey.reset(); } else { for (int h = 0; h < mKvNumHead; h++) { - memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes); + memcpy( + mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, + mPastKey->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, + UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes + ); } mBackend->onReleaseBuffer(mPastKey.get(), Backend::STATIC); mPastKey.reset(); @@ -169,7 +216,11 @@ void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) { if (mConfig.mQuantValue) { for (int h = 0; h < mKvNumHead; h++) { for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP); + memcpy( + mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, + mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, + oldMaxLength * hP + ); } } mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC); @@ -178,7 +229,11 @@ void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) { else { for (int h = 0; h < mKvNumHead; h++) { for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes); + memcpy( + mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, + mPastValue->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, + oldMaxLength * hP * mBytes + ); } } mBackend->onReleaseBuffer(mPastValue.get(), Backend::STATIC); @@ -189,14 +244,12 @@ void KVCacheManager::moveKVCacheFromMemToDisk(int oldMaxLength) { /* ** @brief Expand the size of kvcache files in disk */ -void KVCacheManager::expandKVCacheInDisk(int oldMaxLength) { - size_t oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); - size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 
1 : mBytes); - size_t keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); - size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); +void KVCacheManager::expandKVCacheInDisk(int oldMaxLength, int oldKeySize, int oldValueSize, int keySize, int valueSize) { // Step 1: Copy the old kvcache from files to temporary buffers in memory std::shared_ptr old_key, old_value; - if (mConfig.mQuantKey) { + if (mConfig.mUseInt8Kernel) { + old_key.reset(Tensor::createDevice({mKvNumHead, UP_DIV(oldMaxLength, hP8), UP_DIV(mHeadDim, lP8), hP8 * lP8})); + } else if (mConfig.mQuantKey) { old_key.reset(Tensor::createDevice({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP})); } else { old_key.reset(Tensor::createDevice({mKvNumHead, UP_DIV(oldMaxLength, hP), mHeadDim, hP})); @@ -216,25 +269,49 @@ void KVCacheManager::expandKVCacheInDisk(int oldMaxLength) { resetKVCacheFileSize(keySize, valueSize); mmapKVCache(keySize, valueSize); // Step 3: Move the kvcache from temporary buffers in memory to disk - if (mConfig.mQuantKey) { + if (mConfig.mUseInt8Kernel) { + for (int h = 0; h < mKvNumHead; h++) { + memcpy( + mMapKeyAddr + h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8, + old_key->host() + h * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8, + UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8 + ); + } + } else if (mConfig.mQuantKey) { for (int h = 0; h < mKvNumHead; h++) { - memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, old_key->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, UP_DIV(oldMaxLength, hP) * mHeadDim * hP); + memcpy( + mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP, + old_key->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP, + UP_DIV(oldMaxLength, hP) * mHeadDim * hP + ); } } else { for (int h = 0; h < mKvNumHead; h++) { - memcpy(mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, old_key->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes); + memcpy( + mMapKeyAddr + h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes, + old_key->host() + h * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes, + UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes + ); } } if (mConfig.mQuantValue) { for (int h = 0; h < mKvNumHead; h++) { for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, old_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, oldMaxLength * hP); + memcpy( + mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP, + old_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP, + oldMaxLength * hP + ); } } } else { for (int h = 0; h < mKvNumHead; h++) { for (int i = 0; i < UP_DIV(mHeadDim, hP); i++) { - memcpy(mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, old_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, oldMaxLength * hP * mBytes); + memcpy( + mMapValueAddr + (h * UP_DIV(mHeadDim, hP) + i) * mMaxLength * hP * mBytes, + old_value->host() + (h * UP_DIV(mHeadDim, hP) + i) * oldMaxLength * hP * mBytes, + oldMaxLength * hP * mBytes + ); } } } @@ -253,12 +330,22 @@ void KVCacheManager::onResize(int kv_num_head, int head_dim) { if (mThreadNum > mKvNumHead) { mThreadNum = mKvNumHead; } + if (mConfig.mUseInt8Kernel) { + 
static_cast(mBackend)->int8Functions()->MNNGetGemmUnit(&hP8, &lP8, &eP8); + } } void KVCacheManager::onAlloc(int kv_seq_len) { mMaxLength = kv_seq_len + mConfig.mExpandChunk; - size_t keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); - size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + size_t keySize = 0, valueSize = 0; + if (mConfig.mUseInt8Kernel) { + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8; + } else if (mConfig.mQuantKey) { + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP; + } else { + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes; + } + valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); /*============== Put the kvcache in disk ===========*/ if (mConfig.mKVCacheSizeLimit != -1 && keySize + valueSize > mConfig.mKVCacheSizeLimit) { createKVCacheFile(); @@ -268,7 +355,9 @@ void KVCacheManager::onAlloc(int kv_seq_len) { } /*============== Put the kvcache in memory ===========*/ else { - if (mConfig.mQuantKey) { + if (mConfig.mUseInt8Kernel) { + mPastKey.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), UP_DIV(mHeadDim, lP8), hP8 * lP8})); + } else if (mConfig.mQuantKey) { mPastKey.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP})); } else { mPastKey.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), mHeadDim, hP})); @@ -278,15 +367,22 @@ void KVCacheManager::onAlloc(int kv_seq_len) { } else { mPastValue.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mHeadDim, hP), mMaxLength, hP})); } - mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC); - mBackend->onAcquireBuffer(mPastValue.get(), Backend::STATIC); - } - /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */ - if (mConfig.mQuantKey) { - mDequantKeyScale.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP})); - mDequantKeyZeroPoint.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP})); - mBackend->onAcquireBuffer(mDequantKeyScale.get(), Backend::STATIC); - mBackend->onAcquireBuffer(mDequantKeyZeroPoint.get(), Backend::STATIC); + mBackend->onAcquireBuffer(mPastKey.get(), Backend::STATIC); + mBackend->onAcquireBuffer(mPastValue.get(), Backend::STATIC); + } + // scale, zero point and sum of key for quantization + if (mConfig.mUseInt8Kernel) { + mKeyScale.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8})); + mKeyZeroPoint.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8})); + mKeySum.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8})); + mBackend->onAcquireBuffer(mKeyScale.get(), Backend::STATIC); + mBackend->onAcquireBuffer(mKeyZeroPoint.get(), Backend::STATIC); + mBackend->onAcquireBuffer(mKeySum.get(), Backend::STATIC); + } else if (mConfig.mQuantKey) { + mKeyScale.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), hP})); + mKeyZeroPoint.reset(Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), hP})); + mBackend->onAcquireBuffer(mKeyScale.get(), Backend::STATIC); + mBackend->onAcquireBuffer(mKeyZeroPoint.get(), Backend::STATIC); } } @@ -296,10 +392,19 @@ void KVCacheManager::onRealloc(int kv_seq_len) { } int oldMaxLength = mMaxLength; mMaxLength = kv_seq_len + mConfig.mExpandChunk; - size_t oldKeySize = 
(size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); - size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); - size_t keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); - size_t valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + size_t oldKeySize, oldValueSize, keySize, valueSize; + if (mConfig.mUseInt8Kernel) { + oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8; + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8; + } else if (mConfig.mQuantKey) { + oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP; + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP; + } else { + oldKeySize = (size_t)mKvNumHead * UP_DIV(oldMaxLength, hP) * mHeadDim * hP * mBytes; + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes; + } + oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * oldMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); /*==== No limit for kvcache ====*/ if (mConfig.mKVCacheSizeLimit == -1) { expandKVCacheInMem(oldMaxLength); @@ -318,51 +423,100 @@ void KVCacheManager::onRealloc(int kv_seq_len) { } /*==== Last time the kvcache is disk, now it should be in disk too ====*/ else { - expandKVCacheInDisk(oldMaxLength); + expandKVCacheInDisk(oldMaxLength, oldKeySize, oldValueSize, keySize, valueSize); } /* No matter where is the kvcache, the scales and zero points are always in memory, since their size is very small */ - if (mConfig.mQuantKey) { + if (mConfig.mUseInt8Kernel) { + auto new_scale = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8}); + auto new_zeroPoint = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8}); + auto new_sum = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP8), hP8}); + mBackend->onAcquireBuffer(new_scale, Backend::STATIC); + mBackend->onAcquireBuffer(new_zeroPoint, Backend::STATIC); + mBackend->onAcquireBuffer(new_sum, Backend::STATIC); + for (int h = 0; h < mKvNumHead; h++) { + memcpy(new_scale->host() + h * UP_DIV(mMaxLength, hP8) * hP8 * 4, mKeyScale->host() + h * UP_DIV(oldMaxLength, hP8) * hP8 * 4, UP_DIV(oldMaxLength, hP8) * hP8 * 4); + memcpy(new_zeroPoint->host() + h * UP_DIV(mMaxLength, hP8) * hP8 * 4, mKeyZeroPoint->host() + h * UP_DIV(oldMaxLength, hP8) * hP8 * 4, UP_DIV(oldMaxLength, hP8) * hP8 * 4); + memcpy(new_sum->host() + h * UP_DIV(mMaxLength, hP8) * hP8 * 4, mKeySum->host() + h * UP_DIV(oldMaxLength, hP8) * hP8 * 4, UP_DIV(oldMaxLength, hP8) * hP8 * 4); + } + mKeyScale.reset(new_scale); + mKeyZeroPoint.reset(new_zeroPoint); + mKeySum.reset(new_sum); + } else if (mConfig.mQuantKey) { auto new_scale = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}); auto new_zeroPoint = Tensor::createDevice({mKvNumHead, UP_DIV(mMaxLength, hP), 1, hP}); mBackend->onAcquireBuffer(new_scale, Backend::STATIC); mBackend->onAcquireBuffer(new_zeroPoint, Backend::STATIC); for (int h = 0; h < mKvNumHead; h++) { - memcpy(new_scale->host() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyScale->host() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes); - memcpy(new_zeroPoint->host() + h * 
UP_DIV(mMaxLength, hP) * hP * mBytes, mDequantKeyZeroPoint->host() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes); + memcpy(new_scale->host() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mKeyScale->host() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes); + memcpy(new_zeroPoint->host() + h * UP_DIV(mMaxLength, hP) * hP * mBytes, mKeyZeroPoint->host() + h * UP_DIV(oldMaxLength, hP) * hP * mBytes, UP_DIV(oldMaxLength, hP) * hP * mBytes); } - mDequantKeyScale.reset(new_scale); - mDequantKeyZeroPoint.reset(new_zeroPoint); + mKeyScale.reset(new_scale); + mKeyZeroPoint.reset(new_zeroPoint); } } void KVCacheManager::onClear() { if (mKVCacheInDisk) { - size_t oldKeySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); - size_t oldValueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); - unmapKVCache(oldKeySize, oldValueSize); + size_t keySize = 0, valueSize = 0; + if (mConfig.mUseInt8Kernel) { + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8; + } else if (mConfig.mQuantKey) { + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP; + } else { + keySize = (size_t)mKvNumHead * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes; + } + valueSize = (size_t)mKvNumHead * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + unmapKVCache(keySize, valueSize); removeKVCacheFile(); mKVCacheInDisk = false; } - else { - mPastKey.reset(); - mPastValue.reset(); - } + mPastKey.reset(); + mPastValue.reset(); + mKeyScale.reset(); + mKeyZeroPoint.reset(); + mKeySum.reset(); mMaxLength = mPastLength = 0; } template -static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, - int hP, int kv_h, bool quantKey, char* scale, char* zero_point, const MNN::CoreFunctions * core) { - if (quantKey) { - int8_t * key_dst = reinterpret_cast(pack_key); - T * scale_dst = reinterpret_cast(scale); - T * zeroPoint_dst = reinterpret_cast(zero_point); +void KVCacheManager::pack_key(const Tensor* key, int seq_len, int kv_h) { + if (mConfig.mUseInt8Kernel) { // [maxlen/hP8, headdim/lP8, hP8, lP8] + int8_t * key_dst = reinterpret_cast(addrOfKey(kv_h)); + float * scale_dst = reinterpret_cast(addrOfScale(kv_h)); + float * zeroPoint_dst = reinterpret_cast(addrOfZeroPoint(kv_h)); + float * sum_dst = reinterpret_cast(addrOfKeySum(kv_h)); + for (int s = 0; s < seq_len; s++) { + T * key_src = key->host() + s * mKvNumHead * mHeadDim + kv_h * mHeadDim; + float minKey = key_src[0]; + float maxKey = key_src[0]; + float sumKey = key_src[0]; + for (int d = 1; d < mHeadDim; d++) { + minKey = ALIMIN(minKey, key_src[d]); + maxKey = ALIMAX(maxKey, key_src[d]); + sumKey += key_src[d]; + } + int out_index = (mPastLength + s) / hP8; + int in_index = (mPastLength + s) % hP8; + scale_dst[out_index * hP8 + in_index] = (maxKey - minKey) / 255.0f; + zeroPoint_dst[out_index * hP8 + in_index] = -255.0f * minKey / (maxKey - minKey) - 128.0; + sum_dst[out_index * hP8 + in_index] = sumKey; + for (int d = 0; d < mHeadDim; d++) { + int i = d / lP8; + int j = d % lP8; + key_dst[out_index * UP_DIV(mHeadDim, lP8) * hP8 * lP8 + i * hP8 * lP8 + in_index * lP8 + j] = roundf((key_src[d] - minKey) / (maxKey - minKey) * 255.0f - 128.0f); + } + } + } + else if (mConfig.mQuantKey) { // [maxlen/hP, headdim, hP] + int8_t * key_dst = reinterpret_cast(addrOfKey(kv_h)); + T * 
scale_dst = reinterpret_cast(addrOfScale(kv_h)); + T * zeroPoint_dst = reinterpret_cast(addrOfZeroPoint(kv_h)); for (int i = 0; i < seq_len; i++) { T * key_src = key->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; int out_index = (mPastLength + i) / hP; int in_index = (mPastLength + i) % hP; T minKey, maxKey; - core->MNNCountMaxMinValue((float*)key_src, (float*)&minKey, (float*)&maxKey, mHeadDim); + static_cast(mBackend)->functions()->MNNCountMaxMinValue((float*)key_src, (float*)&minKey, (float*)&maxKey, mHeadDim); scale_dst[out_index * hP + in_index] = (maxKey - minKey) / 255.0f; zeroPoint_dst[out_index * hP + in_index] = 128.0f * (maxKey - minKey) / 255.0f + minKey; for (int j = 0; j < mHeadDim; j++) { @@ -370,8 +524,8 @@ static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq } } } - else { - T * key_dst = reinterpret_cast(pack_key); + else { // [maxlen/hP, headdim, hP] + T * key_dst = reinterpret_cast(addrOfKey(kv_h)); for (int i = 0; i < seq_len; i++) { T * key_src = key->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; int out_index = (mPastLength + i) / hP; @@ -384,16 +538,16 @@ static void pack_key(const Tensor* key, char* pack_key, int mPastLength, int seq } template -static void pack_value(const Tensor* value, char* pack_value, int mMaxLength, int mPastLength, int seq_len, int mKvNumHead, int mHeadDim, int hP, int kv_h, bool quantValue, const MNN::CoreFunctions * core) { - if (quantValue) { - fp8_t * value_dst = reinterpret_cast(pack_value); +void KVCacheManager::pack_value(const Tensor* value, int seq_len, int kv_h) { // [headdim/hP, maxlen, hP] + if (mConfig.mQuantValue) { + fp8_t * value_dst = reinterpret_cast(addrOfValue(kv_h)); uint8_t * buf = (uint8_t *)MNNMemoryAllocAlign(mHeadDim, MNN_MEMORY_ALIGN_DEFAULT); for (int i = 0; i < seq_len; i++) { T * value_src = value->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; if (sizeof(T) == 2) { - core->MNNFp16ToFp8(buf, (uint16_t*)value_src, mHeadDim); + static_cast(mBackend)->functions()->MNNFp16ToFp8(buf, (uint16_t*)value_src, mHeadDim); } else { - core->MNNFp32ToFp8(buf, (float*)value_src, mHeadDim); + static_cast(mBackend)->functions()->MNNFp32ToFp8(buf, (float*)value_src, mHeadDim); } for (int j = 0; j < mHeadDim; j++) { int out_index = j / hP; @@ -404,7 +558,7 @@ static void pack_value(const Tensor* value, char* pack_value, int mMaxLength, in MNNMemoryFreeAlign(buf); } else { - T * value_dst = reinterpret_cast(pack_value); + T * value_dst = reinterpret_cast(addrOfValue(kv_h)); for (int i = 0; i < seq_len; i++) { T * value_src = value->host() + i * mKvNumHead * mHeadDim + kv_h * mHeadDim; for (int j = 0; j < mHeadDim; j++) { @@ -423,11 +577,11 @@ void KVCacheManager::onPushBack(const Tensor * key, const Tensor * value) { std::function packKV = [=](int tid) { for (int kv_h = tid * tileCount; kv_h < (tid+1) * tileCount && kv_h < mKvNumHead; kv_h++) { if (mBytes == 2) { - pack_key(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core); - pack_value(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantValue, core); + pack_key(key, seq_len, kv_h); + pack_value(value, seq_len, kv_h); } else { - pack_key(key, addrOfKey(kv_h), mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, mConfig.mQuantKey, addrOfScale(kv_h), addrOfZeroPoint(kv_h), core); - pack_value(value, addrOfValue(kv_h), mMaxLength, mPastLength, seq_len, mKvNumHead, mHeadDim, hP, kv_h, 
mConfig.mQuantValue, core); + pack_key(key, seq_len, kv_h); + pack_value(value, seq_len, kv_h); } } }; diff --git a/source/backend/cpu/KVCacheManager.hpp b/source/backend/cpu/KVCacheManager.hpp index 582481990..c34e25c82 100644 --- a/source/backend/cpu/KVCacheManager.hpp +++ b/source/backend/cpu/KVCacheManager.hpp @@ -29,8 +29,9 @@ namespace MNN { class KVCacheManager : public NonCopyable{ public: struct KVCacheConfig { - bool mQuantKey = false; // Quantize keys to int8 or not - bool mQuantValue = false; // Quantize values to fp8 or not + bool mQuantKey = false; // Quantize keys to int8 or not + bool mQuantValue = false; // Quantize values to fp8 or not + bool mUseInt8Kernel = false; // Whether to use int8 gemm kernel in CPU attention std::string mKVCacheDir = "/tmp"; // Path of the kvcache files in disk size_t mKVCacheSizeLimit = -1; // The limit of the kvcache size int mExpandChunk = 64; // Number of expand chunks when the buffer is full @@ -38,10 +39,11 @@ class KVCacheManager : public NonCopyable{ private: Backend * mBackend; KVCacheConfig mConfig; - std::shared_ptr mPastKey; // numhead, [maxlen/eP, headdim, eP] - std::shared_ptr mPastValue; // numhead, [headdim/eP, maxlen, eP] - std::shared_ptr mDequantKeyScale; // numhead, [maxlen/eP, 1, eP] - std::shared_ptr mDequantKeyZeroPoint; // numhead, [maxlen/eP, 1, eP] + std::shared_ptr mPastKey; // {numhead, [maxlen/hP, headdim, hP]} or {numhead, [maxlen/hP8, headdim/lP8, hP8, lP8]} + std::shared_ptr mPastValue; // numhead, [headdim/hP, maxlen, hP] + std::shared_ptr mKeyScale; // {numhead, [maxlen/hP, hP]} or {numhead, [maxlen/hP8, hP8]} + std::shared_ptr mKeyZeroPoint; // {numhead, [maxlen/hP, hP]} or {numhead, [maxlen/hP8, hP8]} + std::shared_ptr mKeySum; // numhead, [maxlen/hP8, hP8] file_t mKeyCacheFD = INVALID_FILE; // The file descriptor of keys file_t mValueCacheFD = INVALID_FILE; // The file descriptor of values char * mMapKeyAddr = nullptr; // Memory-mapped address of keys @@ -49,8 +51,10 @@ class KVCacheManager : public NonCopyable{ bool mKVCacheInDisk = false; // Whether the kvcache is in disk or in memory now int mPastLength = 0; // Length of past kvcache int mMaxLength = 0; // Capacity of current kvcache buffer (how many kv items can be stored at most) - int eP, lP, hP, mBytes, mThreadNum; - int mKvNumHead = 0, mHeadDim = 0; + int eP, lP, hP; // Packing mode for float matmul + int eP8, lP8, hP8; // Packing mode for int8 gemm kernel + int mBytes = 4, mThreadNum = 1; + int mKvNumHead = 0, mHeadDim = 0; void createKVCacheFile(); void removeKVCacheFile(); void resetKVCacheFileSize(size_t keySize, size_t valueSize); @@ -58,7 +62,9 @@ class KVCacheManager : public NonCopyable{ void unmapKVCache(size_t keySize, size_t valueSize); void expandKVCacheInMem(int oldMaxLength); void moveKVCacheFromMemToDisk(int oldMaxLength); - void expandKVCacheInDisk(int oldMaxLength); + void expandKVCacheInDisk(int oldMaxLength, int oldKeySize, int oldValueSize, int keySize, int valueSize); + template void pack_key(const Tensor* key, int seq_len, int kv_h); + template void pack_value(const Tensor* value, int seq_len, int kv_h); public: KVCacheManager(Backend * backend, KVCacheConfig & kvConfig) { mBackend = backend; @@ -80,10 +86,13 @@ class KVCacheManager : public NonCopyable{ return mPastValue.get(); } const Tensor * scale() { - return mDequantKeyScale.get(); + return mKeyScale.get(); } const Tensor * zeroPoint() { - return mDequantKeyZeroPoint.get(); + return mKeyZeroPoint.get(); + } + const Tensor * keySum() { + return mKeySum.get(); } bool inDisk() 
{ return mKVCacheInDisk; @@ -96,23 +105,46 @@ class KVCacheManager : public NonCopyable{ } char * addrOfKey(int kv_h) { char * baseAddr = mKVCacheInDisk ? mMapKeyAddr : mPastKey->host(); - return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * (mConfig.mQuantKey ? 1 : mBytes); + if (mConfig.mUseInt8Kernel) { + return baseAddr + kv_h * UP_DIV(mMaxLength, hP8) * UP_DIV(mHeadDim, lP8) * hP8 * lP8; + } else if (mConfig.mQuantKey) { + return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP; + } else { + return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * mHeadDim * hP * mBytes; + } } char * addrOfValue(int kv_h) { char * baseAddr = mKVCacheInDisk ? mMapValueAddr : mPastValue->host(); - return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP * (mConfig.mQuantValue ? 1 : mBytes); + if (mConfig.mQuantValue) { + return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP; + } else { + return baseAddr + kv_h * UP_DIV(mHeadDim, hP) * mMaxLength * hP * mBytes; + } } char * addrOfScale(int kv_h) { - if (mConfig.mQuantKey == false) + if (mConfig.mUseInt8Kernel) { + return mKeyScale->host() + kv_h * UP_DIV(mMaxLength, hP8) * hP8 * 4; + } else if (mConfig.mQuantKey) { + return mKeyScale->host() + kv_h * UP_DIV(mMaxLength, hP) * hP * mBytes; + } else { return nullptr; - char * baseAddr = mDequantKeyScale->host(); - return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes; + } } char * addrOfZeroPoint(int kv_h) { - if (mConfig.mQuantKey == false) + if (mConfig.mUseInt8Kernel) { + return mKeyZeroPoint->host() + kv_h * UP_DIV(mMaxLength, hP8) * hP8 * 4; + } else if (mConfig.mQuantKey) { + return mKeyZeroPoint->host() + kv_h * UP_DIV(mMaxLength, hP) * hP * mBytes; + } else { + return nullptr; + } + } + char * addrOfKeySum(int kv_h) { + if (mConfig.mUseInt8Kernel) { + return mKeySum->host() + kv_h * UP_DIV(mMaxLength, hP8) * hP8 * 4; + }else { return nullptr; - char * baseAddr = mDequantKeyZeroPoint->host(); - return baseAddr + kv_h * UP_DIV(mMaxLength, hP) * 1 * hP * mBytes; + } } void onResize(int kv_num_head, int head_dim); void onAlloc(int kv_seq_len); diff --git a/source/backend/cpu/arm/CMakeLists.txt b/source/backend/cpu/arm/CMakeLists.txt index d23e5adb4..d8d06136c 100644 --- a/source/backend/cpu/arm/CMakeLists.txt +++ b/source/backend/cpu/arm/CMakeLists.txt @@ -15,6 +15,10 @@ if (MNN_LOW_MEMORY) FILE(GLOB MNN_AArch64_SRC ${MNN_AArch64_SRC} ${CMAKE_CURRENT_LIST_DIR}/arm64/low_memory/*.[sS]) endif() +if (MNN_CPU_WEIGHT_DEQUANT_GEMM) + FILE(GLOB MNN_AArch64_SRC ${MNN_AArch64_SRC} ${CMAKE_CURRENT_LIST_DIR}/arm64/normal_memory/*.[sS]) +endif() + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") message(STATUS "Enabling AArch32 Assemblies") add_library(MNNARM32 OBJECT ${MNN_AArch32_SRC} ${MNN_NEON_SRC}) diff --git a/source/backend/cpu/arm/arm32/MNNBGRAToBGRC8.S b/source/backend/cpu/arm/arm32/MNNBGRAToBGRC8.S new file mode 100644 index 000000000..74f47c637 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBGRAToBGRC8.S @@ -0,0 +1,33 @@ +// +// MNNBGRAToBGRC8.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBGRAToBGRC8 +// void MNNBGRAToBGRC8(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +L1: +vld4.8 {d0, d1, d2, d3}, [r0]! +vst3.8 {d0, d1, d2}, [r1]! 
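The hunks above change how key storage is sized, addressed, and quantized. As a rough scalar reference (not taken verbatim from the kernels), the per-head key size under the three storage modes and the asymmetric int8 row quantization used by the mUseInt8Kernel path of pack_key can be sketched as below, assuming UP_DIV(a, b) == (a + b - 1) / b and leaving out the (hP8, lP8) tile interleaving; the helper names are illustrative only.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

static inline int upDiv(int a, int b) { return (a + b - 1) / b; }   // same rounding as UP_DIV

// Bytes occupied by one kv head's key block under the three storage modes above.
static size_t keyBytesPerHead(bool useInt8Kernel, bool quantKey, int maxLength,
                              int headDim, int hP, int hP8, int lP8, int bytes) {
    if (useInt8Kernel) {   // [maxlen/hP8, headdim/lP8, hP8, lP8], one byte per element
        return (size_t)upDiv(maxLength, hP8) * upDiv(headDim, lP8) * hP8 * lP8;
    }
    if (quantKey) {        // [maxlen/hP, headdim, hP], one byte per element
        return (size_t)upDiv(maxLength, hP) * headDim * hP;
    }
    return (size_t)upDiv(maxLength, hP) * headDim * hP * bytes;     // fp16/fp32 elements
}

// Asymmetric quantization of one key row for the int8 GEMM path: [min, max] is mapped
// onto [-128, 127], and the raw row sum is stored alongside scale/zero point (the sum
// is presumably used to undo the zero point later). Tile packing is omitted here,
// so dst is written contiguously.
static void quantizeKeyRow(const float* src, int headDim, int8_t* dst,
                           float* scale, float* zeroPoint, float* sum) {
    float minV = src[0], maxV = src[0], rowSum = 0.0f;
    for (int d = 0; d < headDim; ++d) {
        minV = std::min(minV, src[d]);
        maxV = std::max(maxV, src[d]);
        rowSum += src[d];
    }
    float range = std::max(maxV - minV, 1e-6f);  // guard added for this sketch only
    *scale     = range / 255.0f;
    *zeroPoint = -255.0f * minV / range - 128.0f;
    *sum       = rowSum;
    for (int d = 0; d < headDim; ++d) {
        float q = std::round((src[d] - minV) / range * 255.0f - 128.0f);
        dst[d] = (int8_t)std::max(-128.0f, std::min(127.0f, q));
    }
}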
+subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNBGRAToGRAYFast.S b/source/backend/cpu/arm/arm32/MNNBGRAToGRAYFast.S new file mode 100644 index 000000000..30d53059c --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBGRAToGRAYFast.S @@ -0,0 +1,43 @@ +// +// MNNBGRAToGRAYFast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBGRAToGRAYFast +// void MNNBGRAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +vmov.i8 d4, #7 +vmov.i8 d5, #38 +vmov.i8 d6, #19 + +L1: +vld4.8 {d0, d1, d2, d3}, [r0]! +vmull.u16 q4, d0, d4 // b*7 +vmlal.u16 q4, d1, d5 // g*38 +vmlal.u16 q4, d2, d6 // r*19 + +vqshrn.u16 d8, q4, #6 +vst1.u8 {d8}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNBGRToBGR555Fast.S b/source/backend/cpu/arm/arm32/MNNBGRToBGR555Fast.S new file mode 100644 index 000000000..c2c48546c --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBGRToBGR555Fast.S @@ -0,0 +1,46 @@ +// +// MNNBGRToBGR555Fast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBGRToBGR555Fast +// void MNNBGRToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +vmov.s8 q15, #8 +vneg.s8 q15, q15 + +L1: +vld3.8 {d0, d1, d2}, [r0]! +vand.u8 d2, d2, d30 // r & ~7 +vand.u8 d1, d1, d30 // g & ~7 +vshr.u8 d0, d0, #3 // b >> 3 +vshll.u8 q2, d2, #7 +vshll.u8 q3, d1, #2 +vmovl.u8 q8, d0 +vorr.u8 q2, q2, q3 +vorr.u8 q2, q2, q8 + +vst1.16 {q2}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNBGRToBGR565Fast.S b/source/backend/cpu/arm/arm32/MNNBGRToBGR565Fast.S new file mode 100644 index 000000000..db21624b7 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBGRToBGR565Fast.S @@ -0,0 +1,51 @@ +// +// MNNBGRToBGR565Fast.S +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBGRToBGR565Fast +// void MNNBGRToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + + +push {lr} +vmov.s8 q15, #8 +vneg.s8 q15, q15 +vmov.s8 q14, #4 +vneg.s8 q14, q14 + +L1: +vld3.8 {d0, d1, d2}, [r0]! // b, g, r + +vand.u8 d2, d2, d30 // r & ~7 +vand.u8 d1, d1, d28 // g & ~3 +vshr.u8 d0, d0, #3 // b >> 3 + +vshll.u8 q2, d2, #7 +vshl.u8 q2, q2, #1 +vshll.u8 q3, d1, #3 +vmovl.u8 q8, d0 + +vorr.u8 q2, q2, q3 +vorr.u8 q2, q2, q8 + +vst1.16 {q2}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNBGRToGRAYFast.S b/source/backend/cpu/arm/arm32/MNNBGRToGRAYFast.S new file mode 100644 index 000000000..0fb87d2a9 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNBGRToGRAYFast.S @@ -0,0 +1,46 @@ +// +// MNNBGRToGRAYFast.S +// +// Created by MNN on 2024/08/28. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNBGRToGRAYFast +// void MNNBGRToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +// b*7 +// g*38 +// r*19 + +push {lr} + +vmov.i8 d4, #7 +vmov.i8 d5, #38 +vmov.i8 d6, #19 + +L1: +vld3.8 {d0, d1, d2}, [r0]! // b,g,r +vmull.u8 q8, d0, d4 +vmlal.u8 q8, d1, d5 +vmlal.u8 q8, d2, d6 + +vqshrn.u16 d16, q8, #6 +vst1.8 {d16}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNC3ToC4Fast.S b/source/backend/cpu/arm/arm32/MNNC3ToC4Fast.S new file mode 100644 index 000000000..ff9e20724 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNC3ToC4Fast.S @@ -0,0 +1,34 @@ +// +// MNNC3ToC4Fast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNC3ToC4Fast +// void MNNC3ToC4Fast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +vmov.i8 d3, #255 +L1: +vld3.8 {d0, d1, d2}, [r0]! +vst4.u8 {d0, d1, d2, d3}, [r1]! +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNC3ToXYZFast.S b/source/backend/cpu/arm/arm32/MNNC3ToXYZFast.S new file mode 100644 index 000000000..08e6a6c53 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNC3ToXYZFast.S @@ -0,0 +1,95 @@ +// +// MNNC3ToXYZFast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNC3ToXYZFast +// void MNNC3ToXYZFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c); +// Auto Load: r0: source, r1: dest, r2: count, r3: c + +push {lr} +vpush {q4-q7} + +// q4-q6, const +vld1.32 {d8[0]}, [r3]! // C0 +vld1.32 {d8[1]}, [r3]! // C1 +vld1.32 {d9[0]}, [r3]! // C2 +vld1.32 {d9[1]}, [r3]! // C3 +vld1.32 {d10[0]}, [r3]! // C4 +vld1.32 {d10[1]}, [r3]! // C5 +vld1.32 {d11[0]}, [r3]! // C6 +vld1.32 {d11[1]}, [r3]! // C7 +vld1.32 {d12[0]}, [r3]! // C8 + +vmov.u16 q15, #128 + +L1: +vld3.8 {d0, d1, d2}, [r0]! +vmovl.u8 q2, d0 // r: uint8_t -> uint16_t +vmovl.u8 q3, d1 +vmovl.u8 q13, d2 + +vmovl.u16 q7, d4 // r +vmovl.u16 q8, d5 // r +vmovl.u16 q9, d6 // g +vmovl.u16 q10, d7 // g +vmovl.u16 q11, d26 // b +vmovl.u16 q12, d27 // b + +// r*C0, g*C1, b*C2 +vmul.s32 q0, q7, d8[0] +vmul.s32 q1, q8, d8[0] +vmla.s32 q0, q9, d8[1] +vmla.s32 q1, q10, d8[1] +vmla.s32 q0, q11, d9[0] +vmla.s32 q1, q12, d9[0] + +// r*C3, g*C4, b*C5 +vmul.s32 q2, q7, d9[1] +vmul.s32 q3, q8, d9[1] +vmla.s32 q2, q9, d10[0] +vmla.s32 q3, q10, d10[0] +vmla.s32 q2, q11, d10[1] +vmla.s32 q3, q12, d10[1] + +// r*C6, g*C7, b*C8 +vmul.s32 q13, q7, d11[0] +vmul.s32 q14, q8, d11[0] +vmla.s32 q13, q9, d11[1] +vmla.s32 q14, q10, d11[1] +vmla.s32 q13, q11, d12[0] +vmla.s32 q14, q12, d12[0] + +vrshrn.u32 d0, q0, #12 +vrshrn.u32 d1, q1, #12 +vrshrn.u32 d2, q2, #12 +vrshrn.u32 d3, q3, #12 +vrshrn.u32 d4, q13, #12 +vrshrn.u32 d5, q14, #12 + +vqmovn.u16 d0, q0 +vqmovn.u16 d1, q1 +vqmovn.u16 d2, q2 + +vst3.8 {d0, d1, d2}, [r1]! 
+ +subs r2, r2, #1 +bne L1 + +End: +vpop {q4-q7} +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNC3ToYUVFast.S b/source/backend/cpu/arm/arm32/MNNC3ToYUVFast.S new file mode 100644 index 000000000..fb37aea9b --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNC3ToYUVFast.S @@ -0,0 +1,98 @@ +// +// MNNC3ToYUVFast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNC3ToYUVFast +// void MNNC3ToYUVFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c); +// Auto Load: r0: source, r1: dest, r2: count, r3: c + +push {lr} +vpush {q4-q7} + +// q4-q6, const +vld1.32 {d8[0]}, [r3]! // C0 +vld1.32 {d8[1]}, [r3]! // C1 +vld1.32 {d9[0]}, [r3]! // C2 +vld1.32 {d9[1]}, [r3]! // C3 +vld1.32 {d10[0]}, [r3]! // C4 +vld1.32 {d10[1]}, [r3]! // C5 +vld1.32 {d11[0]}, [r3]! // C6 +vld1.32 {d11[1]}, [r3]! // C7 +vld1.32 {d12[0]}, [r3]! // C8 + +vmov.u16 q15, #128 + +L1: +vld3.8 {d0, d1, d2}, [r0]! +vmovl.u8 q2, d0 // r: uint8_t -> uint16_t +vmovl.u8 q3, d1 +vmovl.u8 q13, d2 + +vmovl.u16 q7, d4 // r +vmovl.u16 q8, d5 // r +vmovl.u16 q9, d6 // g +vmovl.u16 q10, d7 // g +vmovl.u16 q11, d26 // b +vmovl.u16 q12, d27 // b + +// r*C0, g*C1, b*C2 +vmul.s32 q0, q7, d8[0] +vmul.s32 q1, q8, d8[0] +vmla.s32 q0, q9, d8[1] +vmla.s32 q1, q10, d8[1] +vmla.s32 q0, q11, d9[0] +vmla.s32 q1, q12, d9[0] + +// r*C3, g*C4, b*C5 +vmul.s32 q2, q7, d9[1] +vmul.s32 q3, q8, d9[1] +vmla.s32 q2, q9, d10[0] +vmla.s32 q3, q10, d10[0] +vmla.s32 q2, q11, d10[1] +vmla.s32 q3, q12, d10[1] + +// r*C6, g*C7, b*C8 +vmul.s32 q13, q7, d11[0] +vmul.s32 q14, q8, d11[0] +vmla.s32 q13, q9, d11[1] +vmla.s32 q14, q10, d11[1] +vmla.s32 q13, q11, d12[0] +vmla.s32 q14, q12, d12[0] + +vrshrn.u32 d0, q0, #14 +vrshrn.u32 d1, q1, #14 +vrshrn.u32 d2, q2, #14 +vrshrn.u32 d3, q3, #14 +vrshrn.u32 d4, q13, #14 +vrshrn.u32 d5, q14, #14 + +vadd.u16 q1, q1, q15 +vadd.u16 q2, q2, q15 + +vqmovn.u16 d0, q0 +vqmovn.u16 d1, q1 +vqmovn.u16 d2, q2 + +vst3.8 {d0, d1, d2}, [r1]! 
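The C3ToXYZ and C3ToYUV kernels in this patch apply the same integer 3x3 transform and differ only in the fixed-point shift (12 versus 14 bits) and in the +128 offset added to the second and third output channels. A scalar reference for that arithmetic is sketched below; the names are illustrative, the count here is per pixel rather than per block of 8 pixels, and non-negative rounded sums are assumed (the vector code narrows through unsigned saturation).

#include <algorithm>
#include <cstddef>
#include <cstdint>

static inline uint8_t saturateU8(int32_t v) {
    return (uint8_t)std::min<int32_t>(255, std::max<int32_t>(0, v));
}

// 3x3 fixed-point color transform: out = sat_u8(((c * in + half) >> shift) + offset).
// shift = 12 with offset = {0, 0, 0} mirrors the XYZ kernel; shift = 14 with
// offset = {0, 128, 128} mirrors the YUV kernel.
static void c3Transform(const uint8_t* src, uint8_t* dst, size_t pixelCount,
                        const int32_t c[9], int shift, const int32_t offset[3]) {
    const int32_t half = 1 << (shift - 1);   // rounding term applied by vrshrn/uqrshrn
    for (size_t i = 0; i < pixelCount; ++i) {
        int32_t r = src[3 * i + 0], g = src[3 * i + 1], b = src[3 * i + 2];
        for (int o = 0; o < 3; ++o) {
            int32_t acc = c[3 * o + 0] * r + c[3 * o + 1] * g + c[3 * o + 2] * b;
            dst[3 * i + o] = saturateU8(((acc + half) >> shift) + offset[o]);
        }
    }
}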
+ +subs r2, r2, #1 +bne L1 + +End: +vpop {q4-q7} +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNFloat2Int8.S b/source/backend/cpu/arm/arm32/MNNFloat2Int8.S index b8702173c..07446d42e 100644 --- a/source/backend/cpu/arm/arm32/MNNFloat2Int8.S +++ b/source/backend/cpu/arm/arm32/MNNFloat2Int8.S @@ -22,26 +22,49 @@ vcvt.s32.f32 \x, q13 .endm asm_function MNNFloat2Int8 -//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax, ssize_t zeroPoint); -//r0:src, r1:dst, r2:sizeQuad, r3:scale, r4:aMin, r5:aMax, r6:zeroPoint - +//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, ssize_t aMin, ssize_t aMax, float* zeroPoint, ssize_t quanParamVec); +// Auto load: r0:src, r1:dst, r2:sizeQuad, r3:scale +// Load from sp: aMin, aMax, lr: zeroPoint, r12: quanParamVec push {lr} vmov.f32 q10, #0.5 vmov.f32 q11, #-0.5 - -ldr r12, [sp, #4] -vld1.32 {q15}, [r3] +vmov.s32 q1, #1 +// scale +vld1.32 {d30[0]}, [r3] +vdup.32 q15, d30[0] // min +ldr r12, [sp, #4] vdup.s8 d28, r12 // max ldr r12, [sp, #8] vdup.s8 d29, r12 // zeropoint -ldr r12, [sp, #12] -vdup.s32 q9, r12 -vcvt.f32.s32 q9, q9 - +ldr lr, [sp, #12] +vld1.32 {d18[0]}, [lr] +vdup.32 q9, d18[0] + +// quanParamVec +ldr r12, [sp, #16] +cmp r12, #3 +bne LOAD_VEC_ZERO +vld1.f32 {q9}, [lr] // load vector zero +vld1.f32 {q15}, [r3] // load vector scale +b COMPUTE + +LOAD_VEC_ZERO: +cmp r12, #2 +bne LOAD_VEC_SCALE +vld1.f32 {q9}, [lr] // load vector zero +b COMPUTE + +LOAD_VEC_SCALE: +cmp r12, #1 +bne COMPUTE +vld1.f32 {q15}, [r3] // load vector scale + + +COMPUTE: cmp r2, #3 ble FL1 diff --git a/source/backend/cpu/arm/arm32/MNNGRAYToC3Fast.S b/source/backend/cpu/arm/arm32/MNNGRAYToC3Fast.S new file mode 100644 index 000000000..401b0b009 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNGRAYToC3Fast.S @@ -0,0 +1,35 @@ +// +// MNNGRAYToC3Fast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNGRAYToC3Fast +// void MNNGRAYToC3Fast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +L1: +vld1.8 {d0}, [r0]! +vmov d1, d0 +vmov d2, d0 +vst3.u8 {d0, d1, d2}, [r1]! +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNGRAYToC4Fast.S b/source/backend/cpu/arm/arm32/MNNGRAYToC4Fast.S new file mode 100644 index 000000000..aaca8f5b1 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNGRAYToC4Fast.S @@ -0,0 +1,36 @@ +// +// MNNGRAYToC4Fast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNGRAYToC4Fast +// void MNNGRAYToC4Fast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +vmov.i8 d3, #255 +L1: +vld1.8 {d0}, [r0]! +vmov d1, d0 +vmov d2, d0 +vst4.u8 {d0, d1, d2, d3}, [r1]! 
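The reworked MNNFloat2Int8 above now takes a float* zeroPoint and a quanParamVec argument; judging from the LOAD_VEC_SCALE and LOAD_VEC_ZERO branches, quanParamVec acts as a bit mask where bit 0 selects a per-lane scale vector and bit 1 a per-lane zero-point vector, with only the first element used otherwise. A scalar sketch of that contract follows; the function name and the exact rounding order are assumptions, not taken from the kernel.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Sketch: quantize sizeQuad groups of 4 floats to int8 with optional per-lane
// scale/zero point, clamped to [aMin, aMax]. Rounding after adding the zero point
// is an assumption made for this sketch.
static void float2Int8Ref(const float* src, int8_t* dst, size_t sizeQuad,
                          const float* scale, int aMin, int aMax,
                          const float* zeroPoint, int quanParamVec) {
    for (size_t i = 0; i < sizeQuad; ++i) {
        for (int j = 0; j < 4; ++j) {
            float s = (quanParamVec & 1) ? scale[j] : scale[0];
            float z = (quanParamVec & 2) ? zeroPoint[j] : zeroPoint[0];
            int v = (int)std::round(src[4 * i + j] * s + z);
            dst[4 * i + j] = (int8_t)std::min(aMax, std::max(aMin, v));
        }
    }
}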
+subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S index 72ff71423..8b62af530 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -51,7 +51,7 @@ ldr r8, [r6, #0] ldr lr, [r6, #4] vpush {q4-q7} - +sub sp, sp, #36 ldr r7, [r6, #16] // r7: useInt8 @@ -418,6 +418,7 @@ L1LoopCheck: bne L1LoopDz End: +add sp, sp, #36 vpop {q4-q7} pop {r4-r8, r10, pc} diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S index 25c9e5359..8d9d0ef63 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S @@ -42,6 +42,7 @@ ldr r8, [r6, #0] ldr lr, [r6, #4] vpush {q4-q7} +sub sp, sp, #36 // Only int8 output use this kernel. @@ -301,6 +302,7 @@ L1LoopCheck: bne L1LoopDz End: +add sp, sp, #36 vpop {q4-q7} pop {r4-r8, r10, pc} diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index f7988025b..0e3966b9e 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -51,7 +51,7 @@ ldr r8, [r6, #0] ldr lr, [r6, #4] vpush {q4-q7} - +sub sp, sp, #36 // Branch1: input is int8_t, output is float32, DO NOT USE "scale". // Branch2: input is int8_t, output is float32. USE "scale", DO NOT USE "minValue" and "maxValue". // Branch3: input is int8_t, output is int8_t. USE "scale", "minValue" and "maxValue". @@ -398,6 +398,7 @@ L1LoopCheck: bne L1LoopDz End: +add sp, sp, #36 vpop {q4-q7} pop {r4-r8, r10, pc} diff --git a/source/backend/cpu/arm/arm32/MNNRGBAToBGRAFast.S b/source/backend/cpu/arm/arm32/MNNRGBAToBGRAFast.S new file mode 100644 index 000000000..5eb583031 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNRGBAToBGRAFast.S @@ -0,0 +1,38 @@ +// +// MNNRGBAToBGRAFast.S +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNRGBAToBGRAFast +// void MNNRGBAToBGRAFast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +L1: +vld4.8 {d0, d1, d2, d3}, [r0]! // r,g,b,a + +// swap d0,d2 +vmov.32 d4, d2 +vmov.32 d2, d0 +vmov.32 d0, d4 +vst4.8 {d0, d1, d2, d3}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNRGBAToBGRFast.S b/source/backend/cpu/arm/arm32/MNNRGBAToBGRFast.S new file mode 100644 index 000000000..5a709f900 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNRGBAToBGRFast.S @@ -0,0 +1,38 @@ +// +// MNNRGBAToBGRFast.S +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNRGBAToBGRFast +// void MNNRGBAToBGRFast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +L1: +vld4.8 {d0, d1, d2, d3}, [r0]! 
// r,g,b,a + +// swap d0,d2 +vmov.32 d4, d2 +vmov.32 d2, d0 +vmov.32 d0, d4 +vst3.8 {d0, d1, d2}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNRGBAToGRAYFast.S b/source/backend/cpu/arm/arm32/MNNRGBAToGRAYFast.S new file mode 100644 index 000000000..d54f02a59 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNRGBAToGRAYFast.S @@ -0,0 +1,43 @@ +// +// MNNRGBAToGRAYFast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNRGBAToGRAYFast +// void MNNRGBAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +vmov.i8 d4, #7 +vmov.i8 d5, #38 +vmov.i8 d6, #19 + +L1: +vld4.8 {d0, d1, d2, d3}, [r0]! +vmull.u8 q8, d2, d4 // b*7 +vmlal.u8 q8, d1, d5 // g*38 +vmlal.u8 q8, d0, d6 // r*19 + +vqshrn.u16 d16, q8, #6 +vst1.8 {d16}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNRGBToBGR555Fast.S b/source/backend/cpu/arm/arm32/MNNRGBToBGR555Fast.S new file mode 100644 index 000000000..ce328ea1d --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNRGBToBGR555Fast.S @@ -0,0 +1,46 @@ +// +// MNNRGBToBGR555Fast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNRGBToBGR555Fast +// void MNNRGBToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +vmov.s8 q15, #8 +vneg.s8 q15, q15 + +L1: +vld3.8 {d0, d1, d2}, [r0]! +vand.u8 d0, d0, d30 // r & ~7 +vand.u8 d1, d1, d30 // g & ~7 +vshr.u8 d2, d2, #3 // b >> 3 +vshll.u8 q2, d0, #7 +vshll.u8 q3, d1, #2 +vmovl.u8 q8, d2 +vorr.u8 q2, q2, q3 +vorr.u8 q2, q2, q8 + +vst1.16 {q2}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNRGBToBGR565Fast.S b/source/backend/cpu/arm/arm32/MNNRGBToBGR565Fast.S new file mode 100644 index 000000000..2cc804876 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNRGBToBGR565Fast.S @@ -0,0 +1,54 @@ +// +// MNNRGBToBGR565Fast.S +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNRGBToBGR565Fast +// void MNNRGBToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +// b*7 +// g*38 +// r*19 + +push {lr} +vmov.s8 q15, #8 +vneg.s8 q15, q15 +vmov.s8 q14, #4 +vneg.s8 q14, q14 + +L1: +vld3.8 {d0, d1, d2}, [r0]! // r,g,b + +vand.u8 d0, d0, d30 // r & ~7 +vand.u8 d1, d1, d28 // g & ~3 +vshr.u8 d2, d2, #3 // b >> 3 + +vshll.u8 q2, d0, #7 +vshl.u8 q2, q2, #1 +vshll.u8 q3, d1, #3 +vmovl.u8 q8, d2 + +vorr.u8 q2, q2, q3 +vorr.u8 q2, q2, q8 + +vst1.16 {q2}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNRGBToBGRC8.S b/source/backend/cpu/arm/arm32/MNNRGBToBGRC8.S new file mode 100644 index 000000000..f097b94bf --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNRGBToBGRC8.S @@ -0,0 +1,36 @@ +// +// MNNRGBToBGRC8.S +// MNN +// +// Created by MNN on 2024/08/28. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNRGBToBGRC8 +// void MNNRGBToBGRC8(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +L1: +vld3.8 {d0, d1, d2}, [r0]! +vmov d3, d2 +vmov d4, d1 +vmov d5, d0 +vst3.8 {d3, d4, d5}, [r1]! +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNRGBToGRAYFast.S b/source/backend/cpu/arm/arm32/MNNRGBToGRAYFast.S new file mode 100644 index 000000000..258cb8892 --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNRGBToGRAYFast.S @@ -0,0 +1,43 @@ +// +// MNNRGBToGRAYFast.S +// MNN +// +// Created by MNN on 2024/08/28. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNRGBToGRAYFast +// void MNNRGBToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +// Auto Load: r0: source, r1: dest, r2: count + +push {lr} + +vmov.i8 d4, #7 +vmov.i8 d5, #38 +vmov.i8 d6, #19 + +L1: +vld3.8 {d0, d1, d2}, [r0]! +vmull.u8 q8, d2, d4 // b*7 +vmlal.u8 q8, d1, d5 // g*38 +vmlal.u8 q8, d0, d6 // r*19 + +vqshrn.u16 d16, q8, #6 +vst1.8 {d16}, [r1]! + +subs r2, r2, #1 +bne L1 + +End: +pop {pc} + +#endif +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBGRAToBGR.S b/source/backend/cpu/arm/arm64/MNNBGRAToBGR.S new file mode 100644 index 000000000..14a684fdf --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBGRAToBGR.S @@ -0,0 +1,129 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// void MNNBGRAToBGRC8(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNBGRAToBGRC8 +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
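The grayscale kernels in this patch all use the same fixed-point weights (19 for red, 38 for green, 7 for blue, normalized by a 6-bit shift), and the GRAY-to-C3/C4 kernels simply replicate the gray value, appending an alpha of 255 in the C4 case. A scalar sketch of both directions, with illustrative names:

#include <cstddef>
#include <cstdint>

// gray = (19*r + 38*g + 7*b) >> 6, the weights the NEON kernels load into registers;
// the weighted sum never exceeds 64*255, so the shift alone keeps it within [0, 255].
static inline uint8_t toGray(uint8_t r, uint8_t g, uint8_t b) {
    return (uint8_t)((19 * r + 38 * g + 7 * b) >> 6);
}

// GRAY -> C4: replicate the gray value into three channels and fill alpha with 255,
// mirroring MNNGRAYToC4Fast (drop the alpha store for the C3 variant).
static void grayToC4(const uint8_t* src, uint8_t* dst, size_t pixelCount) {
    for (size_t i = 0; i < pixelCount; ++i) {
        dst[4 * i + 0] = src[i];
        dst[4 * i + 1] = src[i];
        dst[4 * i + 2] = src[i];
        dst[4 * i + 3] = 255;
    }
}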
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +L12: +cmp x2, #12 +blt L8 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 +ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64 +ld4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64 +sub x2, x2, #12 +mov v16.16b, v0.16b +mov v17.16b, v1.16b +mov v18.16b, v2.16b +mov v19.16b, v4.16b +mov v20.16b, v5.16b +mov v21.16b, v6.16b + +mov v22.16b, v8.16b +mov v23.16b, v9.16b +mov v24.16b, v10.16b +mov v25.16b, v12.16b +mov v26.16b, v13.16b +mov v27.16b, v14.16b + +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + +mov v4.16b, v28.16b +mov v5.16b, v29.16b +mov v6.16b, v30.16b +mov v8.16b, v0.16b +mov v9.16b, v1.16b +mov v10.16b, v2.16b + + +st3 {v16.16b, v17.16b, v18.16b}, [x1], #48 +st3 {v19.16b, v20.16b, v21.16b}, [x1], #48 +st3 {v22.16b, v23.16b, v24.16b}, [x1], #48 +st3 {v25.16b, v26.16b, v27.16b}, [x1], #48 +st3 {v4.16b, v5.16b, v6.16b}, [x1], #48 +st3 {v8.16b, v9.16b, v10.16b}, [x1], #48 + +b L12 + + +L8: +cmp x2, #8 +blt L4 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 +ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64 +sub x2, x2, #8 +mov v16.16b, v0.16b +mov v17.16b, v1.16b +mov v18.16b, v2.16b +mov v19.16b, v4.16b +mov v20.16b, v5.16b +mov v21.16b, v6.16b + +mov v22.16b, v8.16b +mov v23.16b, v9.16b +mov v24.16b, v10.16b +mov v25.16b, v12.16b +mov v26.16b, v13.16b +mov v27.16b, v14.16b + +st3 {v16.16b, v17.16b, v18.16b}, [x1], #48 +st3 {v19.16b, v20.16b, v21.16b}, [x1], #48 +st3 {v22.16b, v23.16b, v24.16b}, [x1], #48 +st3 {v25.16b, v26.16b, v27.16b}, [x1], #48 +b L8 + +L4: +cmp x2, #4 +blt L2 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v6.16b, v7.16b, v8.16b, v9.16b}, [x0], #64 +sub x2, x2, #4 +mov v10.16b, v0.16b +mov v11.16b, v1.16b +mov v12.16b, v2.16b +mov v13.16b, v6.16b +mov v14.16b, v7.16b +mov v15.16b, v8.16b + +st3 {v10.16b, v11.16b, v12.16b}, [x1], #48 +st3 {v13.16b, v14.16b, v15.16b}, [x1], #48 +b L4 + +L2: +cmp x2, #2 +blt L1 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +mov v4.16b, v0.16b +mov v5.16b, v1.16b +mov v6.16b, v2.16b +sub x2, x2, #2 +st3 {v4.16b, v5.16b, v6.16b}, [x1], #48 +b L2 + +L1: +cmp x2, #1 +blt End +ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32 +mov v5.8b, v0.8b +mov v6.8b, v1.8b +mov v7.8b, v2.8b +st3 {v5.8b, v6.8b, v7.8b}, [x1], #24 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBGRAToGRAY.S b/source/backend/cpu/arm/arm64/MNNBGRAToGRAY.S new file mode 100644 index 000000000..edf9f80fd --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBGRAToGRAY.S @@ -0,0 +1,92 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNBGRAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNBGRAToGRAYFast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v29.16b, #7 +movi v30.16b, #38 +movi v31.16b, #19 + +L4: +cmp x2, #4 +blt L2 + +sub x2, x2, #4 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v14.16b, v15.16b, v16.16b, v17.16b}, [x0], #64 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +umull2 v7.8h, v0.16b, v29.16b // b*7 +umlal2 v7.8h, v1.16b, v30.16b // g*38 +umlal2 v7.8h, v2.16b, v31.16b // r*19 + +umull v18.8h, v14.8b, v29.8b // b*7 +umlal v18.8h, v15.8b, v30.8b // g*38 +umlal v18.8h, v16.8b, v31.8b // r*19 + +umull2 v21.8h, v14.16b, v29.16b // b*7 +umlal2 v21.8h, v15.16b, v30.16b // g*38 +umlal2 v21.8h, v16.16b, v31.16b // r*19 + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 +uqshrn v5.8b, v18.8h, #6 +uqshrn2 v5.16b, v21.8h, #6 + +st1 {v4.16b, v5.16b}, [x1], #32 +b L4 + +L2: +cmp x2, #2 +blt L1 + +sub x2, x2, #2 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +umull2 v7.8h, v0.16b, v29.16b // b*7 +umlal2 v7.8h, v1.16b, v30.16b // g*38 +umlal2 v7.8h, v2.16b, v31.16b // r*19 + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 + +st1 {v4.16b}, [x1], #16 +b L2 + +L1: +cmp x2, #1 +blt End +ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +uqshrn v10.8b, v4.8h, #6 + +st1 {v10.8b}, [x1], #8 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBGRToBGR555.S b/source/backend/cpu/arm/arm64/MNNBGRToBGR555.S new file mode 100644 index 000000000..d4c8ddcd7 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBGRToBGR555.S @@ -0,0 +1,169 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNBGRToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNBGRToBGR555Fast +// x0: source, x1: dest, x2: count, x3: c +stp d14, d15, [sp, #(-16 * 4)]! 
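Both the RGB and BGR variants of the 555/565 kernels in this patch pack the red channel into the high bits of a 16-bit pixel; only the channel order of the 3-byte load differs. A scalar reference of the packing implied by the mask-and-shift sequences, with illustrative names:

#include <cstdint>

// BGR555 layout 0RRRRRGGGGGBBBBB: ((r & ~7) << 7) | ((g & ~7) << 2) | (b >> 3)
static inline uint16_t packBGR555(uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((r & ~7) << 7) | ((g & ~7) << 2) | (b >> 3));
}

// BGR565 layout RRRRRGGGGGGBBBBB: ((r & ~7) << 8) | ((g & ~3) << 3) | (b >> 3)
static inline uint16_t packBGR565(uint8_t r, uint8_t g, uint8_t b) {
    return (uint16_t)(((r & ~7) << 8) | ((g & ~3) << 3) | (b >> 3));
}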
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v31.16b, #8 +neg v31.16b, v31.16b + +L6: +cmp x2, #6 +blt L4 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48 +and v2.16b, v2.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v31.16b // g & ~7 +ushr v0.16b, v0.16b, #3 // b >> 3 +and v13.16b, v13.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v31.16b // g & ~7 +ushr v11.16b, v11.16b, #3 // b >> 3 +and v26.16b, v26.16b, v31.16b // r & ~7 +and v25.16b, v25.16b, v31.16b // g & ~7 +ushr v24.16b, v24.16b, #3 // b >> 3 +sub x2, x2, #6 + +ushll v3.8h, v2.8b, #7 +ushll v4.8h, v1.8b, #2 +uxtl v5.8h, v0.8b +ushll2 v8.8h, v2.16b, #7 +ushll2 v9.8h, v1.16b, #2 +uxtl2 v10.8h, v0.16b + +ushll v14.8h, v13.8b, #7 +ushll v15.8h, v12.8b, #2 +uxtl v16.8h, v11.8b +ushll2 v17.8h, v13.16b, #7 +ushll2 v18.8h, v12.16b, #2 +uxtl2 v19.8h, v11.16b + +ushll v6.8h, v26.8b, #7 +ushll v7.8h, v25.8b, #2 +uxtl v27.8h, v24.8b +ushll2 v28.8h, v26.16b, #7 +ushll2 v29.8h, v25.16b, #2 +uxtl2 v30.8h, v24.16b + +orr v0.16b, v3.16b, v4.16b +orr v0.16b, v0.16b, v5.16b +orr v1.16b, v8.16b, v9.16b +orr v1.16b, v1.16b, v10.16b + +orr v2.16b, v14.16b, v15.16b +orr v2.16b, v2.16b, v16.16b +orr v3.16b, v17.16b, v18.16b +orr v3.16b, v3.16b, v19.16b + +orr v4.16b, v6.16b, v7.16b +orr v4.16b, v4.16b, v27.16b +orr v5.16b, v28.16b, v29.16b +orr v5.16b, v5.16b, v30.16b + +st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +st1 {v4.8h, v5.8h}, [x1], #32 + +b L6 + +L4: +cmp x2, #4 +blt L2 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +and v2.16b, v2.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v31.16b // g & ~7 +ushr v0.16b, v0.16b, #3 // b >> 3 +and v13.16b, v13.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v31.16b // g & ~7 +ushr v11.16b, v11.16b, #3 // b >> 3 +sub x2, x2, #4 + +ushll v3.8h, v2.8b, #7 +ushll v4.8h, v1.8b, #2 +uxtl v5.8h, v0.8b +ushll2 v8.8h, v2.16b, #7 +ushll2 v9.8h, v1.16b, #2 +uxtl2 v10.8h, v0.16b + +ushll v14.8h, v13.8b, #7 +ushll v15.8h, v12.8b, #2 +uxtl v16.8h, v11.8b +ushll2 v17.8h, v13.16b, #7 +ushll2 v18.8h, v12.16b, #2 +uxtl2 v19.8h, v11.16b + + +orr v20.16b, v3.16b, v4.16b +orr v20.16b, v20.16b, v5.16b +orr v21.16b, v8.16b, v9.16b +orr v21.16b, v21.16b, v10.16b + +orr v22.16b, v14.16b, v15.16b +orr v22.16b, v22.16b, v16.16b +orr v23.16b, v17.16b, v18.16b +orr v23.16b, v23.16b, v19.16b + +st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64 + +b L4 + +L2: +cmp x2, #2 +blt L1 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +and v2.16b, v2.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v31.16b // g & ~7 +sub x2, x2, #2 +ushr v0.16b, v0.16b, #3 // b >> 3 + +ushll v3.8h, v2.8b, #7 +ushll v4.8h, v1.8b, #2 +uxtl v5.8h, v0.8b +ushll2 v8.8h, v2.16b, #7 +ushll2 v9.8h, v1.16b, #2 +uxtl2 v10.8h, v0.16b + +orr v6.16b, v3.16b, v4.16b +orr v6.16b, v6.16b, v5.16b +orr v7.16b, v8.16b, v9.16b +orr v7.16b, v7.16b, v10.16b + +st1 {v6.8h, v7.8h}, [x1], #32 + +b L2 + +L1: +cmp x2, #1 +blt End + +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 +and v2.8b, v2.8b, v31.8b // r & ~7 +and v1.8b, v1.8b, v31.8b // g & ~7 +ushr v0.8b, v0.8b, #3 // b >> 3 +ushll v2.8h, v2.8b, #7 +ushll v1.8h, v1.8b, #2 +uxtl v0.8h, v0.8b +orr v3.16b, v0.16b, v1.16b +orr v3.16b, v3.16b, v2.16b + +st1 {v3.8h}, [x1], #16 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBGRToBGR565.S 
b/source/backend/cpu/arm/arm64/MNNBGRToBGR565.S new file mode 100644 index 000000000..0210c0f0c --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBGRToBGR565.S @@ -0,0 +1,187 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNBGRToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNBGRToBGR565Fast +// x0: source, x1: dest, x2: count, x3: c +stp d14, d15, [sp, #(-16 * 4)]! +stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v31.16b, #8 +neg v31.16b, v31.16b + +L6: +cmp x2, #6 +blt L4 + +movi v30.16b, #4 +neg v30.16b, v30.16b + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48 +and v2.16b, v2.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v30.16b // g & ~3 +ushr v0.16b, v0.16b, #3 // b >> 3 +and v13.16b, v13.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v30.16b // g & ~3 +ushr v11.16b, v11.16b, #3 // b >> 3 +and v26.16b, v26.16b, v31.16b // r & ~7 +and v25.16b, v25.16b, v30.16b // g & ~3 +ushr v24.16b, v24.16b, #3 // b >> 3 +sub x2, x2, #6 + +ushll v3.8h, v2.8b, #7 +shl v3.8h, v3.8h, #1 +ushll v4.8h, v1.8b, #3 +uxtl v5.8h, v0.8b +ushll2 v8.8h, v2.16b, #7 +shl v8.8h, v8.8h, #1 +ushll2 v9.8h, v1.16b, #3 +uxtl2 v10.8h, v0.16b + +ushll v14.8h, v13.8b, #7 +shl v14.8h, v14.8h, #1 +ushll v15.8h, v12.8b, #3 +uxtl v16.8h, v11.8b +ushll2 v17.8h, v13.16b, #7 +shl v17.8h, v17.8h, #1 +ushll2 v18.8h, v12.16b, #3 +uxtl2 v19.8h, v11.16b + +ushll v6.8h, v26.8b, #7 +shl v6.8h, v6.8h, #1 +ushll v7.8h, v25.8b, #3 +uxtl v27.8h, v24.8b +ushll2 v28.8h, v26.16b, #7 +shl v28.8h, v28.8h, #1 +ushll2 v29.8h, v25.16b, #3 +uxtl2 v30.8h, v24.16b + +orr v0.16b, v3.16b, v4.16b +orr v0.16b, v0.16b, v5.16b +orr v1.16b, v8.16b, v9.16b +orr v1.16b, v1.16b, v10.16b + +orr v2.16b, v14.16b, v15.16b +orr v2.16b, v2.16b, v16.16b +orr v3.16b, v17.16b, v18.16b +orr v3.16b, v3.16b, v19.16b + +orr v4.16b, v6.16b, v7.16b +orr v4.16b, v4.16b, v27.16b +orr v5.16b, v28.16b, v29.16b +orr v5.16b, v5.16b, v30.16b + +st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +st1 {v4.8h, v5.8h}, [x1], #32 + +b L6 + +L4: +movi v30.16b, #4 +neg v30.16b, v30.16b +cmp x2, #4 +blt L2 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +and v2.16b, v2.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v30.16b // g & ~3 +ushr v0.16b, v0.16b, #3 // b >> 3 +and v13.16b, v13.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v30.16b // g & ~3 +ushr v11.16b, v11.16b, #3 // b >> 3 +sub x2, x2, #4 + +ushll v3.8h, v2.8b, #7 +shl v3.8h, v3.8h, #1 +ushll v4.8h, v1.8b, #3 +uxtl v5.8h, v0.8b +ushll2 v8.8h, v2.16b, #7 +shl v8.8h, v8.8h, #1 +ushll2 v9.8h, v1.16b, #3 +uxtl2 v10.8h, v0.16b + +ushll v14.8h, v13.8b, #7 +shl v14.8h, v14.8h, #1 +ushll v15.8h, v12.8b, #3 +uxtl v16.8h, v11.8b +ushll2 v17.8h, v13.16b, #7 +shl v17.8h, v17.8h, #1 +ushll2 v18.8h, v12.16b, #3 +uxtl2 v19.8h, v11.16b + + +orr v20.16b, v3.16b, v4.16b +orr v20.16b, v20.16b, v5.16b +orr v21.16b, v8.16b, v9.16b +orr v21.16b, v21.16b, v10.16b + +orr v22.16b, v14.16b, v15.16b +orr v22.16b, v22.16b, v16.16b +orr v23.16b, v17.16b, v18.16b +orr v23.16b, v23.16b, v19.16b + +st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64 + +b L4 + +L2: +cmp x2, #2 +blt L1 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +and v2.16b, v2.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v30.16b // g & ~3 +sub x2, x2, #2 +ushr v0.16b, v0.16b, #3 // b >> 3 + +ushll v3.8h, v2.8b, #7 +shl v3.8h, v3.8h, #1 +ushll v4.8h, v1.8b, #3 +uxtl 
v5.8h, v0.8b +ushll2 v8.8h, v2.16b, #7 +shl v8.8h, v8.8h, #1 +ushll2 v9.8h, v1.16b, #3 +uxtl2 v10.8h, v0.16b + +orr v6.16b, v3.16b, v4.16b +orr v6.16b, v6.16b, v5.16b +orr v7.16b, v8.16b, v9.16b +orr v7.16b, v7.16b, v10.16b + +st1 {v6.8h, v7.8h}, [x1], #32 + +b L2 + +L1: +cmp x2, #1 +blt End + +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 +and v2.8b, v2.8b, v31.8b // r & ~7 +and v1.8b, v1.8b, v30.8b // g & ~3 +ushr v0.8b, v0.8b, #3 // b >> 3 +ushll v2.8h, v2.8b, #7 +shl v2.8h, v2.8h, #1 +ushll v1.8h, v1.8b, #3 +uxtl v0.8h, v0.8b +orr v3.16b, v0.16b, v1.16b +orr v3.16b, v3.16b, v2.16b + +st1 {v3.8h}, [x1], #16 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNBGRToGRAY.S b/source/backend/cpu/arm/arm64/MNNBGRToGRAY.S new file mode 100644 index 000000000..cd746a40a --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNBGRToGRAY.S @@ -0,0 +1,92 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNBGRToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNBGRToGRAYFast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! +stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v29.16b, #7 +movi v30.16b, #38 +movi v31.16b, #19 + +L4: +cmp x2, #4 +blt L2 + +sub x2, x2, #4 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v14.16b, v15.16b, v16.16b}, [x0], #48 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +umull2 v7.8h, v0.16b, v29.16b // b*7 +umlal2 v7.8h, v1.16b, v30.16b // g*38 +umlal2 v7.8h, v2.16b, v31.16b // r*19 + +umull v18.8h, v14.8b, v29.8b // b*7 +umlal v18.8h, v15.8b, v30.8b // g*38 +umlal v18.8h, v16.8b, v31.8b // r*19 + +umull2 v21.8h, v14.16b, v29.16b // b*7 +umlal2 v21.8h, v15.16b, v30.16b // g*38 +umlal2 v21.8h, v16.16b, v31.16b // r*19 + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 +uqshrn v5.8b, v18.8h, #6 +uqshrn2 v5.16b, v21.8h, #6 + +st1 {v4.16b, v5.16b}, [x1], #32 +b L4 + +L2: +cmp x2, #2 +blt L1 + +sub x2, x2, #2 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +umull2 v7.8h, v0.16b, v29.16b // b*7 +umlal2 v7.8h, v1.16b, v30.16b // g*38 +umlal2 v7.8h, v2.16b, v31.16b // r*19 + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 + +st1 {v4.16b}, [x1], #16 +b L2 + +L1: +cmp x2, #1 +blt End +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +uqshrn v10.8b, v4.8h, #6 + +st1 {v10.8b}, [x1], #8 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNC3ToC4Fast.S b/source/backend/cpu/arm/arm64/MNNC3ToC4Fast.S new file mode 100644 index 000000000..2c24bed03 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNC3ToC4Fast.S @@ -0,0 +1,116 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// void MNNC3ToC4Fast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNC3ToC4Fast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v3.16b, #255 +movi v7.16b, #255 +movi v11.16b, #255 +movi v15.16b, #255 +movi v19.16b, #255 +movi v23.16b, #255 +movi v27.16b, #255 +movi v31.16b, #255 + +L16: +cmp x2, #16 +blt L12 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48 +ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48 +ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48 +ld3 {v16.16b, v17.16b, v18.16b}, [x0], #48 +ld3 {v20.16b, v21.16b, v22.16b}, [x0], #48 +ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48 +ld3 {v28.16b, v29.16b, v30.16b}, [x0], #48 +sub x2, x2, #16 + +st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 +st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64 +st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64 +st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64 +st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64 +st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64 +st4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #64 +st4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x1], #64 +b L16 + +L12: +cmp x2, #12 +blt L8 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48 +ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48 +ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48 +ld3 {v16.16b, v17.16b, v18.16b}, [x0], #48 +ld3 {v20.16b, v21.16b, v22.16b}, [x0], #48 +sub x2, x2, #12 + +st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 +st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64 +st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64 +st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64 +st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64 +st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64 + +b L12 + + +L8: +cmp x2, #8 +blt L4 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48 +ld3 {v8.16b, v9.16b, v10.16b}, [x0], #48 +ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48 +sub x2, x2, #8 + +st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 +st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64 +st4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x1], #64 +st4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x1], #64 +b L8 + +L4: +cmp x2, #4 +blt L2 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v4.16b, v5.16b, v6.16b}, [x0], #48 +sub x2, x2, #4 + +st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 +st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64 +b L4 + +L2: +cmp x2, #2 +blt L1 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +sub x2, x2, #2 + +st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 +b L2 + +L1: +cmp x2, #1 +blt End +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 + +st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], #32 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNC3ToXYZFast.S b/source/backend/cpu/arm/arm64/MNNC3ToXYZFast.S new file mode 100644 index 000000000..dba224df8 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNC3ToXYZFast.S @@ -0,0 +1,88 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNC3ToXYZFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNC3ToXYZFast +// x0: source, x1: dest, x2: count, x3: c +stp d14, d15, [sp, #(-16 * 4)]! 
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +ld1r {v23.4s}, [x3], #4 +ld1r {v24.4s}, [x3], #4 +ld1r {v25.4s}, [x3], #4 +ld1r {v26.4s}, [x3], #4 +ld1r {v27.4s}, [x3], #4 +ld1r {v28.4s}, [x3], #4 +ld1r {v29.4s}, [x3], #4 +ld1r {v30.4s}, [x3], #4 +ld1r {v31.4s}, [x3], #4 + +L1: +cmp x2, #1 +blt End + +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 +ushll v0.8h, v0.8b, #0 // r: uint8_t -> uint16_t +ushll v1.8h, v1.8b, #0 +ushll v2.8h, v2.8b, #0 + +uxtl v3.4s, v0.4h // r +uxtl2 v4.4s, v0.8h // r +uxtl v5.4s, v1.4h // g +uxtl2 v6.4s, v1.8h // g +uxtl v7.4s, v2.4h // b +uxtl2 v8.4s, v2.8h // b + +// r*C0, g*C1, b*C2 +mul v9.4s, v3.4s, v23.4s +mul v10.4s, v4.4s, v23.4s +mla v9.4s, v5.4s, v24.4s +mla v10.4s, v6.4s, v24.4s +mla v9.4s, v7.4s, v25.4s +mla v10.4s, v8.4s, v25.4s + +// r*C3, g*C4, b*C5 +mul v15.4s, v3.4s, v26.4s +mul v16.4s, v4.4s, v26.4s +mla v15.4s, v5.4s, v27.4s +mla v16.4s, v6.4s, v27.4s +mla v15.4s, v7.4s, v28.4s +mla v16.4s, v8.4s, v28.4s + +// r*C6, g*C7, b*C8 +mul v21.4s, v3.4s, v29.4s +mul v22.4s, v4.4s, v29.4s +mla v21.4s, v5.4s, v30.4s +mla v22.4s, v6.4s, v30.4s +mla v21.4s, v7.4s, v31.4s +mla v22.4s, v8.4s, v31.4s + +uqrshrn v11.4h, v9.4s, #12 +uqrshrn2 v11.8h, v10.4s, #12 +uqrshrn v12.4h, v15.4s, #12 +uqrshrn2 v12.8h, v16.4s, #12 +uqrshrn v13.4h, v21.4s, #12 +uqrshrn2 v13.8h, v22.4s, #12 + +uqxtn v14.8b, v11.8h +uqxtn v15.8b, v12.8h +uqxtn v16.8b, v13.8h + + +st3 {v14.8b, v15.8b, v16.8b}, [x1], #24 +sub x2, x2, #1 +b L1 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNC3ToYUVFast.S b/source/backend/cpu/arm/arm64/MNNC3ToYUVFast.S new file mode 100644 index 000000000..8bd316552 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNC3ToYUVFast.S @@ -0,0 +1,92 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNC3ToYUVFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNC3ToYUVFast +// x0: source, x1: dest, x2: count, x3: c +stp d14, d15, [sp, #(-16 * 4)]! 
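+// MNNC3ToYUVFast: same 3x3 integer matrix scheme as MNNC3ToXYZFast, but with a
+// 14-bit rounding shift, and 128 (v17 below) is added to the 2nd and 3rd outputs
+// (U/V) before the final narrowing to uint8.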
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +ld1r {v23.4s}, [x3], #4 +ld1r {v24.4s}, [x3], #4 +ld1r {v25.4s}, [x3], #4 +ld1r {v26.4s}, [x3], #4 +ld1r {v27.4s}, [x3], #4 +ld1r {v28.4s}, [x3], #4 +ld1r {v29.4s}, [x3], #4 +ld1r {v30.4s}, [x3], #4 +ld1r {v31.4s}, [x3], #4 +movi v17.8h, #128 + +L1: +cmp x2, #1 +blt End + +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 +ushll v0.8h, v0.8b, #0 // r: uint8_t -> uint16_t +ushll v1.8h, v1.8b, #0 +ushll v2.8h, v2.8b, #0 + +uxtl v3.4s, v0.4h // r +uxtl2 v4.4s, v0.8h // r +uxtl v5.4s, v1.4h // g +uxtl2 v6.4s, v1.8h // g +uxtl v7.4s, v2.4h // b +uxtl2 v8.4s, v2.8h // b + +// r*C0, g*C1, b*C2 +mul v9.4s, v3.4s, v23.4s +mul v10.4s, v4.4s, v23.4s +mla v9.4s, v5.4s, v24.4s +mla v10.4s, v6.4s, v24.4s +mla v9.4s, v7.4s, v25.4s +mla v10.4s, v8.4s, v25.4s + +// r*C3, g*C4, b*C5 +mul v15.4s, v3.4s, v26.4s +mul v16.4s, v4.4s, v26.4s +mla v15.4s, v5.4s, v27.4s +mla v16.4s, v6.4s, v27.4s +mla v15.4s, v7.4s, v28.4s +mla v16.4s, v8.4s, v28.4s + +// r*C6, g*C7, b*C8 +mul v21.4s, v3.4s, v29.4s +mul v22.4s, v4.4s, v29.4s +mla v21.4s, v5.4s, v30.4s +mla v22.4s, v6.4s, v30.4s +mla v21.4s, v7.4s, v31.4s +mla v22.4s, v8.4s, v31.4s + +uqrshrn v11.4h, v9.4s, #14 +uqrshrn2 v11.8h, v10.4s, #14 +uqrshrn v12.4h, v15.4s, #14 +uqrshrn2 v12.8h, v16.4s, #14 +uqrshrn v13.4h, v21.4s, #14 +uqrshrn2 v13.8h, v22.4s, #14 + +add v12.8h, v12.8h, v17.8h +add v13.8h, v13.8h, v17.8h + +uqxtn v14.8b, v11.8h +uqxtn v15.8b, v12.8h +uqxtn v16.8b, v13.8h + + +st3 {v14.8b, v15.8b, v16.8b}, [x1], #24 +sub x2, x2, #1 +b L1 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNFloat2Int8.S b/source/backend/cpu/arm/arm64/MNNFloat2Int8.S index 98816cfde..8b5a4e42c 100644 --- a/source/backend/cpu/arm/arm64/MNNFloat2Int8.S +++ b/source/backend/cpu/arm/arm64/MNNFloat2Int8.S @@ -14,21 +14,35 @@ .align 5 asm_function MNNFloat2Int8 -//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, size_t zeroPoint); -//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint +//void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, float* scale, size_t aMin, size_t aMax, float* zeroPoint, ssize_t quanParamVec); +//x0:src, x1:dst, x2:sizeQuad, x3:scale, x4:aMin, x5:aMax, x6:zeroPoint, x7: quanParamVec stp d14, d15, [sp, #-64]! 
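MNNFloat2Int8 now takes the zero point as a float pointer plus a quanParamVec selector: 1 loads a per-lane scale vector from x3, 2 a per-lane zero-point vector from x6, 3 loads both, and any other value broadcasts element 0 of each (the ld1r loads below). A scalar sketch of that behaviour, with an illustrative helper name, long standing in for ssize_t, and lroundf standing in for fcvtas:

    #include <math.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative reference only; mirrors the dispatch and per-element math of the assembly below. */
    static void Float2Int8Ref(const float* src, int8_t* dst, size_t sizeQuad,
                              const float* scale, int aMin, int aMax,
                              const float* zeroPoint, long quanParamVec) {
        float s[4], z[4];
        for (int i = 0; i < 4; ++i) {
            s[i] = (quanParamVec == 1 || quanParamVec == 3) ? scale[i] : scale[0];
            z[i] = (quanParamVec == 2 || quanParamVec == 3) ? zeroPoint[i] : zeroPoint[0];
        }
        for (size_t q = 0; q < sizeQuad; ++q) {
            for (int i = 0; i < 4; ++i) {
                long r = lroundf(src[4 * q + i] * s[i] + z[i]); /* fcvtas: nearest, ties away */
                if (r < aMin) r = aMin;                         /* smax with aMin */
                if (r > aMax) r = aMax;                         /* smin with aMax */
                dst[4 * q + i] = (int8_t)r;
            }
        }
    }

With quanParamVec left at 0 both constants are broadcast from their first element, which matches the previous behaviour except that the zero point is now a float read through a pointer rather than an integer argument.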
stp d12, d13, [sp, #16] stp d10, d11, [sp, #32] stp d8, d9, [sp, #48] -ld1 {v31.4s}, [x3] +ld1r {v31.4s}, [x3] dup v30.16b, w4 dup v29.16b, w5 // copy zero point -dup v28.4s, w6 -scvtf v28.4s, v28.4s +ld1r {v28.4s}, [x6] + +cmp x7, #3 +bne LOAD_SCALE_VEC +ld1 {v31.4s}, [x3] // scale +ld1 {v28.4s}, [x6] // zero +b FL32 +LOAD_SCALE_VEC: +cmp x7, #1 +bne LOAD_ZERO_VEC +ld1 {v31.4s}, [x3] // scale +b FL32 +LOAD_ZERO_VEC: +cmp x7, #2 +bne FL32 +ld1 {v28.4s}, [x6] // zero FL32: cmp x2, #32 @@ -44,58 +58,53 @@ ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 // ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 // ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64 fmul v0.4s, v0.4s, v31.4s -fadd v0.4s, v0.4s, v28.4s fmul v1.4s, v1.4s, v31.4s -fadd v1.4s, v1.4s, v28.4s fmul v2.4s, v2.4s, v31.4s -fadd v2.4s, v2.4s, v28.4s fmul v3.4s, v3.4s, v31.4s -fadd v3.4s, v3.4s, v28.4s - fmul v4.4s, v4.4s, v31.4s -fadd v4.4s, v4.4s, v28.4s fmul v5.4s, v5.4s, v31.4s -fadd v5.4s, v5.4s, v28.4s fmul v6.4s, v6.4s, v31.4s -fadd v6.4s, v6.4s, v28.4s fmul v7.4s, v7.4s, v31.4s -fadd v7.4s, v7.4s, v28.4s - fmul v8.4s, v8.4s, v31.4s -fadd v8.4s, v8.4s, v28.4s fmul v9.4s, v9.4s, v31.4s -fadd v9.4s, v9.4s, v28.4s fmul v10.4s, v10.4s, v31.4s -fadd v10.4s, v10.4s, v28.4s fmul v11.4s, v11.4s, v31.4s -fadd v11.4s, v11.4s, v28.4s - fmul v12.4s, v12.4s, v31.4s -fadd v12.4s, v12.4s, v28.4s fmul v13.4s, v13.4s, v31.4s -fadd v13.4s, v13.4s, v28.4s fmul v14.4s, v14.4s, v31.4s -fadd v14.4s, v14.4s, v28.4s fmul v15.4s, v15.4s, v31.4s -fadd v15.4s, v15.4s, v28.4s - - fmul v16.4s, v16.4s, v31.4s -fadd v16.4s, v16.4s, v28.4s fmul v17.4s, v17.4s, v31.4s -fadd v17.4s, v17.4s, v28.4s fmul v18.4s, v18.4s, v31.4s -fadd v18.4s, v18.4s, v28.4s fmul v19.4s, v19.4s, v31.4s -fadd v19.4s, v19.4s, v28.4s - fmul v20.4s, v20.4s, v31.4s -fadd v20.4s, v20.4s, v28.4s fmul v21.4s, v21.4s, v31.4s -fadd v21.4s, v21.4s, v28.4s fmul v22.4s, v22.4s, v31.4s -fadd v22.4s, v22.4s, v28.4s fmul v23.4s, v23.4s, v31.4s + +fadd v0.4s, v0.4s, v28.4s +fadd v1.4s, v1.4s, v28.4s +fadd v2.4s, v2.4s, v28.4s +fadd v3.4s, v3.4s, v28.4s +fadd v4.4s, v4.4s, v28.4s +fadd v5.4s, v5.4s, v28.4s +fadd v6.4s, v6.4s, v28.4s +fadd v7.4s, v7.4s, v28.4s +fadd v8.4s, v8.4s, v28.4s +fadd v9.4s, v9.4s, v28.4s +fadd v10.4s, v10.4s, v28.4s +fadd v11.4s, v11.4s, v28.4s +fadd v12.4s, v12.4s, v28.4s +fadd v13.4s, v13.4s, v28.4s +fadd v14.4s, v14.4s, v28.4s +fadd v15.4s, v15.4s, v28.4s +fadd v16.4s, v16.4s, v28.4s +fadd v17.4s, v17.4s, v28.4s +fadd v18.4s, v18.4s, v28.4s +fadd v19.4s, v19.4s, v28.4s +fadd v20.4s, v20.4s, v28.4s +fadd v21.4s, v21.4s, v28.4s +fadd v22.4s, v22.4s, v28.4s fadd v23.4s, v23.4s, v28.4s fcvtas v0.4s, v0.4s @@ -171,21 +180,21 @@ sqxtn2 v4.16b, v5.8h sqxtn2 v6.16b, v7.8h fmul v8.4s, v8.4s, v31.4s -fadd v8.4s, v8.4s, v28.4s fmul v9.4s, v9.4s, v31.4s -fadd v9.4s, v9.4s, v28.4s fmul v10.4s, v10.4s, v31.4s -fadd v10.4s, v10.4s, v28.4s fmul v11.4s, v11.4s, v31.4s -fadd v11.4s, v11.4s, v28.4s - fmul v12.4s, v12.4s, v31.4s -fadd v12.4s, v12.4s, v28.4s fmul v13.4s, v13.4s, v31.4s -fadd v13.4s, v13.4s, v28.4s fmul v14.4s, v14.4s, v31.4s -fadd v14.4s, v14.4s, v28.4s fmul v15.4s, v15.4s, v31.4s + +fadd v8.4s, v8.4s, v28.4s +fadd v9.4s, v9.4s, v28.4s +fadd v10.4s, v10.4s, v28.4s +fadd v11.4s, v11.4s, v28.4s +fadd v12.4s, v12.4s, v28.4s +fadd v13.4s, v13.4s, v28.4s +fadd v14.4s, v14.4s, v28.4s fadd v15.4s, v15.4s, v28.4s fcvtas v8.4s, v8.4s @@ -207,8 +216,8 @@ sqxtn v19.4h, v14.4s sqxtn2 v19.8h, v15.4s smin v24.16b, v24.16b, v29.16b -smax v24.16b, v24.16b, v30.16b smin v25.16b, v26.16b, 
v29.16b +smax v24.16b, v24.16b, v30.16b smax v25.16b, v25.16b, v30.16b sqxtn v20.8b, v16.8h @@ -217,18 +226,18 @@ sqxtn v21.8b, v18.8h sqxtn2 v21.16b, v19.8h smin v26.16b, v0.16b, v29.16b -smax v26.16b, v26.16b, v30.16b smin v27.16b, v2.16b, v29.16b +smax v26.16b, v26.16b, v30.16b smax v27.16b, v27.16b, v30.16b smin v12.16b, v4.16b, v29.16b -smax v12.16b, v12.16b, v30.16b smin v13.16b, v6.16b, v29.16b +smax v12.16b, v12.16b, v30.16b smax v13.16b, v13.16b, v30.16b smin v14.16b, v20.16b, v29.16b -smax v14.16b, v14.16b, v30.16b smin v15.16b, v21.16b, v29.16b +smax v14.16b, v14.16b, v30.16b smax v15.16b, v15.16b, v30.16b st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x1], #64 @@ -248,39 +257,37 @@ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64 ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64 fmul v0.4s, v0.4s, v31.4s -fadd v0.4s, v0.4s, v28.4s fmul v1.4s, v1.4s, v31.4s -fadd v1.4s, v1.4s, v28.4s fmul v2.4s, v2.4s, v31.4s -fadd v2.4s, v2.4s, v28.4s fmul v3.4s, v3.4s, v31.4s -fadd v3.4s, v3.4s, v28.4s - fmul v4.4s, v4.4s, v31.4s -fadd v4.4s, v4.4s, v28.4s fmul v5.4s, v5.4s, v31.4s -fadd v5.4s, v5.4s, v28.4s fmul v6.4s, v6.4s, v31.4s -fadd v6.4s, v6.4s, v28.4s fmul v7.4s, v7.4s, v31.4s -fadd v7.4s, v7.4s, v28.4s - fmul v8.4s, v8.4s, v31.4s -fadd v8.4s, v8.4s, v28.4s fmul v9.4s, v9.4s, v31.4s -fadd v9.4s, v9.4s, v28.4s fmul v10.4s, v10.4s, v31.4s -fadd v10.4s, v10.4s, v28.4s fmul v11.4s, v11.4s, v31.4s -fadd v11.4s, v11.4s, v28.4s - fmul v12.4s, v12.4s, v31.4s -fadd v12.4s, v12.4s, v28.4s fmul v13.4s, v13.4s, v31.4s -fadd v13.4s, v13.4s, v28.4s fmul v14.4s, v14.4s, v31.4s -fadd v14.4s, v14.4s, v28.4s fmul v15.4s, v15.4s, v31.4s + +fadd v0.4s, v0.4s, v28.4s +fadd v1.4s, v1.4s, v28.4s +fadd v2.4s, v2.4s, v28.4s +fadd v3.4s, v3.4s, v28.4s +fadd v4.4s, v4.4s, v28.4s +fadd v5.4s, v5.4s, v28.4s +fadd v6.4s, v6.4s, v28.4s +fadd v7.4s, v7.4s, v28.4s +fadd v8.4s, v8.4s, v28.4s +fadd v9.4s, v9.4s, v28.4s +fadd v10.4s, v10.4s, v28.4s +fadd v11.4s, v11.4s, v28.4s +fadd v12.4s, v12.4s, v28.4s +fadd v13.4s, v13.4s, v28.4s +fadd v14.4s, v14.4s, v28.4s fadd v15.4s, v15.4s, v28.4s fcvtas v0.4s, v0.4s @@ -350,21 +357,21 @@ FLLoop8: ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 fmul v0.4s, v0.4s, v31.4s -fadd v0.4s, v0.4s, v28.4s fmul v1.4s, v1.4s, v31.4s -fadd v1.4s, v1.4s, v28.4s fmul v2.4s, v2.4s, v31.4s -fadd v2.4s, v2.4s, v28.4s fmul v3.4s, v3.4s, v31.4s -fadd v3.4s, v3.4s, v28.4s - fmul v4.4s, v4.4s, v31.4s -fadd v4.4s, v4.4s, v28.4s fmul v5.4s, v5.4s, v31.4s -fadd v5.4s, v5.4s, v28.4s fmul v6.4s, v6.4s, v31.4s -fadd v6.4s, v6.4s, v28.4s fmul v7.4s, v7.4s, v31.4s + +fadd v0.4s, v0.4s, v28.4s +fadd v1.4s, v1.4s, v28.4s +fadd v2.4s, v2.4s, v28.4s +fadd v3.4s, v3.4s, v28.4s +fadd v4.4s, v4.4s, v28.4s +fadd v5.4s, v5.4s, v28.4s +fadd v6.4s, v6.4s, v28.4s fadd v7.4s, v7.4s, v28.4s fcvtas v0.4s, v0.4s @@ -405,15 +412,14 @@ cmp x2, #3 ble FL1 FLLoop4: -ld1 {v0.4s, v1.4s}, [x0], #32 +ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 fmul v0.4s, v0.4s, v31.4s -fadd v0.4s, v0.4s, v28.4s -ld1 {v2.4s, v3.4s}, [x0], #32 fmul v1.4s, v1.4s, v31.4s -fadd v1.4s, v1.4s, v28.4s fmul v2.4s, v2.4s, v31.4s -fadd v2.4s, v2.4s, v28.4s fmul v3.4s, v3.4s, v31.4s +fadd v0.4s, v0.4s, v28.4s +fadd v1.4s, v1.4s, v28.4s +fadd v2.4s, v2.4s, v28.4s fadd v3.4s, v3.4s, v28.4s fcvtas v0.4s, v0.4s diff --git a/source/backend/cpu/arm/arm64/MNNGRAYToC3Fast.S b/source/backend/cpu/arm/arm64/MNNGRAYToC3Fast.S new file mode 100644 index 000000000..852e3c5aa --- /dev/null +++ 
b/source/backend/cpu/arm/arm64/MNNGRAYToC3Fast.S @@ -0,0 +1,124 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// void MNNGRAYToC3Fast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNGRAYToC3Fast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! +stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +L12: +cmp x2, #12 +blt L8 +ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld1 {v21.16b, v22.16b}, [x0], #32 +sub x2, x2, #12 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b + +mov v9.16b, v1.16b +mov v10.16b, v1.16b +mov v11.16b, v1.16b + +mov v13.16b, v2.16b +mov v14.16b, v2.16b +mov v15.16b, v2.16b + +mov v17.16b, v3.16b +mov v18.16b, v3.16b +mov v19.16b, v3.16b + +mov v23.16b, v21.16b +mov v24.16b, v21.16b +mov v25.16b, v21.16b + +mov v27.16b, v22.16b +mov v28.16b, v22.16b +mov v29.16b, v22.16b + +st3 {v5.16b, v6.16b, v7.16b}, [x1], #48 +st3 {v9.16b, v10.16b, v11.16b}, [x1], #48 +st3 {v13.16b, v14.16b, v15.16b}, [x1], #48 +st3 {v17.16b, v18.16b, v19.16b}, [x1], #48 +st3 {v23.16b, v24.16b, v25.16b}, [x1], #48 +st3 {v27.16b, v28.16b, v29.16b}, [x1], #48 +b L12 + + +L8: +cmp x2, #8 +blt L4 +ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +sub x2, x2, #8 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b + +mov v9.16b, v1.16b +mov v10.16b, v1.16b +mov v11.16b, v1.16b + +mov v13.16b, v2.16b +mov v14.16b, v2.16b +mov v15.16b, v2.16b + +mov v17.16b, v3.16b +mov v18.16b, v3.16b +mov v19.16b, v3.16b + +st3 {v5.16b, v6.16b, v7.16b}, [x1], #48 +st3 {v9.16b, v10.16b, v11.16b}, [x1], #48 +st3 {v13.16b, v14.16b, v15.16b}, [x1], #48 +st3 {v17.16b, v18.16b, v19.16b}, [x1], #48 +b L8 + +L4: +cmp x2, #4 +blt L2 +ld1 {v0.16b, v1.16b}, [x0], #32 +sub x2, x2, #4 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b + +mov v9.16b, v1.16b +mov v10.16b, v1.16b +mov v11.16b, v1.16b + +st3 {v5.16b, v6.16b, v7.16b}, [x1], #48 +st3 {v9.16b, v10.16b, v11.16b}, [x1], #48 +b L4 + +L2: +cmp x2, #2 +blt L1 +ld1 {v0.16b}, [x0], #16 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b +sub x2, x2, #2 +st3 {v5.16b, v6.16b, v7.16b}, [x1], #48 +b L2 + +L1: +cmp x2, #1 +blt End +ld1 {v0.8b}, [x0], #8 +mov v5.8b, v0.8b +mov v6.8b, v0.8b +mov v7.8b, v0.8b +st3 {v5.8b, v6.8b, v7.8b}, [x1], #24 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNGRAYToC4Fast.S b/source/backend/cpu/arm/arm64/MNNGRAYToC4Fast.S new file mode 100644 index 000000000..e13691d2f --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNGRAYToC4Fast.S @@ -0,0 +1,139 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// void MNNGRAYToC4Fast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNGRAYToC4Fast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
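+// MNNGRAYToC4Fast: replicates each gray byte into R, G and B and stores 255 (v31)
+// as the 4th channel; count is in blocks of 8 pixels, consumed 12/8/4/2/1 at a time.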
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] +movi v31.16b, #255 + +L12: +cmp x2, #12 +blt L8 +ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld1 {v21.16b, v22.16b}, [x0], #32 +sub x2, x2, #12 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b +mov v8.16b, v31.16b + +mov v9.16b, v1.16b +mov v10.16b, v1.16b +mov v11.16b, v1.16b +mov v12.16b, v31.16b + +mov v13.16b, v2.16b +mov v14.16b, v2.16b +mov v15.16b, v2.16b +mov v16.16b, v31.16b + +mov v17.16b, v3.16b +mov v18.16b, v3.16b +mov v19.16b, v3.16b +mov v20.16b, v31.16b + +mov v23.16b, v21.16b +mov v24.16b, v21.16b +mov v25.16b, v21.16b +mov v26.16b, v31.16b + +mov v27.16b, v22.16b +mov v28.16b, v22.16b +mov v29.16b, v22.16b +mov v30.16b, v31.16b + +st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64 +st4 {v9.16b, v10.16b, v11.16b, v12.16b}, [x1], #64 +st4 {v13.16b, v14.16b, v15.16b, v16.16b}, [x1], #64 +st4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x1], #64 +st4 {v23.16b, v24.16b, v25.16b, v26.16b}, [x1], #64 +st4 {v27.16b, v28.16b, v29.16b, v30.16b}, [x1], #64 +b L12 + + +L8: +cmp x2, #8 +blt L4 +ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +sub x2, x2, #8 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b +mov v8.16b, v31.16b + +mov v9.16b, v1.16b +mov v10.16b, v1.16b +mov v11.16b, v1.16b +mov v12.16b, v31.16b + +mov v13.16b, v2.16b +mov v14.16b, v2.16b +mov v15.16b, v2.16b +mov v16.16b, v31.16b + +mov v17.16b, v3.16b +mov v18.16b, v3.16b +mov v19.16b, v3.16b +mov v20.16b, v31.16b + +st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64 +st4 {v9.16b, v10.16b, v11.16b, v12.16b}, [x1], #64 +st4 {v13.16b, v14.16b, v15.16b, v16.16b}, [x1], #64 +st4 {v17.16b, v18.16b, v19.16b, v20.16b}, [x1], #64 +b L8 + +L4: +cmp x2, #4 +blt L2 +ld1 {v0.16b, v1.16b}, [x0], #32 +sub x2, x2, #4 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b +mov v8.16b, v31.16b + +mov v9.16b, v1.16b +mov v10.16b, v1.16b +mov v11.16b, v1.16b +mov v12.16b, v31.16b + +st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64 +st4 {v9.16b, v10.16b, v11.16b, v12.16b}, [x1], #64 +b L4 + +L2: +cmp x2, #2 +blt L1 +ld1 {v0.16b}, [x0], #16 +mov v5.16b, v0.16b +mov v6.16b, v0.16b +mov v7.16b, v0.16b +mov v8.16b, v31.16b +sub x2, x2, #2 +st4 {v5.16b, v6.16b, v7.16b, v8.16b}, [x1], #64 +b L2 + +L1: +cmp x2, #1 +blt End +ld1 {v0.8b}, [x0], #8 +mov v5.8b, v0.8b +mov v6.8b, v0.8b +mov v7.8b, v0.8b +mov v8.8b, v31.8b +st4 {v5.8b, v6.8b, v7.8b, v8.8b}, [x1], #32 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S index d1fdd68bd..339bbd37e 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S @@ -127,7 +127,7 @@ stp x23, x24, [sp, #(16 * 8)] ldr x27, [x6, #64] // blockNum mul x27, x27, x3 // blockNum * src_depth_quad_perblock -lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT +lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT ldr w28, [x6, #24] // useInt8 ldr x25, [x6, #40] // xKernelSum @@ -135,9 +135,9 @@ ldr x26, [x6, #48] // weightQuantBias ldr x24, [x6, #80] // extraScale add x23, x6, #16 // int8 max ptr -mov x21, #4 // sizeof(int8_t) * UNIT +mov x21, #4 // sizeof(int8_t) * pack cbnz w28, Start -mov x21, #16 // sizeof(float) * UNIT +mov x21, #16 // sizeof(float) * pack ldr x23, 
[x6, #56] // fp32minmax Start: mov x22, #48 // src_steps @@ -148,7 +148,6 @@ TILE_12: cmp x5, #2 blt L4LoopDz_TILE_12 L8LoopDz_TILE_12: - //ld1 {v0.4s, v1.4s}, [x9], #32 // bias mov x11, x1 mov x13, x3 mov x20, x0 // tag dst address @@ -162,13 +161,13 @@ L8LoopDz_TILE_12: SET_BIAS v28, v29, v30, v31 L8LoopSz_TILE_12: - ld1 {v3.16b}, [x2], x15 // weight + ld1 {v3.16b, v4.16b}, [x2], #32 // weight ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - ld1 {v4.16b}, [x2], #16 + .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0] .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] @@ -181,7 +180,7 @@ L8LoopDz_TILE_12: .inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1] .inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2] .inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3] - sub x2, x2, x15 + .inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0] .inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1] .inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2] @@ -194,8 +193,7 @@ L8LoopDz_TILE_12: bne L8LoopSz_TILE_12 L8LoopSzEnd_TILE_12: - // add x2, x2, x15 - add x2, x27, x15, LSL #1 + add x2, x27, x15 sub x5, x5, #2 L8Tile12Quan: @@ -352,7 +350,7 @@ L8LoopDz_TILE_12: L8Tile12LoopCheck: cmp x5, #1 bgt L8LoopDz_TILE_12 - blt End + cbz x5, End L4LoopDz_TILE_12: SET_BIAS v8, v9, v10, v11 @@ -360,7 +358,7 @@ L4LoopDz_TILE_12: SET_BIAS v16, v17, v18, v19 L4LoopSz_TILE_12: - ld1 {v3.16b}, [x2], #16 // weight + ld1 {v3.16b}, [x2] // weight ld1 {v0.16b, v1.16b, v2.16b}, [x1], #48 // src .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] @@ -370,6 +368,7 @@ L4LoopDz_TILE_12: .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3] + add x2, x2, #32 // weight offset=lp*hp=32 subs x3, x3, #1 .inst 0x4f82e070 // sdot v16.4s, v3.16b, v2.4b[0] .inst 0x4fa2e071 // sdot v17.4s, v3.16b, v2.4b[1] @@ -497,18 +496,18 @@ L8LoopDz_TILE_8: SET_BIAS v20, v21, v22, v23 L8LoopSz_TILE_8: - ld1 {v3.16b}, [x12], x15 // weight + ld1 {v3.16b, v4.16b}, [x12], #32 // weight ld1 {v0.16b, v1.16b}, [x11], x22 // src .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - ld1 {v4.16b}, [x12], #16 + .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0] .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3] - sub x12, x12, x15 + .inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0] .inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1] .inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2] @@ -521,8 +520,7 @@ L8LoopDz_TILE_8: bne L8LoopSz_TILE_8 L8LoopSzEnd_TILE_8: - //add x12, x12, x15 - add x12, x27, x15, LSL #1 + add x12, x27, x15 sub x14, x14, #2 L8Tile8Quan: @@ -652,12 +650,13 @@ L4LoopDz_TILE_8: SET_BIAS v12, v13, v14, v15 L4LoopSz_TILE_8: - ld1 {v3.16b}, [x12], #16 // weight + ld1 {v3.16b}, [x12] // weight ld1 {v0.16b, v1.16b}, [x11], x22 // src .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] + add x12, x12, #32 // 
weight offset=lp*hp subs x13, x13, #1 .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0] .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] @@ -772,15 +771,14 @@ L8LoopDz_TILE_4: SET_BIAS v12, v13, v14, v15 L8LoopSz_TILE_4: - ld1 {v3.16b}, [x12], x15 // weight + ld1 {v3.16b, v4.16b}, [x12], #32 // weight ld1 {v0.16b}, [x11], x22 // src - ld1 {v4.16b}, [x12], #16 // weight .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] + subs x13, x13, #1 - sub x12, x12, x15 .inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0] .inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1] .inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2] @@ -788,8 +786,7 @@ L8LoopDz_TILE_4: bne L8LoopSz_TILE_4 L8LoopSzEnd_TILE_4: - //add x12, x12, x15 - add x12, x27, x15, LSL #1 + add x12, x27, x15 sub x14, x14, #2 L8Tile4Quan: @@ -879,9 +876,10 @@ L4LoopDz_TILE_4: SET_BIAS v8, v9, v10, v11 L4LoopSz_TILE_4: - ld1 {v3.16b}, [x12], #16 // weight + ld1 {v3.16b}, [x12] // weight ld1 {v0.16b}, [x11], x22 // src subs x13, x13, #1 + add x12, x12, #32 // weight offset = lp*hp .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] @@ -974,17 +972,15 @@ L8LoopDz_TILE_1: movi v8.16b, #0 movi v9.16b, #0 L8LoopSz_TILE_1: - ld1 {v3.16b}, [x12], x15 // weight + ld1 {v3.16b, v4.16b}, [x12], #32 // weight ld1 {v0.s}[0], [x11], x22 // src - ld1 {v4.16b}, [x12], #16 // weight .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] subs x13, x13, #1 - sub x12, x12, x15 .inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0] bne L8LoopSz_TILE_1 L8LoopSzEnd_TILE_1: - add x12, x27, x15, LSL #1 + add x12, x27, x15 sub x14, x14, #2 L8Tile1Quan: @@ -1067,9 +1063,10 @@ L4LoopDz_TILE_1: mov x13, x3 movi v8.16b, #0 L4LoopSz_TILE_1: - ld1 {v3.16b}, [x12], #16 // weight + ld1 {v3.16b}, [x12] // weight ld1 {v0.s}[0], [x11], x22 // src subs x13, x13, #1 + add x12, x12, #32 // weight offset = lp*hp .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] bne L4LoopSz_TILE_1 @@ -1132,11 +1129,11 @@ cbz x24, Tile1_End_Offset add x24, x24, #4 Tile1_End_Offset: - sub x7, x7, #1 + subs x7, x7, #1 add x0, x0, x21 add x1, x1, #4 add x25, x25, #4 - b TILE_1 + bne TILE_1 End: ldp x23, x24, [sp, #(16 * 8)] diff --git a/source/backend/cpu/arm/arm64/MNNPackC2.S b/source/backend/cpu/arm/arm64/MNNPackC2.S new file mode 100644 index 000000000..3a66bafd9 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNPackC2.S @@ -0,0 +1,107 @@ +// +// MNNPackInt8C2.S +// MNN +// +// Created by MNN on 2019/02/02. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNPackInt8C2 +//void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int32_t* areaOffset) +//Auto load: +//x0:dst, x1:src, x2:area, x3:depth, x4: areaOffset, x5: areaOffset + +ldr w10, [x4, #4] // dstDepthOffset +ldr w9, [x4, #0] // srcDepthOffset +uxtw x10, w10 +uxtw x9, w9 + +//x12: srcDepthOffset:area*sizeof(float) +mov x12, #4 +mul x12, x9, x12 + +//r10 -> 2 * (dstArea * sizeof(float) - area * sizeof(float)) +mov x5, #8 +sub x10, x10, x2 +mul x10, x5, x10 + +//r9 -> (srcArea * sizeof(float) - area * sizeof(float)) +mov x6, #4 +sub x9, x9, x2 +mul x9, x6, x9 + +UpL2: +cmp x3, #1 +ble UpL1 + +UpL2Loop: +add x5, x1, x12 +mov x8, x2 +cmp x8, #3 +ble UpL2AreaRemain +UpL2AreaLoop: +ld1 {v0.4s}, [x1], #16 +ld1 {v1.4s}, [x5], #16 + +st2 {v0.4s, v1.4s}, [x0], #32 +sub x8, x8, #4 +cmp x8, #4 +bge UpL2AreaLoop + +cmp x8, #0 +beq UpL2AreaRemainEnd +UpL2AreaRemain: +ld1 {v0.s}[0], [x1], #4 +ld1 {v0.s}[1], [x5], #4 + +st1 {v0.d}[0], [x0], #8 + +subs x8, x8, #1 +bne UpL2AreaRemain + +UpL2AreaRemainEnd: +sub x3, x3, #2 +add x1, x5, x9 +cmp x3, #2 +add x0, x10, x0 +bge UpL2Loop + +UpL1: +cmp x3, #0 +beq UpEnd +mov x8, x2 +cmp x8, #3 +ble UpL1AreaRemain +UpL1AreaLoop: +ld1 {v0.4s}, [x1], #16 +movi v1.4s, #0 + +st2 {v0.4s, v1.4s}, [x0], #32 +sub x8, x8, #4 +cmp x8, #4 +bge UpL1AreaLoop + +cmp x8, #0 +beq UpL1AreaRemainEnd +UpL1AreaRemain: +movi v0.4s, #0 +ld1 {v0.s}[0], [x1], #4 + +st1 {v0.d}[0], [x0], #8 + +subs x8, x8, #1 +bne UpL1AreaRemain + +UpL1AreaRemainEnd: + +UpEnd: + +ret + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNRGBAToBGRAFast.S b/source/backend/cpu/arm/arm64/MNNRGBAToBGRAFast.S new file mode 100644 index 000000000..884d48e8f --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNRGBAToBGRAFast.S @@ -0,0 +1,147 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// void MNNRGBAToBGRAFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNRGBAToBGRAFast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
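+// MNNRGBAToBGRAFast: swaps the R and B channels of 4-channel pixels; G and alpha
+// pass through unchanged (ld4 in, st4 out with lanes 0 and 2 exchanged).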
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +L10: +cmp x2, #10 +blt L8 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 +ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64 +ld4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64 +sub x2, x2, #10 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b +mov v19.16b, v3.16b + +mov v20.16b, v6.16b +mov v21.16b, v5.16b +mov v22.16b, v4.16b +mov v23.16b, v7.16b + +mov v24.16b, v10.16b +mov v25.16b, v9.16b +mov v26.16b, v8.16b +mov v27.16b, v11.16b + +mov v0.16b, v14.16b +mov v1.16b, v13.16b +mov v2.16b, v12.16b +mov v3.16b, v15.16b + +mov v4.16b, v30.16b +mov v5.16b, v29.16b +mov v6.16b, v28.16b +mov v7.16b, v31.16b + +st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64 +st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64 +st4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #64 +st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 +st4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64 + +b L10 + + +L8: +cmp x2, #8 +blt L4 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 +ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64 +sub x2, x2, #8 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b +mov v19.16b, v3.16b + +mov v20.16b, v6.16b +mov v21.16b, v5.16b +mov v22.16b, v4.16b +mov v23.16b, v7.16b + +mov v24.16b, v10.16b +mov v25.16b, v9.16b +mov v26.16b, v8.16b +mov v27.16b, v11.16b + +mov v28.16b, v14.16b +mov v29.16b, v13.16b +mov v30.16b, v12.16b +mov v31.16b, v15.16b + +st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64 +st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64 +st4 {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #64 +st4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x1], #64 +b L8 + +L4: +cmp x2, #4 +blt L2 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +sub x2, x2, #4 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b +mov v19.16b, v3.16b + +mov v20.16b, v6.16b +mov v21.16b, v5.16b +mov v22.16b, v4.16b +mov v23.16b, v7.16b + +st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64 +st4 {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64 +b L4 + +L2: +cmp x2, #2 +blt L1 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +sub x2, x2, #2 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b +mov v19.16b, v3.16b + +st4 {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64 +b L2 + +L1: +cmp x2, #1 +blt End +ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32 + +mov v16.8b, v2.8b +mov v17.8b, v1.8b +mov v18.8b, v0.8b +mov v19.8b, v3.8b + +st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [x1], #32 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNRGBAToBGRFast.S b/source/backend/cpu/arm/arm64/MNNRGBAToBGRFast.S new file mode 100644 index 000000000..d894c0c13 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNRGBAToBGRFast.S @@ -0,0 +1,134 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// void MNNRGBAToBGRFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNRGBAToBGRFast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
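+// MNNRGBAToBGRFast: swaps R and B and drops the alpha channel (ld4 in, st3 out).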
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +L10: +cmp x2, #10 +blt L8 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 +ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64 +ld4 {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64 +sub x2, x2, #10 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b + +mov v20.16b, v6.16b +mov v21.16b, v5.16b +mov v22.16b, v4.16b + +mov v24.16b, v10.16b +mov v25.16b, v9.16b +mov v26.16b, v8.16b + +mov v0.16b, v14.16b +mov v1.16b, v13.16b +mov v2.16b, v12.16b + +mov v4.16b, v30.16b +mov v5.16b, v29.16b +mov v6.16b, v28.16b + +st3 {v16.16b, v17.16b, v18.16b}, [x1], #48 +st3 {v20.16b, v21.16b, v22.16b}, [x1], #48 +st3 {v24.16b, v25.16b, v26.16b}, [x1], #48 +st3 {v0.16b, v1.16b, v2.16b}, [x1], #48 +st3 {v4.16b, v5.16b, v6.16b}, [x1], #48 + +b L10 + + +L8: +cmp x2, #8 +blt L4 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +ld4 {v8.16b, v9.16b, v10.16b, v11.16b}, [x0], #64 +ld4 {v12.16b, v13.16b, v14.16b, v15.16b}, [x0], #64 +sub x2, x2, #8 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b + +mov v20.16b, v6.16b +mov v21.16b, v5.16b +mov v22.16b, v4.16b + +mov v24.16b, v10.16b +mov v25.16b, v9.16b +mov v26.16b, v8.16b + +mov v28.16b, v14.16b +mov v29.16b, v13.16b +mov v30.16b, v12.16b + +st3 {v16.16b, v17.16b, v18.16b}, [x1], #48 +st3 {v20.16b, v21.16b, v22.16b}, [x1], #48 +st3 {v24.16b, v25.16b, v26.16b}, [x1], #48 +st3 {v28.16b, v29.16b, v30.16b}, [x1], #48 +b L8 + +L4: +cmp x2, #4 +blt L2 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], #64 +sub x2, x2, #4 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b + +mov v20.16b, v6.16b +mov v21.16b, v5.16b +mov v22.16b, v4.16b + +st3 {v16.16b, v17.16b, v18.16b}, [x1], #48 +st3 {v20.16b, v21.16b, v22.16b}, [x1], #48 +b L4 + +L2: +cmp x2, #2 +blt L1 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +sub x2, x2, #2 + +mov v16.16b, v2.16b +mov v17.16b, v1.16b +mov v18.16b, v0.16b + +st3 {v16.16b, v17.16b, v18.16b}, [x1], #48 +b L2 + +L1: +cmp x2, #1 +blt End +ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32 + +mov v16.8b, v2.8b +mov v17.8b, v1.8b +mov v18.8b, v0.8b + +st3 {v16.8b, v17.8b, v18.8b}, [x1], #24 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNRGBAToGRAYFast.S b/source/backend/cpu/arm/arm64/MNNRGBAToGRAYFast.S new file mode 100644 index 000000000..d83e3c8a1 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNRGBAToGRAYFast.S @@ -0,0 +1,96 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNRGBAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNRGBAToGRAYFast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
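MNNRGBAToGRAYFast below computes a fixed-point weighted sum: the weights 19/38/7 are roughly the BT.601 luma coefficients scaled by 64, they sum to exactly 64, and the 6-bit shift therefore keeps the result in 0..255. A per-pixel scalar sketch (helper name illustrative; the assembly's count argument is in blocks of 8 pixels):

    #include <stddef.h>

    /* Illustrative reference for the umull/umlal chain in the assembly below. */
    static void RGBAToGRAYRef(const unsigned char* src, unsigned char* dst, size_t pixels) {
        for (size_t i = 0; i < pixels; ++i) {
            unsigned r = src[4 * i + 0], g = src[4 * i + 1], b = src[4 * i + 2];
            dst[i] = (unsigned char)((19 * r + 38 * g + 7 * b) >> 6); /* alpha byte is ignored */
        }
    }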
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v29.16b, #19 +movi v30.16b, #38 +movi v31.16b, #7 + +// b*7 +// g*38 +// r*19 + +L4: +cmp x2, #4 +blt L2 + +sub x2, x2, #4 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 +ld4 {v14.16b, v15.16b, v16.16b, v17.16b}, [x0], #64 + +umull v4.8h, v0.8b, v29.8b +umlal v4.8h, v1.8b, v30.8b +umlal v4.8h, v2.8b, v31.8b + +umull2 v7.8h, v0.16b, v29.16b +umlal2 v7.8h, v1.16b, v30.16b +umlal2 v7.8h, v2.16b, v31.16b + +umull v18.8h, v14.8b, v29.8b +umlal v18.8h, v15.8b, v30.8b +umlal v18.8h, v16.8b, v31.8b + +umull2 v21.8h, v14.16b, v29.16b +umlal2 v21.8h, v15.16b, v30.16b +umlal2 v21.8h, v16.16b, v31.16b + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 +uqshrn v5.8b, v18.8h, #6 +uqshrn2 v5.16b, v21.8h, #6 + +st1 {v4.16b, v5.16b}, [x1], #32 +b L4 + +L2: +cmp x2, #2 +blt L1 + +sub x2, x2, #2 +ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64 + +umull v4.8h, v0.8b, v29.8b +umlal v4.8h, v1.8b, v30.8b +umlal v4.8h, v2.8b, v31.8b + +umull2 v7.8h, v0.16b, v29.16b +umlal2 v7.8h, v1.16b, v30.16b +umlal2 v7.8h, v2.16b, v31.16b + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 + +st1 {v4.16b}, [x1], #16 +b L2 + +L1: +cmp x2, #1 +blt End +ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], #32 + +umull v4.8h, v0.8b, v29.8b +umlal v4.8h, v1.8b, v30.8b +umlal v4.8h, v2.8b, v31.8b + +uqshrn v10.8b, v4.8h, #6 + +st1 {v10.8b}, [x1], #8 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNRGBToBGR.S b/source/backend/cpu/arm/arm64/MNNRGBToBGR.S new file mode 100644 index 000000000..f12cf78e0 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNRGBToBGR.S @@ -0,0 +1,126 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +// void MNNRGBToBGR(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNRGBToBGRC8 +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! 
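+// MNNRGBToBGRC8: reverses the channel order of 3-channel pixels (RGB <-> BGR) by
+// exchanging the first and third planes between ld3 and st3.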
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +L12: +cmp x2, #12 +blt L8 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v6.16b, v7.16b, v8.16b}, [x0], #48 +ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48 +ld3 {v15.16b, v16.16b, v17.16b}, [x0], #48 +ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48 +ld3 {v27.16b, v28.16b, v29.16b}, [x0], #48 +sub x2, x2, #12 +mov v3.16b, v2.16b +mov v4.16b, v1.16b +mov v5.16b, v0.16b +mov v9.16b, v8.16b +mov v10.16b, v7.16b +mov v11.16b, v6.16b + +mov v18.16b, v14.16b +mov v19.16b, v13.16b +mov v20.16b, v12.16b +mov v21.16b, v17.16b +mov v22.16b, v16.16b +mov v23.16b, v15.16b + +mov v0.16b, v26.16b +mov v1.16b, v25.16b +mov v2.16b, v24.16b +mov v6.16b, v29.16b +mov v7.16b, v28.16b +mov v8.16b, v27.16b +st3 {v3.16b, v4.16b, v5.16b}, [x1], #48 +st3 {v9.16b, v10.16b, v11.16b}, [x1], #48 +st3 {v18.16b, v19.16b, v20.16b}, [x1], #48 +st3 {v21.16b, v22.16b, v23.16b}, [x1], #48 +st3 {v0.16b, v1.16b, v2.16b}, [x1], #48 +st3 {v6.16b, v7.16b, v8.16b}, [x1], #48 + +b L12 + + +L8: +cmp x2, #8 +blt L4 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v6.16b, v7.16b, v8.16b}, [x0], #48 +ld3 {v12.16b, v13.16b, v14.16b}, [x0], #48 +ld3 {v15.16b, v16.16b, v17.16b}, [x0], #48 +sub x2, x2, #8 +mov v3.16b, v2.16b +mov v4.16b, v1.16b +mov v5.16b, v0.16b +mov v9.16b, v8.16b +mov v10.16b, v7.16b +mov v11.16b, v6.16b + +mov v18.16b, v14.16b +mov v19.16b, v13.16b +mov v20.16b, v12.16b +mov v21.16b, v17.16b +mov v22.16b, v16.16b +mov v23.16b, v15.16b + +st3 {v3.16b, v4.16b, v5.16b}, [x1], #48 +st3 {v9.16b, v10.16b, v11.16b}, [x1], #48 +st3 {v18.16b, v19.16b, v20.16b}, [x1], #48 +st3 {v21.16b, v22.16b, v23.16b}, [x1], #48 +b L8 + +L4: +cmp x2, #4 +blt L2 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v6.16b, v7.16b, v8.16b}, [x0], #48 +sub x2, x2, #4 +mov v3.16b, v2.16b +mov v4.16b, v1.16b +mov v5.16b, v0.16b +mov v9.16b, v8.16b +mov v10.16b, v7.16b +mov v11.16b, v6.16b + +st3 {v3.16b, v4.16b, v5.16b}, [x1], #48 +st3 {v9.16b, v10.16b, v11.16b}, [x1], #48 +b L4 + +L2: +cmp x2, #2 +blt L1 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +mov v3.16b, v2.16b +mov v4.16b, v1.16b +mov v5.16b, v0.16b +sub x2, x2, #2 +st3 {v3.16b, v4.16b, v5.16b}, [x1], #48 +b L2 + +L1: +cmp x2, #1 +blt End +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 +mov v3.8b, v2.8b +mov v4.8b, v1.8b +mov v5.8b, v0.8b +st3 {v3.8b, v4.8b, v5.8b}, [x1], #24 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNRGBToBGR555.S b/source/backend/cpu/arm/arm64/MNNRGBToBGR555.S new file mode 100644 index 000000000..d34a588c9 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNRGBToBGR555.S @@ -0,0 +1,169 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNRGBToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNRGBToBGR555Fast +// x0: source, x1: dest, x2: count, x3: c +stp d14, d15, [sp, #(-16 * 4)]! 
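MNNRGBToBGR555Fast below packs each pixel into a 16-bit value with 5 bits per channel; the and/ushr/ushll/orr sequence is equivalent to the scalar sketch here (helper name illustrative; count is in blocks of 8 pixels). The 565 routine later in this patch differs only in keeping 6 green bits: ((r & ~7) << 8) | ((g & ~3) << 3) | (b >> 3).

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative per-pixel reference for the vector packing below. */
    static void RGBToBGR555Ref(const unsigned char* src, uint16_t* dst, size_t pixels) {
        for (size_t i = 0; i < pixels; ++i) {
            unsigned r = src[3 * i + 0], g = src[3 * i + 1], b = src[3 * i + 2];
            /* (r & ~7) << 7 == (r >> 3) << 10 and (g & ~7) << 2 == (g >> 3) << 5 */
            dst[i] = (uint16_t)(((r & ~7u) << 7) | ((g & ~7u) << 2) | (b >> 3));
        }
    }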
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v31.16b, #8 +neg v31.16b, v31.16b + +L6: +cmp x2, #6 +blt L4 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48 +and v0.16b, v0.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v31.16b // g & ~7 +ushr v2.16b, v2.16b, #3 // b >> 3 +and v11.16b, v11.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v31.16b // g & ~7 +ushr v13.16b, v13.16b, #3 // b >> 3 +and v24.16b, v24.16b, v31.16b // r & ~7 +and v25.16b, v25.16b, v31.16b // g & ~7 +ushr v26.16b, v26.16b, #3 // b >> 3 +sub x2, x2, #6 + +ushll v3.8h, v0.8b, #7 +ushll v4.8h, v1.8b, #2 +uxtl v5.8h, v2.8b +ushll2 v8.8h, v0.16b, #7 +ushll2 v9.8h, v1.16b, #2 +uxtl2 v10.8h, v2.16b + +ushll v14.8h, v11.8b, #7 +ushll v15.8h, v12.8b, #2 +uxtl v16.8h, v13.8b +ushll2 v17.8h, v11.16b, #7 +ushll2 v18.8h, v12.16b, #2 +uxtl2 v19.8h, v13.16b + +ushll v6.8h, v24.8b, #7 +ushll v7.8h, v25.8b, #2 +uxtl v27.8h, v26.8b +ushll2 v28.8h, v24.16b, #7 +ushll2 v29.8h, v25.16b, #2 +uxtl2 v30.8h, v26.16b + +orr v0.16b, v3.16b, v4.16b +orr v0.16b, v0.16b, v5.16b +orr v1.16b, v8.16b, v9.16b +orr v1.16b, v1.16b, v10.16b + +orr v2.16b, v14.16b, v15.16b +orr v2.16b, v2.16b, v16.16b +orr v3.16b, v17.16b, v18.16b +orr v3.16b, v3.16b, v19.16b + +orr v4.16b, v6.16b, v7.16b +orr v4.16b, v4.16b, v27.16b +orr v5.16b, v28.16b, v29.16b +orr v5.16b, v5.16b, v30.16b + +st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +st1 {v4.8h, v5.8h}, [x1], #32 + +b L6 + +L4: +cmp x2, #4 +blt L2 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +and v0.16b, v0.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v31.16b // g & ~7 +ushr v2.16b, v2.16b, #3 // b >> 3 +and v11.16b, v11.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v31.16b // g & ~7 +ushr v13.16b, v13.16b, #3 // b >> 3 +sub x2, x2, #4 + +ushll v3.8h, v0.8b, #7 +ushll v4.8h, v1.8b, #2 +uxtl v5.8h, v2.8b +ushll2 v8.8h, v0.16b, #7 +ushll2 v9.8h, v1.16b, #2 +uxtl2 v10.8h, v2.16b + +ushll v14.8h, v11.8b, #7 +ushll v15.8h, v12.8b, #2 +uxtl v16.8h, v13.8b +ushll2 v17.8h, v11.16b, #7 +ushll2 v18.8h, v12.16b, #2 +uxtl2 v19.8h, v13.16b + + +orr v20.16b, v3.16b, v4.16b +orr v20.16b, v20.16b, v5.16b +orr v21.16b, v8.16b, v9.16b +orr v21.16b, v21.16b, v10.16b + +orr v22.16b, v14.16b, v15.16b +orr v22.16b, v22.16b, v16.16b +orr v23.16b, v17.16b, v18.16b +orr v23.16b, v23.16b, v19.16b + +st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64 + +b L4 + +L2: +cmp x2, #2 +blt L1 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +and v0.16b, v0.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v31.16b // g & ~7 +sub x2, x2, #2 +ushr v2.16b, v2.16b, #3 // b >> 3 + +ushll v3.8h, v0.8b, #7 +ushll v4.8h, v1.8b, #2 +uxtl v5.8h, v2.8b +ushll2 v8.8h, v0.16b, #7 +ushll2 v9.8h, v1.16b, #2 +uxtl2 v10.8h, v2.16b + +orr v6.16b, v3.16b, v4.16b +orr v6.16b, v6.16b, v5.16b +orr v7.16b, v8.16b, v9.16b +orr v7.16b, v7.16b, v10.16b + +st1 {v6.8h, v7.8h}, [x1], #32 + +b L2 + +L1: +cmp x2, #1 +blt End + +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 +and v0.8b, v0.8b, v31.8b // r & ~7 +and v1.8b, v1.8b, v31.8b // g & ~7 +ushr v2.8b, v2.8b, #3 // b >> 3 +ushll v0.8h, v0.8b, #7 +ushll v1.8h, v1.8b, #2 +uxtl v2.8h, v2.8b +orr v0.16b, v0.16b, v1.16b +orr v0.16b, v0.16b, v2.16b + +st1 {v0.8h}, [x1], #16 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNRGBToBGR565.S 
b/source/backend/cpu/arm/arm64/MNNRGBToBGR565.S new file mode 100644 index 000000000..359ba392b --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNRGBToBGR565.S @@ -0,0 +1,187 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNRGBToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNRGBToBGR565Fast +// x0: source, x1: dest, x2: count, x3: c +stp d14, d15, [sp, #(-16 * 4)]! +stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v31.16b, #8 +neg v31.16b, v31.16b + +L6: +cmp x2, #6 +blt L4 + +movi v30.16b, #4 +neg v30.16b, v30.16b + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +ld3 {v24.16b, v25.16b, v26.16b}, [x0], #48 +and v0.16b, v0.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v30.16b // g & ~3 +ushr v2.16b, v2.16b, #3 // b >> 3 +and v11.16b, v11.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v30.16b // g & ~3 +ushr v13.16b, v13.16b, #3 // b >> 3 +and v24.16b, v24.16b, v31.16b // r & ~7 +and v25.16b, v25.16b, v30.16b // g & ~3 +ushr v26.16b, v26.16b, #3 // b >> 3 +sub x2, x2, #6 + +ushll v3.8h, v0.8b, #7 +shl v3.8h, v3.8h, #1 +ushll v4.8h, v1.8b, #3 +uxtl v5.8h, v2.8b +ushll2 v8.8h, v0.16b, #7 +shl v8.8h, v8.8h, #1 +ushll2 v9.8h, v1.16b, #3 +uxtl2 v10.8h, v2.16b + +ushll v14.8h, v11.8b, #7 +shl v14.8h, v14.8h, #1 +ushll v15.8h, v12.8b, #3 +uxtl v16.8h, v13.8b +ushll2 v17.8h, v11.16b, #7 +shl v17.8h, v17.8h, #1 +ushll2 v18.8h, v12.16b, #3 +uxtl2 v19.8h, v13.16b + +ushll v6.8h, v24.8b, #7 +shl v6.8h, v6.8h, #1 +ushll v7.8h, v25.8b, #3 +uxtl v27.8h, v26.8b +ushll2 v28.8h, v24.16b, #7 +shl v28.8h, v28.8h, #1 +ushll2 v29.8h, v25.16b, #3 +uxtl2 v30.8h, v26.16b + +orr v0.16b, v3.16b, v4.16b +orr v0.16b, v0.16b, v5.16b +orr v1.16b, v8.16b, v9.16b +orr v1.16b, v1.16b, v10.16b + +orr v2.16b, v14.16b, v15.16b +orr v2.16b, v2.16b, v16.16b +orr v3.16b, v17.16b, v18.16b +orr v3.16b, v3.16b, v19.16b + +orr v4.16b, v6.16b, v7.16b +orr v4.16b, v4.16b, v27.16b +orr v5.16b, v28.16b, v29.16b +orr v5.16b, v5.16b, v30.16b + +st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 +st1 {v4.8h, v5.8h}, [x1], #32 + +b L6 + +L4: +movi v30.16b, #4 +neg v30.16b, v30.16b +cmp x2, #4 +blt L2 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v11.16b, v12.16b, v13.16b}, [x0], #48 +and v0.16b, v0.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v30.16b // g & ~3 +ushr v2.16b, v2.16b, #3 // b >> 3 +and v11.16b, v11.16b, v31.16b // r & ~7 +and v12.16b, v12.16b, v30.16b // g & ~3 +ushr v13.16b, v13.16b, #3 // b >> 3 +sub x2, x2, #4 + +ushll v3.8h, v0.8b, #7 +shl v3.8h, v3.8h, #1 +ushll v4.8h, v1.8b, #3 +uxtl v5.8h, v2.8b +ushll2 v8.8h, v0.16b, #7 +shl v8.8h, v8.8h, #1 +ushll2 v9.8h, v1.16b, #3 +uxtl2 v10.8h, v2.16b + +ushll v14.8h, v11.8b, #7 +shl v14.8h, v14.8h, #1 +ushll v15.8h, v12.8b, #3 +uxtl v16.8h, v13.8b +ushll2 v17.8h, v11.16b, #7 +shl v17.8h, v17.8h, #1 +ushll2 v18.8h, v12.16b, #3 +uxtl2 v19.8h, v13.16b + + +orr v20.16b, v3.16b, v4.16b +orr v20.16b, v20.16b, v5.16b +orr v21.16b, v8.16b, v9.16b +orr v21.16b, v21.16b, v10.16b + +orr v22.16b, v14.16b, v15.16b +orr v22.16b, v22.16b, v16.16b +orr v23.16b, v17.16b, v18.16b +orr v23.16b, v23.16b, v19.16b + +st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x1], #64 + +b L4 + +L2: +cmp x2, #2 +blt L1 + +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +and v0.16b, v0.16b, v31.16b // r & ~7 +and v1.16b, v1.16b, v30.16b // g & ~7 +sub x2, x2, #2 +ushr v2.16b, v2.16b, #3 // b >> 3 + +ushll v3.8h, v0.8b, #7 +shl v3.8h, v3.8h, #1 +ushll v4.8h, v1.8b, #3 +uxtl 
v5.8h, v2.8b +ushll2 v8.8h, v0.16b, #7 +shl v8.8h, v8.8h, #1 +ushll2 v9.8h, v1.16b, #3 +uxtl2 v10.8h, v2.16b + +orr v6.16b, v3.16b, v4.16b +orr v6.16b, v6.16b, v5.16b +orr v7.16b, v8.16b, v9.16b +orr v7.16b, v7.16b, v10.16b + +st1 {v6.8h, v7.8h}, [x1], #32 + +b L2 + +L1: +cmp x2, #1 +blt End + +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 +and v0.8b, v0.8b, v31.8b // r & ~7 +and v1.8b, v1.8b, v30.8b // g & ~7 +ushr v2.8b, v2.8b, #3 // b >> 3 +ushll v0.8h, v0.8b, #7 +shl v0.8h, v0.8h, #1 +ushll v1.8h, v1.8b, #3 +uxtl v2.8h, v2.8b +orr v0.16b, v0.16b, v1.16b +orr v0.16b, v0.16b, v2.16b + +st1 {v0.8h}, [x1], #16 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNRGBToGRAYFast.S b/source/backend/cpu/arm/arm64/MNNRGBToGRAYFast.S new file mode 100644 index 000000000..09ffb3ac7 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNRGBToGRAYFast.S @@ -0,0 +1,92 @@ +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +// void MNNRGBToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +asm_function MNNRGBToGRAYFast +// x0: source, x1: dest, x2: count +stp d14, d15, [sp, #(-16 * 4)]! +stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] + +movi v29.16b, #19 +movi v30.16b, #38 +movi v31.16b, #7 + +L4: +cmp x2, #4 +blt L2 + +sub x2, x2, #4 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 +ld3 {v14.16b, v15.16b, v16.16b}, [x0], #48 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +umull2 v7.8h, v0.16b, v29.16b // b*7 +umlal2 v7.8h, v1.16b, v30.16b // g*38 +umlal2 v7.8h, v2.16b, v31.16b // r*19 + +umull v18.8h, v14.8b, v29.8b // b*7 +umlal v18.8h, v15.8b, v30.8b // g*38 +umlal v18.8h, v16.8b, v31.8b // r*19 + +umull2 v21.8h, v14.16b, v29.16b // b*7 +umlal2 v21.8h, v15.16b, v30.16b // g*38 +umlal2 v21.8h, v16.16b, v31.16b // r*19 + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 +uqshrn v5.8b, v18.8h, #6 +uqshrn2 v5.16b, v21.8h, #6 + +st1 {v4.16b, v5.16b}, [x1], #32 +b L4 + +L2: +cmp x2, #2 +blt L1 + +sub x2, x2, #2 +ld3 {v0.16b, v1.16b, v2.16b}, [x0], #48 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +umull2 v7.8h, v0.16b, v29.16b // b*7 +umlal2 v7.8h, v1.16b, v30.16b // g*38 +umlal2 v7.8h, v2.16b, v31.16b // r*19 + +uqshrn v4.8b, v4.8h, #6 +uqshrn2 v4.16b, v7.8h, #6 + +st1 {v4.16b}, [x1], #16 +b L2 + +L1: +cmp x2, #1 +blt End +ld3 {v0.8b, v1.8b, v2.8b}, [x0], #24 + +umull v4.8h, v0.8b, v29.8b // b*7 +umlal v4.8h, v1.8b, v30.8b // g*38 +umlal v4.8h, v2.8b, v31.8b // r*19 + +uqshrn v10.8b, v4.8h, #6 + +st1 {v10.8b}, [x1], #8 + +End: +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 4) +ret +#endif diff --git a/source/backend/cpu/arm/arm64/MNNSamplerC3BilinearOpt.S b/source/backend/cpu/arm/arm64/MNNSamplerC3BilinearOpt.S new file mode 100644 index 000000000..b809984c3 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNSamplerC3BilinearOpt.S @@ -0,0 +1,171 @@ +// +// MNNSamplerC3BilinearOpt.S +// MNN +// +// Created by MNN on 2018/11/20. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 +//void MNNSamplerC3BilinearOpt(const unsigned char* source, unsigned char* dest, float* points, size_t count, size_t iw, size_t ih, size_t yStride); +asm_function MNNSamplerC3BilinearOpt + +//Auto: x0:source, x1:dest, x2:points, x3:count +//x4: xMax, x5: yMax, x6:yStride + +movi v19.4s, #0 + +ld1 {v0.2s, v1.2s}, [x2] +//L4: +//cmp x3, #4 +//blt L1 +//dup v16.4s, w4 +//dup v17.4s, w5 +//movi v3.2s, #4 +//scvtf v3.2s, v3.2s +//fmul v3.2s, v3.2s, v1.2s +//dup v25.4s, v3.s[0] +//dup v26.4s, v3.s[1] +// +//fadd v2.2s, v0.2s, v1.2s +//mov v4.s[0], v0.s[0] +//fadd v3.2s, v2.2s, v1.2s +//mov v5.s[0], v0.s[1] +//mov v4.s[1], v2.s[0] +//mov v5.s[1], v2.s[1] +//mov v4.s[2], v3.s[0] +//fadd v2.2s, v3.2s, v1.2s +//mov v5.s[2], v3.s[1] +//mov v4.s[3], v2.s[0] +//mov v5.s[3], v2.s[1] +// +//dup v23.4s, w6 +//movi v24.4s, #4 +//dup v22.2d, x0 +// +//L4Loop: +//fcvtns v6.4s, v4.4s +//fcvtns v7.4s, v5.4s +// +//smin v6.4s, v6.4s, v16.4s +//smin v7.4s, v7.4s, v17.4s +//smax v6.4s, v6.4s, v19.4s +//smax v7.4s, v7.4s, v19.4s +// +//mul v7.4s, v7.4s, v23.4s +//mla v7.4s, v6.4s, v24.4s +//uxtl v6.2d, v7.2s +//uxtl2 v7.2d, v7.4s +//add v6.2d, v6.2d, v22.2d +//add v7.2d, v7.2d, v22.2d +// +//mov x12, v6.d[0] +//mov x13, v6.d[1] +//ld1 {v3.s}[0], [x12] +//mov x12, v7.d[0] +//ld1 {v3.s}[1], [x13] +//fadd v5.4s, v26.4s, v5.4s +//mov x13, v7.d[1] +//ld1 {v3.s}[2], [x12] +//fadd v4.4s, v25.4s, v4.4s +//ld1 {v3.s}[3], [x13] +// +//st1 {v3.4s}, [x1], #16 +// +// +//sub x3, x3, #4 +//cmp x3, #4 +//bge L4Loop +// +//mov v0.s[0], v4.s[0] +//mov v0.s[1], v5.s[0] + + +L1: +cmp x3, #0 +beq End +mov v16.s[0], w4 +mov v16.s[1], w5 // v16:[xMax, yMax] +mov w12, #3 +mov v7.s[0], w12 // bpp=4 +mov v7.s[1], w6 // yStride +dup v20.2d, x0 + +L1Loop: + +fcvtzs v2.2s, v0.2s // [x0, y0] +frintm v4.2s, v0.2s +smax v2.2s, v2.2s, v19.2s // max(0, y) +fcvtps v3.2s, v0.2s // [x1, y1] +fabd v4.2s, v0.2s, v4.2s // (xF, yF) +smax v3.2s, v3.2s, v19.2s +smin v2.2s, v2.2s, v16.2s +smin v3.2s, v3.2s, v16.2s +mul v2.2s, v2.2s, v7.2s // [bpp * x0, y0 * yStride] +mul v3.2s, v3.2s, v7.2s // [bpp * x1, y1 * yStride] +mov v2.s[2], v3.s[0] // v2: [bpp*x0, y0*yStride, bpp*x1, y0*yStride] +mov v3.s[2], v2.s[0] // v3: [bpp*x1, y1*yStride, bpp*x0, y1*yStride] +mov v2.s[3], v2.s[1] +mov v3.s[3], v3.s[1] + +uaddlp v2.2d, v2.4s // [c00, c01] +uaddlp v3.2d, v3.4s // [c11, c10] + +add v2.2d, v20.2d, v2.2d +add v3.2d, v20.2d, v3.2d +mov x4, v2.d[0] +mov x5, v2.d[1] +ld1 {v5.h}[0], [x4], #2 +ld1 {v5.b}[2], [x4] +ld1 {v5.h}[2], [x5], #2 +ld1 {v5.b}[6], [x5] +mov x4, v3.d[0] +uxtl v5.8h, v5.8b +mov x5, v3.d[1] +ld1 {v6.h}[0], [x4], #2 +ld1 {v6.b}[2], [x4] +ld1 {v6.h}[2], [x5], #2 +ld1 {v6.b}[6], [x5] +uxtl v6.8h, v6.8b +//Now v2, v3 is of no use + +//v2: LT, v3: RT, v5: LB, v6:BT +uxtl v2.4s, v5.4h // c00 +uxtl2 v3.4s, v5.8h // c01 + +ucvtf v2.4s, v2.4s +uxtl v5.4s, v6.4h // c11 +ucvtf v3.4s, v3.4s +uxtl2 v6.4s, v6.8h // c10 +ucvtf v5.4s, v5.4s +ucvtf v6.4s, v6.4s + +fsub v3.4s, v3.4s, v2.4s +fsub v5.4s, v5.4s, v6.4s +fmla v2.4s, v3.4s, v4.s[0] // (c01-c00)*xF+c00 +fmla v6.4s, v5.4s, v4.s[0] // (c11-c10)*xF+c10 + +fsub v6.4s, v6.4s, v2.4s +fmla v2.4s, v6.4s, v4.s[1] + +fcvtzs v2.4s, v2.4s +uqxtn v2.4h, v2.4s +uqxtn v2.8b, v2.8h + +fadd v0.2s, v0.2s, v1.2s +subs x3, x3, #1 +st1 {v2.h}[0], [x1], #2 +st1 {v2.b}[0], [x1], #1 + + +bne L1Loop + +End: + +ret +#endif diff --git 
a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index fa8258b66..90ad5673b 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -129,24 +129,17 @@ beq L2Dz cmp x8, #1 beq L1Dz -//cmp w13, #1 -//bne L4LoopDz -//sub x4, x4, #8 // post->scale != nullptr && post->useInt8 == 1. L4LoopDz: mov x8, x1 mov x22, x2 - ld1 {v0.16b, v1.16b}, [x2], #32 // weight + ld1 {v10.16b, v11.16b}, [x2], #32 // weight ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64 // src // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b @@ -207,17 +200,13 @@ L4LoopDz: L4LoopSz: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64 - ld1 {v0.16b, v1.16b}, [x2], #32 + ld1 {v10.16b, v11.16b}, [x2], #32 // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b @@ -355,19 +344,15 @@ sub x4, x4, #8 L3LoopDz: mov x8, x1 mov x22, x2 - ld1 {v0.16b, v1.16b}, [x2], #32 + ld1 {v10.16b, v11.16b}, [x2], #32 ld1 {v4.16b, v5.16b, v6.16b}, [x1], #48 add x1, x1, #16 // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b @@ -418,17 +403,13 @@ L3LoopDz: L3LoopSz: ld1 {v4.16b, v5.16b, v6.16b}, [x1], #48 - ld1 {v0.16b, v1.16b}, [x2], #32 + ld1 {v10.16b, v11.16b}, [x2], #32 // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b @@ -548,20 +529,15 @@ L2Dz: L2LoopDz: mov x8, x1 mov x22, x2 - ld1 {v0.16b, v1.16b}, [x2], #32 + ld1 {v10.16b, v11.16b}, [x2], #32 ld1 {v4.16b, v5.16b}, [x1], #32 // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b - - + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b + smull v8.8h, v0.8b, v4.8b smull v9.8h, 
v1.8b, v4.8b smull v10.8h, v2.8b, v4.8b @@ -595,17 +571,13 @@ L2LoopDz: L2LoopSz: ld1 {v4.16b, v5.16b}, [x1], #32 - ld1 {v0.16b, v1.16b}, [x2], #32 + ld1 {v10.16b, v11.16b}, [x2], #32 // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b @@ -699,17 +671,14 @@ L1Dz: L1LoopDz: mov x8, x1 mov x22, x2 - ld1 {v0.16b, v1.16b}, [x2], #32 + ld1 {v10.16b, v11.16b}, [x2], #32 // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b + dup v16.4s, wzr dup v17.4s, wzr ld1 {v4.16b}, [x1], #16 @@ -739,19 +708,14 @@ L1LoopDz: sadalp v22.4s, v14.8h sadalp v23.4s, v15.8h - ld1 {v0.16b, v1.16b}, [x2], #32 + ld1 {v10.16b, v11.16b}, [x2], #32 add x1, x1, #48 // int4->int8 movi v8.16b, #15 - ushr v10.16b, v0.16b, #4 - and v11.16b, v0.16b, v8.16b - ushr v12.16b, v1.16b, #4 - and v13.16b, v1.16b, v8.16b - zip1 v0.16b, v10.16b, v11.16b - zip2 v1.16b, v10.16b, v11.16b - zip1 v2.16b, v12.16b, v13.16b - zip2 v3.16b, v12.16b, v13.16b - + ushr v0.16b, v10.16b, #4 + and v2.16b, v10.16b, v8.16b + ushr v1.16b, v11.16b, #4 + and v3.16b, v11.16b, v8.16b smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b smull v10.8h, v2.8b, v4.8b diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S index fa9bc1f43..49b9567cc 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S @@ -126,13 +126,13 @@ stp x23, x24, [sp, #(16 * 8)] ldr x27, [x6, #64] // blockNum mul x27, x27, x3 // blockNum * src_depth_quad_perblock -lsl x15, x27, #3 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) +lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) ldr x25, [x6, #40] // xKernelSum ldr x26, [x6, #48] // weightQuantBias ldr x24, [x6, #80] // extraScale -mov x21, #16 // sizeof(float) * UNIT +mov x21, #16 // sizeof(float) * pack ldr x23, [x6, #56] // fp32minmax Start: mov x22, #48 // src_steps @@ -158,13 +158,11 @@ L8LoopDz_TILE_12: SET_BIAS v28, v29, v30, v31 L8LoopSz_TILE_12: - ld1 {v3.d}[0], [x2], x15 // weight - ld1 {v4.d}[0], [x2], #8 + ld1 {v5.16b}, [x2], #16 // weight ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 + and v4.16b, v5.16b, v7.16b .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] @@ -175,21 +173,16 @@ L8LoopDz_TILE_12: .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3] - // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b .inst 0x4f82e070 // sdot 
v16.4s, v3.16b, v2.4b[0] .inst 0x4fa2e071 // sdot v17.4s, v3.16b, v2.4b[1] .inst 0x4f82e872 // sdot v18.4s, v3.16b, v2.4b[2] .inst 0x4fa2e873 // sdot v19.4s, v3.16b, v2.4b[3] - .inst 0x4f80e094 // sdot v20.4s, v4.16b, v0.4b[0] .inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1] .inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2] .inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3] - sub x2, x2, x15 + .inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0] .inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1] .inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2] @@ -202,8 +195,7 @@ L8LoopDz_TILE_12: bne L8LoopSz_TILE_12 L8LoopSzEnd_TILE_12: - // add x2, x2, x15 - add x2, x27, x15, LSL #1 + add x2, x27, x15 sub x5, x5, #2 L8Tile12Quan: @@ -313,7 +305,7 @@ L8LoopDz_TILE_12: L8Tile12LoopCheck: cmp x5, #1 bgt L8LoopDz_TILE_12 - blt End + cbz x5, End L4LoopDz_TILE_12: SET_BIAS v8, v9, v10, v11 @@ -322,12 +314,10 @@ L4LoopDz_TILE_12: movi v7.16b, #15 L4LoopSz_TILE_12: - ld1 {v3.d}[0], [x2], #8 // weight + ld1 {v5.16b}, [x2], #16 // weight ld1 {v0.16b, v1.16b, v2.16b}, [x1], #48 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] @@ -437,27 +427,22 @@ L8LoopDz_TILE_8: SET_BIAS v20, v21, v22, v23 L8LoopSz_TILE_8: - ld1 {v3.d}[0], [x12], x15 // weight - ld1 {v4.d}[0], [x12], #8 + ld1 {v5.16b}, [x12], #16 // weight ld1 {v0.16b, v1.16b}, [x11], x22 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 + and v4.16b, v5.16b, v7.16b .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b + .inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0] .inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1] .inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2] .inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3] - sub x12, x12, x15 + .inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0] .inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1] .inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2] @@ -471,7 +456,7 @@ L8LoopDz_TILE_8: L8LoopSzEnd_TILE_8: //add x12, x12, x15 - add x12, x27, x15, LSL #1 + add x12, x27, x15 sub x14, x14, #2 L8Tile8Quan: @@ -567,12 +552,10 @@ L4LoopDz_TILE_8: SET_BIAS v12, v13, v14, v15 L4LoopSz_TILE_8: - ld1 {v3.d}[0], [x12], #8 // weight + ld1 {v5.16b}, [x12], #16 // weight ld1 {v0.16b, v1.16b}, [x11], x22 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] @@ -652,7 +635,7 @@ Tile8_End_Offset: TILE_4: cmp x7, #4 - blt TILE_1 + blt TILE_1_Init mov x10, x0 mov x12, x2 mov x14, x5 @@ -672,24 +655,18 @@ L8LoopDz_TILE_4: SET_BIAS v12, v13, v14, v15 L8LoopSz_TILE_4: - ld1 {v3.d}[0], [x12], x15 // weight + ld1 {v5.16b}, [x12], #16 // weight ld1 {v0.16b}, [x11], x22 // src - ld1 {v4.d}[0], [x12], #8 // weight // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 + and v4.16b, v5.16b, v7.16b .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] .inst 0x4f80e86a // sdot 
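The weight loads in these sdot hunks switch from two 8-byte halves (the second half x15 bytes away, hence the old sub/add of x15 inside the loop) to one contiguous 16-byte vector per src-depth step, and the prologue stride becomes lsl #4 to match its own comment. A small bookkeeping sketch, assuming the sdot tile sizes UNIT=8 and SRC_UNIT=4 set by MNNGetGemmUnitSdot later in this patch:

    #include <cstddef>

    // Bytes of int4 weight per src_depth_quad step, and the block stride kept
    // in x15 above: src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t).
    static size_t int4WeightBlockStride(size_t srcDepthQuad) {
        const size_t UNIT = 8, SRC_UNIT = 4;             // sdot micro-tile
        const size_t bytesPerStep = UNIT * SRC_UNIT / 2; // int4: half a byte
        return srcDepthQuad * bytesPerStep;              // == srcDepthQuad << 4
    }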
v10.4s, v3.16b, v0.4b[2] .inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3] - // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b + subs x13, x13, #1 - sub x12, x12, x15 .inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0] .inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1] .inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2] @@ -698,7 +675,7 @@ L8LoopDz_TILE_4: L8LoopSzEnd_TILE_4: //add x12, x12, x15 - add x12, x27, x15, LSL #1 + add x12, x27, x15 sub x14, x14, #2 L8Tile4Quan: @@ -764,12 +741,10 @@ L4LoopDz_TILE_4: SET_BIAS v8, v9, v10, v11 L4LoopSz_TILE_4: - ld1 {v3.d}[0], [x12], #8 // weight + ld1 {v5.16b}, [x12], #16 // weight ld1 {v0.16b}, [x11], x22 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 subs x13, x13, #1 .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] .inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1] @@ -826,9 +801,14 @@ Tile4_End_Offset: add x1, x1, #16 add x25, x25, #16 -TILE_1: +TILE_1_Init: cbz x7, End movi v7.16b, #15 + cbz x23, TILE_1 + ld1r {v26.4s}, [x23], #4 // f32 min + ld1r {v27.4s}, [x23] // f32 max + sub x23, x23, #4 +TILE_1: mov x10, x0 mov x12, x2 mov x14, x5 @@ -845,28 +825,64 @@ L8LoopDz_TILE_1: movi v8.16b, #0 movi v9.16b, #0 - L8LoopSz_TILE_1: - ld1 {v3.d}[0], [x12], x15 // weight + cmp x13, #4 + blt L8LoopSz_TILE_1_lu1 + //lsl x22, x22, #2 + + L8LoopSz_TILE_1_lu4: + ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x12], #64 // weight: hu=0,1,2,3,pack=0~7 ld1 {v0.s}[0], [x11], x22 // src - ld1 {v4.d}[0], [x12], #8 // weight + ld1 {v0.s}[1], [x11], x22 + ld1 {v0.s}[2], [x11], x22 + ld1 {v0.s}[3], [x11], x22 + + sub x13, x13, #4 // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v12.16b, v3.16b, #4 + and v22.16b, v3.16b, v7.16b - .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] + ushr v15.16b, v4.16b, #4 + and v23.16b, v4.16b, v7.16b + + ushr v18.16b, v5.16b, #4 + and v24.16b, v5.16b, v7.16b + + ushr v21.16b, v6.16b, #4 + and v25.16b, v6.16b, v7.16b + + cmp x13, #4 + .inst 0x4f80e188 // sdot v8.4s, v12.16b, v0.4b[0] + .inst 0x4f80e2c9 // sdot v9.4s, v22.16b, v0.4b[0] + .inst 0x4fa0e1e8 // sdot v8.4s, v15.16b, v0.4b[1] + .inst 0x4fa0e2e9 // sdot v9.4s, v23.16b, v0.4b[1] + .inst 0x4f80ea48 // sdot v8.4s, v18.16b, v0.4b[2] + .inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2] + .inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3] + .inst 0x4fa0eb29 // sdot v9.4s, v25.16b, v0.4b[3] + bge L8LoopSz_TILE_1_lu4 + + cbz x13, L8LoopSzEnd_TILE_1 + + L8LoopSz_TILE_1_lu1: + ld1 {v5.16b}, [x12], #16 // weight + ld1 {v0.s}[0], [x11], x22 // src + //ld1 {v4.d}[0], [x12], #8 // weight subs x13, x13, #1 // int4->int8 - ushr v5.16b, v4.16b, #4 - and v6.16b, v4.16b, v7.16b - zip1 v4.16b, v5.16b, v6.16b - sub x12, x12, x15 + ushr v3.16b, v5.16b, #4 + and v12.16b, v5.16b, v7.16b + + //ushr v10.16b, v4.16b, #4 + //and v11.16b, v4.16b, v7.16b + //zip1 v12.16b, v10.16b, v11.16b - .inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0] - bne L8LoopSz_TILE_1 + //sub x12, x12, x15 + .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] + .inst 0x4f80e189 // sdot v9.4s, v12.16b, v0.4b[0] + bne L8LoopSz_TILE_1_lu1 L8LoopSzEnd_TILE_1: - add x12, x27, x15, LSL #1 + add x12, x27, x15 sub x14, x14, #2 L8Tile1Quan: @@ -903,9 +919,6 @@ L8LoopDz_TILE_1: TILE1_POST: cbz x23, TILE1_STORE - ld1r {v26.4s}, [x23], #4 // f32 min - ld1r {v27.4s}, [x23] // f32 max - sub x23, x23, #4 fmin v8.4s, v8.4s, v27.4s fmin v9.4s, v9.4s, v27.4s fmax v8.4s, v8.4s, 
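The new L8LoopSz_TILE_1_lu4 body above unrolls the single-column inner loop by four src-depth steps: one 64-byte weight load plus four strided src words feed eight sdot instructions before the next counter check, the original one-step body is kept as the _lu1 tail, and the fp32 min/max loads are hoisted out to TILE_1_Init. A scalar model of one unrolled step, where acc8 stands for v8 (fed by high-nibble weights) and acc9 for v9 (fed by low-nibble weights); the contiguous src indexing is a simplification of the strided loads.

    #include <cstdint>

    static void tile1Lu4Step(const uint8_t* w /* 64 bytes of int4 weight */,
                             const int8_t* src /* 4 words of 4 bytes */,
                             int32_t acc8[4], int32_t acc9[4]) {
        for (int step = 0; step < 4; ++step) {          // unroll factor
            for (int lane = 0; lane < 4; ++lane) {      // sdot output lanes
                for (int k = 0; k < 4; ++k) {
                    uint8_t b = w[16 * step + 4 * lane + k];
                    acc8[lane] += int8_t(b >> 4)   * src[4 * step + k];
                    acc9[lane] += int8_t(b & 0x0F) * src[4 * step + k];
                }
            }
        }
    }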
v26.4s @@ -926,12 +939,10 @@ L4LoopDz_TILE_1: mov x13, x3 movi v8.16b, #0 L4LoopSz_TILE_1: - ld1 {v3.d}[0], [x12], #8 // weight + ld1 {v5.16b}, [x12], #16 // weight ld1 {v0.s}[0], [x11], x22 // src // int4->int8 - ushr v5.16b, v3.16b, #4 - and v6.16b, v3.16b, v7.16b - zip1 v3.16b, v5.16b, v6.16b + ushr v3.16b, v5.16b, #4 subs x13, x13, #1 .inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0] @@ -965,9 +976,6 @@ L4LoopDz_TILE_1: TILE1_L4_POST: cbz x23, TILE1_L4_STORE - ld1r {v26.4s}, [x23], #4 // f32 min - ld1r {v27.4s}, [x23] // f32 max - sub x23, x23, #4 fmax v8.4s, v8.4s, v26.4s fmin v8.4s, v8.4s, v27.4s TILE1_L4_STORE: @@ -978,11 +986,11 @@ cbz x24, Tile1_End_Offset add x24, x24, #4 Tile1_End_Offset: - sub x7, x7, #1 + subs x7, x7, #1 add x0, x0, x21 add x1, x1, #4 add x25, x25, #4 - b TILE_1 + bne TILE_1 End: ldp x23, x24, [sp, #(16 * 8)] diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S index b4cc330c2..891196103 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S @@ -138,6 +138,7 @@ LoopDz8_TILE_10: mov x11, x1 // src mov x12, x2 // weight mov x13, x3 // src_depth_quad + movi v2.16b, #15 SET_0_5 v12, v16, v20, v24, v28 // oc:0,1,0,1 SET_0_5 v13, v17, v21, v25, v29 // oc:2,3,2,3 @@ -146,7 +147,6 @@ LoopDz8_TILE_10: LoopSz_TILE_10: ld1 {v0.16b, v1.16b}, [x12], #32 // weight - movi v2.16b, #15 ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x11], #64 // src: E0-E9 ld1 {v7.16b}, [x11], #16 // int4->int8 @@ -1066,7 +1066,64 @@ LoopDz_TILE_1: movi v17.4s, #0 movi v18.4s, #0 movi v19.4s, #0 -LoopSz_TILE_1: +cmp x13, #4 +blt LoopSz_TILE_1_lu1 + +LoopSz1_TILE_1_lu4: + ld1 {v5.16b, v6.16b, v7.16b, v8.16b}, [x12], #64 // weight + ld1 {v9.16b, v10.16b, v11.16b, v12.16b}, [x12], #64 + ld1 {v0.8b}, [x11], x22 // src + ld1 {v1.8b}, [x11], x22 + ld1 {v2.8b}, [x11], x22 + ld1 {v3.8b}, [x11], x22 + + // int4->int8 + ushr v4.16b, v5.16b, #4 + ushr v14.16b, v6.16b, #4 + and v13.16b, v5.16b, v28.16b + and v15.16b, v6.16b, v28.16b + + ushr v20.16b, v7.16b, #4 + ushr v21.16b, v8.16b, #4 + and v22.16b, v7.16b, v28.16b + and v23.16b, v8.16b, v28.16b + + ushr v24.16b, v9.16b, #4 + ushr v25.16b, v10.16b, #4 + and v26.16b, v9.16b, v28.16b + and v27.16b, v10.16b, v28.16b + + ushr v5.16b, v11.16b, #4 + ushr v6.16b, v12.16b, #4 + and v7.16b, v11.16b, v28.16b + and v8.16b, v12.16b, v28.16b + + sub x13, x13, #4 + + .inst 0x4e84a410 // smmla v16.4s, v0.16b, v4.16b + .inst 0x4e8ea411 // smmla v17.4s, v0.16b, v14.16b + .inst 0x4e8da412 // smmla v18.4s, v0.16b, v13.16b + .inst 0x4e8fa413 // smmla v19.4s, v0.16b, v15.16b + + .inst 0x4e94a430 // smmla v16.4s, v1.16b, v20.16b + .inst 0x4e95a431 // smmla v17.4s, v1.16b, v21.16b + .inst 0x4e96a432 // smmla v18.4s, v1.16b, v22.16b + .inst 0x4e97a433 // smmla v19.4s, v1.16b, v23.16b + cmp x13, #4 + .inst 0x4e98a450 // smmla v16.4s, v2.16b, v24.16b + .inst 0x4e99a451 // smmla v17.4s, v2.16b, v25.16b + .inst 0x4e9aa452 // smmla v18.4s, v2.16b, v26.16b + .inst 0x4e9ba453 // smmla v19.4s, v2.16b, v27.16b + + .inst 0x4e85a470 // smmla v16.4s, v3.16b, v5.16b + .inst 0x4e86a471 // smmla v17.4s, v3.16b, v6.16b + .inst 0x4e87a472 // smmla v18.4s, v3.16b, v7.16b + .inst 0x4e88a473 // smmla v19.4s, v3.16b, v8.16b + + bge LoopSz1_TILE_1_lu4 + cbz x13, LoopSzEnd_TILE_1 + +LoopSz_TILE_1_lu1: ld1 {v2.8b}, [x11], x22 // src // int4->int8 ld1 {v0.16b, 
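The ARMv8.6 kernel above gets the same unroll-by-4 treatment for its single-column tile, but built on smmla: each .inst line accumulates a 2x2 int32 block from a 2x8 int8 src tile and a nibble-unpacked weight tile. A scalar model of one such instruction, assuming the standard SMMLA semantics; it is only meant to make the accumulation pattern readable.

    #include <cstdint>

    // d (2x2, row-major across a 4-lane accumulator) += a(2x8) * b(2x8)^T
    static void smmla2x2(int32_t d[4], const int8_t a[16], const int8_t b[16]) {
        for (int r = 0; r < 2; ++r) {
            for (int c = 0; c < 2; ++c) {
                int32_t sum = d[2 * r + c];
                for (int k = 0; k < 8; ++k) {
                    sum += int32_t(a[8 * r + k]) * int32_t(b[8 * c + k]);
                }
                d[2 * r + c] = sum;
            }
        }
    }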
v1.16b}, [x12], #32 // weight @@ -1080,7 +1137,7 @@ LoopSz_TILE_1: .inst 0x4e89a451 // smmla v17.4s, v2.16b, v9.16b .inst 0x4e8aa452 // smmla v18.4s, v2.16b, v10.16b .inst 0x4e8ba453 // smmla v19.4s, v2.16b, v11.16b - bne LoopSz_TILE_1 + bne LoopSz_TILE_1_lu1 LoopSzEnd_TILE_1: add x25, x25, x15 sub x24, x24, #2 diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S b/source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMulRemain_int4.S similarity index 100% rename from source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int4.S rename to source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMulRemain_int4.S diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S b/source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMulRemain_int8.S similarity index 100% rename from source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMulRemain_int8.S rename to source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMulRemain_int8.S diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S b/source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMul_int4.S similarity index 100% rename from source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int4.S rename to source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMul_int4.S diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S b/source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMul_int8.S similarity index 100% rename from source/backend/cpu/arm/arm64/low_memory/MNNPackedMatMul_int8.S rename to source/backend/cpu/arm/arm64/normal_memory/MNNPackedMatMul_int8.S diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index d806e0cb9..df1b70970 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -35,8 +35,8 @@ void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count) { } #endif -#ifdef MNN_LOW_MEMORY #ifndef __aarch64__ +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM static void _MNNPackedMatMulRemain_int4(float* C, const float* A, const float* fB, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, int aStride, const float* k, const float* b) { auto B = reinterpret_cast(fB); auto h = parameter[2]; @@ -191,6 +191,9 @@ void MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, size_t auto aStride = parameter[0] / sizeof(float); _MNNPackedMatMulRemain_int8(C, A, B, eSize, parameter, postParameters, bias, aStride, k, b); } +#endif // MNN_CPU_WEIGHT_DEQUANT_GEMM + +#ifdef MNN_LOW_MEMORY void MNNAbsMaxFP32(const float* source, float* absmax, size_t src_depth_quad, size_t realSize, int pack) { // source: (ic/4, N, 4) auto srcStep = pack * realSize; @@ -261,8 +264,8 @@ void MNNDynamicUpdateConvBiasScale(float* newbias, float* newscale, float* oldbi } } -#endif // not __aarch64__ #endif // LOW_MEMORY +#endif // not __aarch64__ static void MNNSumByAxisLForMatmul_A(float* dest, int8_t* source, const float* scale, ssize_t realDstCount, SumByAxisParams sumParams) { @@ -3422,12 +3425,14 @@ void MNNCoreFunctionInit() { gCoreFunction->supportSDot = gCPUInfo.dot; gCoreFunction->supportI8mm = gCPUInfo.i8mm; gCoreFunction->MNNSumByAxisLForMatmul_A = MNNSumByAxisLForMatmul_A; -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM // Weight Dequant Gemm Kernels gCoreFunction->MNNPackedMatMul_int4 = MNNPackedMatMul_int4; gCoreFunction->MNNPackedMatMulRemain_int4 = MNNPackedMatMulRemain_int4; 
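In CommonOptFunction.cpp above, the CPU fallbacks are regrouped: the weight-dequant packed-matmul kernels now sit under MNN_CPU_WEIGHT_DEQUANT_GEMM while MNNAbsMaxFP32 and the other dynamic-quant helpers stay under MNN_LOW_MEMORY. For reference, a scalar sketch of what the abs-max helper computes over the (ic/4, N, pack) layout noted in its comment; the loop order is an assumption taken from srcStep = pack * realSize.

    #include <cmath>
    #include <cstddef>

    // absmax[i] = max |source[c][i][k]| over all channel quads c and lanes k.
    static void absMaxPackedSketch(const float* source, float* absmax,
                                   size_t srcDepthQuad, size_t realSize, int pack) {
        const size_t srcStep = size_t(pack) * realSize;
        for (size_t i = 0; i < realSize; ++i) {
            float maxVal = 0.f;
            for (size_t c = 0; c < srcDepthQuad; ++c) {
                for (int k = 0; k < pack; ++k) {
                    float v = std::fabs(source[c * srcStep + i * pack + k]);
                    if (v > maxVal) maxVal = v;
                }
            }
            absmax[i] = maxVal;
        }
    }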
gCoreFunction->MNNPackedMatMul_int8 = MNNPackedMatMul_int8; gCoreFunction->MNNPackedMatMulRemain_int8 = MNNPackedMatMulRemain_int8; +#endif +#ifdef MNN_LOW_MEMORY // Dynamic Quant Helper Functions gCoreFunction->MNNAbsMax = MNNAbsMaxFP32; gCoreFunction->MNNDynamicQuant = MNNDynamicQuantFP32; @@ -3470,10 +3475,11 @@ void MNNUnpackC2(double* dst, const double* src, size_t area, size_t depth, int* void MNNUnpackC2Float(float* dst, const float* src, size_t area, size_t depth, int* areaOffset, int pack) { MNNUnpackC2Common(dst, src, area, depth, areaOffset, pack); } - +#ifndef __aarch64__ void MNNPackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) { MNNPackC2Common(dst, src, area, depth, areaOffset); } +#endif void MNNUnpackInt8C2(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) { MNNUnpackC2Common(dst, src, area, depth, areaOffset); diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp index 25fb13a8f..46b1f0739 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp @@ -21,8 +21,10 @@ namespace MNN { ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Op* op): CPUConvolution(op->main_as_Convolution2D()->common(), backend) {} ConvInt8TiledExecutor::ConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res): CPUConvolution(op->main_as_Convolution2D()->common(), backend), mResourceInt8(res) { - mMutableResource.reset(new MutableResourceInt8(res, backend)); - mValid = mMutableResource->mValid; + if (!res->mDynamicQuant) { + mMutableResource.reset(new MutableResourceInt8(res, backend)); + mValid = mMutableResource->mValid; + } } ConvInt8TiledExecutor::~ConvInt8TiledExecutor() { @@ -34,7 +36,9 @@ bool ConvInt8TiledExecutor::onClone(Backend* bn, const Op* op, Execution** dst) } ErrorCode ConvInt8TiledExecutor::onResize(const std::vector& inputs, const std::vector& outputs) { - mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); + if (nullptr != mMutableResource) { + mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); + } CPUConvolution::onResize(inputs, outputs); ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, static_cast(backend())->functions(), static_cast(backend())->int8Functions()); return NO_ERROR; @@ -234,18 +238,17 @@ static void GetResourceInt8(std::shared_ptr resour } } -DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr quanCommon) : mDynamicQuantExe(true), ConvInt8TiledExecutor(backend, op) { +DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr quanCommon) : ConvInt8TiledExecutor(backend, op) { auto convOp = op->main_as_Convolution2D(); auto core = static_cast(backend)->int8Functions(); auto gcore = static_cast(backend)->functions(); mResourceInt8.reset(new CPUConvolution::ResourceInt8); + mResourceInt8->mDynamicQuant = true; GetResourceInt8(mResourceInt8, quanCommon, convOp, backend); - mMutableResource.reset(new MutableResourceInt8(mResourceInt8, backend)); // dynamic quant int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); int pack = gcore->pack; - bool needPermuteInt4weight = ((UNIT == 8 && SRC_UNIT == 8 && DST_XUNIT ==10) || (UNIT == 64 
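Net effect of the MNNCoreFunctionInit hunks above, condensed for readability (only names that appear in the patch; the AVX2 registration later in this patch is split the same way):

    #ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM   // weight-dequant GEMM kernels
        gCoreFunction->MNNPackedMatMul_int4       = MNNPackedMatMul_int4;
        gCoreFunction->MNNPackedMatMulRemain_int4 = MNNPackedMatMulRemain_int4;
        gCoreFunction->MNNPackedMatMul_int8       = MNNPackedMatMul_int8;
        gCoreFunction->MNNPackedMatMulRemain_int8 = MNNPackedMatMulRemain_int8;
    #endif
    #ifdef MNN_LOW_MEMORY                // dynamic-quant helpers
        gCoreFunction->MNNAbsMax = MNNAbsMaxFP32;
        // ... remaining dynamic-quant helpers stay in this branch
    #endif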
&& SRC_UNIT == 4 && DST_XUNIT ==4)); auto weightLength = quanCommon->weight.size(); int kernelCount = mCommon->kernelX() * mCommon->kernelY(); int oc = convOp->common()->outputCount(); @@ -264,9 +267,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O std::vector shape; if (SRC_UNIT > pack) { MNN_ASSERT(SRC_UNIT % pack == 0); - shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT}; + shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT * SRC_UNIT / 2}; } else { - shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT, SRC_UNIT}; + shape = {UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount, UNIT * SRC_UNIT / 2}; } mResourceInt8->mWeightInt8.reset(Tensor::createDevice(shape)); @@ -280,32 +283,30 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O ::memset(dstPtr, 0, mResourceInt8->mWeightInt8->size()); // Pack two int4-weight to one int8-weight. - if (false == needPermuteInt4weight) { - for (int i = 0; i < hU; i++) { - for (int j = 0; j < lU; j++) { - for (int k = 0; k < hP; k++) { - for (int id = 0; id < lP / 2; ++id) { - dstPtr[(i * lU * lP * hP + j * hP * lP + k * lP) / 2 + id] = srcPtr[((i * hP + k) * lP * lU + (j * lP)) / 2 + id]; - } - } + int cnt = lP * hP / 4; + int L = lU * lP; + for (int i = 0; i < hU; ++i) { + for (int j = 0; j < lU; ++j) { + for (int k = 0; k < cnt; ++k) { + int dstIndx0 = (i * lU * lP * hP + j * lP * hP) / 2 + (2 * k); + + int hpId0 = (2 * k + 1) / lP; + int lpId0 = (2 * k) % lP; + int hpId1 = (2 * (k + cnt) + 1) / lP; + int lpId1 = (2 * (k + cnt)) % lP; + int srcIndx0 = ((i * hP + hpId0) * L + (j * lP + lpId0)) / 2; + int srcIndx1 = ((i * hP + hpId1) * L + (j * lP + lpId1)) / 2; + int s0 = (srcPtr[srcIndx0] >> 4); + int s1 = (srcPtr[srcIndx0] & 15); + int s2 = (srcPtr[srcIndx1] >> 4); + int s3 = (srcPtr[srcIndx1] & 15); + int d0 = s0 * 16 + s2; + int d1 = s1 * 16 + s3; + + dstPtr[dstIndx0] = d0; + dstPtr[dstIndx0 + 1] = d1; } } - } else { - for (int i = 0; i < hU; i++) { - for (int j = 0; j < lU; j++) { - auto dst_ptr = dstPtr + (i * lU * lP * hP + j * hP * lP) / 2; - for (int k = 0; k < 16; k++) { - int col = k % 4; - int row = k / 4; - uint8_t s0 = srcPtr[((i * hP + row + 0) * lP * lU + j * lP) / 2 + col]; - uint8_t s1 = srcPtr[((i * hP + row + 4) * lP * lU + j * lP) / 2 + col]; - uint8_t d0 = (s0 & 0xf0) | (s1 >> 4); - uint8_t d1 = (s0 << 4) | (s1 & 0x0f); - dst_ptr[k * 2 + 0] = d0; - dst_ptr[k * 2 + 1] = d1; - } - } - } } } else { // std::shared_ptr srcWeight; @@ -331,27 +332,19 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O std::shared_ptr weightLow(Tensor::create({halflen})); auto dstint4Ptr = weightLow->host(); auto srcint4Ptr = mResourceInt8->mWeightInt8->host(); - if (false == needPermuteInt4weight) { - for (int i=0; i < halflen; ++i) { - int s0 = srcint4Ptr[2 * i + 0]; - int s1 = srcint4Ptr[2 * i + 1]; + int permuteUnit = UNIT * SRC_UNIT; + int halfPermuteStride = static_cast(permuteUnit / 2); + for (int i = 0; i < leng / permuteUnit; ++i) { + auto src0 = srcint4Ptr + i * permuteUnit; + auto dst0 = dstint4Ptr + i * halfPermuteStride; + for (int j = 0; j < halfPermuteStride; ++j) { + int s0 = src0[j]; + int s1 = src0[j + halfPermuteStride]; int d = (s0 + 8) * 16 + (s1 + 8); - dstint4Ptr[i] = d; - } - } else { - int permuteUnit = UNIT * SRC_UNIT; - int halfPermuteStride = static_cast(permuteUnit / 2); - for (int i = 0; i < leng / permuteUnit; ++i) { - auto 
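The rewritten packing loop above is the counterpart of the kernel change at the top of this patch: instead of leaving each weight's high and low nibble in the same byte, it pairs the high nibbles of two weights (tile positions k and k + cnt) into one byte and their low nibbles into the next, which is what lets the GEMM kernels split a loaded vector with a single ushr/and and no zip. Minimal sketch of the per-pair rewrite, with the hU/lU/hP/lP index arithmetic omitted:

    #include <cstdint>

    // (aHi|aLo), (bHi|bLo)  ->  (aHi|bHi), (aLo|bLo)
    static inline void regroupInt4Pair(uint8_t a, uint8_t b, uint8_t out[2]) {
        out[0] = uint8_t((a >> 4) * 16 + (b >> 4));   // d0 = s0 * 16 + s2
        out[1] = uint8_t((a & 15) * 16 + (b & 15));   // d1 = s1 * 16 + s3
    }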
src0 = srcint4Ptr + i * permuteUnit; - auto dst0 = dstint4Ptr + i * halfPermuteStride; - for (int j = 0; j < halfPermuteStride; ++j) { - int s0 = src0[j]; - int s1 = src0[j + halfPermuteStride]; - int d = (s0 + 8) * 16 + (s1 + 8); - dst0[j] = d; - } + dst0[j] = d; } } + // Update int4 weight to mWeightInt8. mResourceInt8->mWeightInt8 = weightLow; } else { @@ -372,8 +365,68 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O gcore->MNNFp32ToLowp(mResourceInt8->mReluThreshold.data(), reinterpret_cast(mResourceInt8->mReluThreshold.data()), 2); } } +static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, std::shared_ptr resourceInt8) { + /* Used to compute weight quant scale and bias and weightKernelSum of type float. */ + bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr); + MNN_ASSERT(quanBuffer || resourceInt8); + auto core = static_cast(backend)->functions(); + // common parameters + int outputCount = conv2d->common()->outputCount(); + int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY(); + int ocUp4 = ROUND_UP(outputCount, core->pack); + int8_t* weightOrigin; -DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res) : mDynamicQuantExe(false), ConvInt8TiledExecutor(backend, op, res) { + // Save weight quant scale and bias: wf=scale*wi+bias + std::shared_ptr scaleBias(Tensor::createDevice({2 * ocUp4 * core->bytes})); + auto success = backend->onAcquireBuffer(scaleBias.get(), Backend::STATIC); + if (!success) { + MNN_ERROR("Alloc dequant scaleBias memory error\n"); + return; + } + auto alphaPtr = scaleBias->host(); + auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + ocUp4 * core->bytes); + ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes); + + // Load quant scale and bias + weightOrigin = resourceInt8->mWeightInt8->host(); + auto wZero = resourceInt8->mWeightQuantZero->host(); // has packed to outputUp4 + auto wScale = resourceInt8->mOriginScale->host(); + int h = ocUp4; + if (core->bytes == 2) { + std::unique_ptr tmp(new int16_t[h]); + core->MNNFp32ToLowp(wScale, tmp.get(), h); + for (int i=0; i< h; ++i) { + reinterpret_cast(alphaPtr)[i] = tmp[i]; + reinterpret_cast(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i]; + } + } else { + for (int i=0; i< h; ++i) { + alphaPtr[i] = wScale[i]; + biasPtr[i] = (-1.f) * wZero[i] * wScale[i]; + } + } + resourceInt8->mOriginScale = scaleBias; + + // Compute float weightKernelSum + resourceInt8->mWeightKernelSum.reset(Tensor::createDevice({ocUp4 * 4})); + success = backend->onAcquireBuffer(resourceInt8->mWeightKernelSum.get(), Backend::STATIC); + if (!success) { + MNN_ERROR("Alloc dequant mWeightKernelSum memory error\n"); + return; + } + auto weightKernelSum = resourceInt8->mWeightKernelSum->host(); + for (int i = 0; i < outputCount; ++i) { + int sum = 0; + for (int j = 0; j < LSize; ++j) { + sum = sum + static_cast(weightOrigin[j + i * LSize]); + } + auto scale = alphaPtr[i]; + auto bias = biasPtr[i]; + weightKernelSum[i] = static_cast(sum) * scale + LSize * bias; + } +} + +DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res) : ConvInt8TiledExecutor(backend, op, res) { std::shared_ptr weightOrigin = mResourceInt8->mWeightInt8; auto convOp = op->main_as_Convolution2D(); mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8); @@ -393,11 +446,11 @@ 
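In the int4-lowering branch that ends above, each pair of signed int8 weights (values expected in [-8, 7]) is biased by +8 into [0, 15] and stored two per byte as d = (s0 + 8) * 16 + (s1 + 8); the offset keeps the stored nibbles unsigned and is presumably compensated downstream through the weight quantization zero/bias terms. One-line sketch of the packing:

    #include <cstdint>

    static inline uint8_t packSignedInt4(int s0, int s1) {
        return uint8_t((s0 + 8) * 16 + (s1 + 8));   // both inputs in [-8, 7]
    }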
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O mGemmKernel = core->Int8GemmKernelFast; } #endif - CPUConvolution::makeResourceNew(backend, convOp, mResourceInt8); + _computeAlphaScale(backend, convOp, mResourceInt8); } DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, const DenseConvInt8TiledExecutor& exe) - : ConvInt8TiledExecutor(backend, op, exe.mResourceInt8), mGemmKernel(exe.mGemmKernel), mDynamicQuantExe(exe.mDynamicQuantExe) { + : ConvInt8TiledExecutor(backend, op, exe.mResourceInt8), mGemmKernel(exe.mGemmKernel) { } DenseConvInt8TiledExecutor::~DenseConvInt8TiledExecutor() { @@ -427,14 +480,14 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input && outputs[0]->width() == inputs[0]->width() && outputs[0]->height() == inputs[0]->height() && mCommon->strideX() == 1 && mCommon->strideY() == 1 && mCommon->padX() == 0 && mCommon->padY() == 0 && outputs[0]->height() == 1 && outputs[0]->width() == 1; - mUseBatchQuan &= mDynamicQuantExe; + mUseBatchQuan &= mResourceInt8->mDynamicQuant; mUseBatchQuan &= (inputs[0]->batch() > 1); auto core = static_cast(backend())->int8Functions(); auto gcore =static_cast(backend())->functions(); int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - if (mDynamicQuantExe == false) { + if (mResourceInt8->mDynamicQuant == false) { mMutableResource->updateInputOutputScale(TensorUtils::getQuantInfo(inputs[0]), TensorUtils::getQuantInfo(outputs[0])); CPUConvolution::onResize(inputs, outputs); ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core); @@ -537,7 +590,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input if (!success || mBlitInfo.invalid()) { return OUT_OF_MEMORY; } - if (false == mDynamicQuantExe) { + if (false == mResourceInt8->mDynamicQuant) { bufferAlloc->free(mBlitInfo); backend()->onReleaseBuffer(mInputDeqScales.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(mTempIm2ColBuffer.get(), Backend::DYNAMIC); @@ -591,9 +644,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu int UNIT__, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT__, &SRC_UNIT, &DST_XUNIT); auto blitProc = core->MNNPackC4Int8ForMatMul_A; - if ( mDynamicQuantExe && gcore->bytes == 2 && core->MNNPackC4Int8ForMatMul_A_ARM86FP16) { - blitProc = core->MNNPackC4Int8ForMatMul_A_ARM86FP16; - } const int plane = output->batch() * mIm2ColParamter.oh * mIm2ColParamter.ow; const int batch = input->batch(); const int PackUnit = gcore->pack; @@ -618,12 +668,16 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu auto weightDequantBias = mResourceInt8->mOriginScale->host() + alphaSize * 4; auto outputDataPtr = output->host(); - auto biasPtr = mMutableResource->mBiasFloat->host(); - auto scalePtr = mMutableResource->mScaleFloat->host(); - - auto inputZeroPoint = mMutableResource->mInputZeroPoint; + uint8_t* biasPtr = nullptr; + uint8_t* scalePtr = nullptr; + int32_t inputZeroPoint = 0; auto inputScalePtr = mInputDeqScales->host(); - (reinterpret_cast(inputScalePtr))[0] = mMutableResource->mInputScale; + if (nullptr != mMutableResource.get()) { + biasPtr = mMutableResource->mBiasFloat->host(); + scalePtr = mMutableResource->mScaleFloat->host(); + inputZeroPoint = mMutableResource->mInputZeroPoint; + (reinterpret_cast(inputScalePtr))[0] = mMutableResource->mInputScale; + } auto SingleDynamicQuant = [&] () { const auto floatptr = input->host(); 
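_computeAlphaScale, now called from the constructor above, stores the per-channel dequant affine wf = scale * wi + bias with bias = -zero * scale, and then fills weightKernelSum from the identity sum_j(scale * wi[j] + bias) = scale * sum_j wi[j] + LSize * bias, which is exactly the expression in its final loop. Compact restatement of that last step:

    #include <cstdint>

    static float kernelSumFloat(const int8_t* wi, int LSize, float scale, float bias) {
        int sum = 0;
        for (int j = 0; j < LSize; ++j) {
            sum += wi[j];
        }
        return float(sum) * scale + float(LSize) * bias;   // sum of dequantized weights
    }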
@@ -631,7 +685,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu auto inputsize = static_cast(backend())->getTensorSize(inputs[0]); float quantscale = 0.f; float dequantscale = 0.f; - int zeropoint = 0; + float zeropoint = 0; /* Count max and min value to compute input scale and zeropoint */ auto maxMinValPtr = mTempMaxMinValueBuffer->host(); @@ -675,14 +729,14 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu float range = maxVal - minVal; quantscale = 255.0f / range; dequantscale = range / 255.0f; - zeropoint = static_cast(roundf(-minVal * 255.f / range) - 128.0f); + zeropoint = roundf(-minVal * 255.f / range) - 128.0f; std::vectorqsVec(PackUnit, quantscale); auto sizeDiv = UP_DIV(inputsize, PackUnit); int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih; if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4 - mQuantAndReorderFunc(floatptr, int8ptr, inputPlane, qsVec.data(), -128, 127, (ssize_t)zeropoint, UP_DIV(input->channel(), PackUnit), 4 * inputPlane); + mQuantAndReorderFunc(floatptr, int8ptr, inputPlane, &quantscale, -128, 127, &zeropoint, UP_DIV(input->channel(), PackUnit), 4 * inputPlane); } else { - mQuantFunc(floatptr, int8ptr, sizeDiv, qsVec.data(), -128, 127, (ssize_t)zeropoint); + mQuantFunc(floatptr, int8ptr, sizeDiv, &quantscale, -128, 127, &zeropoint, 0); } /* bias float */ @@ -691,7 +745,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu #else int offset = 0; #endif - auto biasfp32 = mMutableResource->mResource->mOriginBias->host(); + auto biasfp32 = mResourceInt8->mOriginBias->host(); auto weightDequantScale = mResourceInt8->mOriginScale->host(); float zerofp32 = (zeropoint + offset) * dequantscale; @@ -750,14 +804,14 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu inputZeroPoint = 0; inputScalePtr = (uint8_t*)dequantPtr; inputDataPtr = mQuantInput->host(); - biasPtr = mMutableResource->mResource->mOriginBias->host(); + biasPtr = mResourceInt8->mOriginBias->host(); scalePtr = mResourceInt8->mOriginScale->host(); }; ssize_t oneScale = 1; if (mUseBatchQuan) { BatchDynamicQuant(); oneScale = 0; - } else if (mDynamicQuantExe) { + } else if (mResourceInt8->mDynamicQuant) { SingleDynamicQuant(); } else { // offline quant. 
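The single-batch dynamic quantization above derives an asymmetric per-tensor mapping from the measured min/max: scale = 255/range for float to int8, its reciprocal for dequant, and a zero point (now kept as float so it can be passed by pointer to the quant kernels) placed so that minVal lands on -128. Collected into one helper for readability, with the ranges used in the hunk:

    #include <cmath>

    struct DynQuantParams {
        float quantScale;    // float -> int8
        float dequantScale;  // int8  -> float
        float zeroPoint;     // float-typed, added after scaling
    };

    static DynQuantParams computeDynQuantParams(float minVal, float maxVal) {
        const float range = maxVal - minVal;
        DynQuantParams p;
        p.quantScale   = 255.0f / range;
        p.dequantScale = range / 255.0f;
        p.zeroPoint    = std::round(-minVal * 255.0f / range) - 128.0f;
        return p;
    }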
diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp index c5fc5f4d3..bebeaa5c4 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp @@ -61,8 +61,8 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor { DenseConvInt8TiledExecutor(Backend* backend, const Op* op, const DenseConvInt8TiledExecutor& exe); decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel; - std::function mQuantFunc; - std::function mQuantAndReorderFunc = nullptr; + std::function mQuantFunc; + std::function mQuantAndReorderFunc = nullptr; std::function mSumByAxisLFunc; std::shared_ptr mQuantInput; std::shared_ptr mDynamicBias; @@ -76,7 +76,6 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor { int mThreadNums; int mBlockNum; int mOcPerThread; - bool mDynamicQuantExe; bool mSplitByOc; bool mUseBatchQuan; }; diff --git a/source/backend/cpu/compute/ConvInt8Winograd.cpp b/source/backend/cpu/compute/ConvInt8Winograd.cpp index 433b88812..a460c1db8 100644 --- a/source/backend/cpu/compute/ConvInt8Winograd.cpp +++ b/source/backend/cpu/compute/ConvInt8Winograd.cpp @@ -189,6 +189,17 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector &inputs, const core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); UNIT = gcore->pack; int pack = gcore->pack; + + mFusedBias.reset(Tensor::createDevice({ROUND_UP(outputs[0]->channel(), pack)})); + mValid &= backend()->onAcquireBuffer(mFusedBias.get(), Backend::STATIC); + if (!mValid) { + return OUT_OF_MEMORY; + } + auto fusedBiasPtr = mFusedBias->host(); + ::memset(fusedBiasPtr, 0, mFusedBias->size()); + for (int i = 0; i < outputs[0]->channel(); ++i) { + fusedBiasPtr[i] = mResource->mOriginBias->host()[i] / mResource->mOutputScale + static_cast(mResource->mOutputZeroPoint); + } auto input = mInputFloat.get(), output = outputs[0]; int batch = input->batch(), ic = input->channel(), oc = output->channel(); @@ -235,9 +246,10 @@ static void mergeAddBiasScaleQuantize(const std::vector& inputs, Tensor for (int i = 1; i < inputs.size(); ++i) { core->MNNMatrixAdd(mergeFloat, mergeFloat, inputs[i]->host(), plane * countC4, 0, 0, 0, 1); } - std::vector fakeScale(countC4 * pack, 1); - core->MNNScaleAndAddBias(mergeFloat, mergeFloat, quanParam->biasFloat, fakeScale.data(), plane, countC4); - coreInt8->MNNFloat2Int8(mergeFloat, output->host(), plane * countC4, quanParam->scale, quanParam->minValue, quanParam->maxValue, zeroPoint); + auto zeroPointPtr = quanParam->biasFloat; + for (int i = 0; i < countC4; ++i) { + coreInt8->MNNFloat2Int8(mergeFloat + i * plane * pack, output->host() + i * plane * pack, plane, quanParam->scale, quanParam->minValue, quanParam->maxValue, zeroPointPtr + i * pack, 2); + } } // AVX: 8 -> 16, arm32/64: 4 -> 16, AVX512: 16 -> 16, arm82: 4 -> 4 @@ -246,6 +258,10 @@ static void _reorderCommon(float* dst, const float* src, size_t area, size_t dep MNNPackC4((float*)dst, (const float*)src, area, depth, areaOffset); return; } + if (uFrom == 1 && uTo == 2) { + MNNPackInt8C2((float*)dst, (const float*)src, area, depth, areaOffset); + return; + } size_t srcOffset = areaOffset[0], dstOffset = areaOffset[1]; int z = 0; if (uFrom == 2 && uTo == 4) { @@ -318,10 +334,11 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector &inputs, const tmp_outputs.push_back(unit.output.get()); } QuanPostTreatParameters quanParam; - scale.assign(pack, 1.0 / outputQuant[0]); - quanParam.scale = scale.data(); + float outputdequantScale = 1.0 / 
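The new mFusedBias above folds the float bias and the output zero point into one per-channel offset, fused[i] = bias[i] / outputScale + outputZeroPoint. Because MNNFloat2Int8 now accepts a per-lane zero-point vector (the trailing 2 sets the zero-point-is-a-vector bit of quanParamVec), mergeAddBiasScaleQuantize feeds that offset straight into the quantize call and drops the old MNNScaleAndAddBias pass with its fake unit scale. Sketch of the fold:

    // fused[i] is later consumed as the per-channel zero point of MNNFloat2Int8.
    static void makeFusedBias(float* fused, const float* bias, int channels,
                              float outputScale, float outputZeroPoint) {
        for (int i = 0; i < channels; ++i) {
            fused[i] = bias[i] / outputScale + outputZeroPoint;
        }
    }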
mResource->mOutputScale; + quanParam.scale = &outputdequantScale; // For winograd Int8, will not treat origin bias to int32, use float directly - quanParam.biasFloat = mResource->mOriginBias->host(); + // quanParam.biasFloat = mResource->mOriginBias->host(); + quanParam.biasFloat = mFusedBias->host(); quanParam.maxValue = outputQuant[3]; if (mResource->mRelu) { quanParam.minValue = outputQuant[1]; @@ -501,6 +518,13 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector auto tFunction = [&](int tId) { auto _srcOrigin = mTempInputBuffer->host() + tId * mTempInputBuffer->stride(0); auto _dstOrigin = mTempOutputBuffer->host() + tId * mTempOutputBuffer->stride(0); + QuanPostTreatParameters quanParam; + quanParam.useInt8 = 0; + quanParam.srcKernelSum = xkernelSum.data(); + quanParam.weightQuanBias = wKernelSum.data(); + quanParam.fp32minmax = reluThred.data(); + quanParam.extraScale = nullptr; + for (int tIndex = (int)tId; tIndex < tileCount; tIndex += threadNumber) { int xIndex = (int)tIndex * DST_XUNIT; int xReamin = totalCount - xIndex; @@ -518,8 +542,8 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector auto _srcInt8Ptr = _srcOrigin + i * mTempInputBuffer->stride(1); auto scaleVec = mWinoResource->transInputScales->host() + i * pack; - int zeroPoint = mWinoResource->transInputZeroPoints[i]; - coreInt8->MNNFloat2Int8(buffer2 + i * DST_XUNIT * ic_4 * pack, (pack == SRC_UNIT ? _srcInt8Ptr: (int8_t*)buffer0), ic_4 * DST_XUNIT, scaleVec, -127, 127, zeroPoint); + float zeroPoint = static_cast(mWinoResource->transInputZeroPoints[i]); + coreInt8->MNNFloat2Int8(buffer2 + i * DST_XUNIT * ic_4 * pack, (pack == SRC_UNIT ? _srcInt8Ptr: (int8_t*)buffer0), ic_4 * DST_XUNIT, scaleVec, -127, 127, &zeroPoint, 0); if (pack != SRC_UNIT) { int areaOffset[] = {DST_XUNIT, DST_XUNIT}, byte = sizeof(float); _reorderCommon((float*)_srcInt8Ptr, buffer0, DST_XUNIT, UP_DIV(ic, byte), areaOffset, pack / byte, SRC_UNIT / byte); @@ -527,14 +551,12 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector auto _dstFloatPtr = _dstOrigin + i * dc_4 * xC * pack; auto _weightInt8Ptr = weight + i * mWinoResource->weight->stride(0); - QuanPostTreatParameters quanParam; + quanParam.biasFloat = (mWinoResource->offsets->host() + i * mWinoResource->offsets->stride(0)); - quanParam.useInt8 = 0; - quanParam.srcKernelSum = xkernelSum.data(); - quanParam.weightQuanBias = wKernelSum.data(); - quanParam.fp32minmax = reluThred.data(); quanParam.scale = mWinoResource->scales->host() + i * dc_4 * pack; quanParam.extraScale = nullptr; + quanParam.bias = nullptr; + quanParam.blockNum = 1; gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, _weightInt8Ptr, mTempInputBuffer->length(2), xC * pack * sizeof(float), dc_4, &quanParam, xC); } #ifndef MNN_WINO_TRANFORM_TEST_CLOSE diff --git a/source/backend/cpu/compute/ConvInt8Winograd.hpp b/source/backend/cpu/compute/ConvInt8Winograd.hpp index b876059fa..c3f2d58d5 100644 --- a/source/backend/cpu/compute/ConvInt8Winograd.hpp +++ b/source/backend/cpu/compute/ConvInt8Winograd.hpp @@ -36,6 +36,7 @@ class ConvInt8Winograd : public CPUConvolution { std::vector mUnits; std::shared_ptr mResource; std::shared_ptr mInputFloat; + std::shared_ptr mFusedBias; struct WinoResource { std::shared_ptr weight; diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp index 738d85826..d09a3f6fd 100644 --- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp +++ 
b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp @@ -82,10 +82,18 @@ Execution* ConvolutionFloatFactory::create(const std::vector& inputs, c return new ConvolutionTiledExecutorMultiInput(conv2d->common(), backend); } #ifdef MNN_LOW_MEMORY - bool lowMemory = static_cast(backend)->memoryMode() != BackendConfig::Memory_High && static_cast(backend)->functions()->MNNPackedMatMul_int8 != nullptr; + bool lowMemory = static_cast(backend)->memoryMode() == BackendConfig::Memory_Low; + if (static_cast(backend)->functions()->bytes == 2 && static_cast(backend)->int8Functions()->MNNGemmInt8AddBiasScale_Unit_FP16 == nullptr) { + // Fall back to fp32 + return nullptr; + } #else bool lowMemory = false; #endif + +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM + lowMemory = lowMemory || (static_cast(backend)->memoryMode() != BackendConfig::Memory_High); +#endif const float* originWeight = nullptr; const float* originBias = nullptr; int originWeightSize = 0; diff --git a/source/backend/cpu/compute/GemmInt8Executor.cpp b/source/backend/cpu/compute/GemmInt8Executor.cpp index a73afdba8..e314f9fcf 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.cpp +++ b/source/backend/cpu/compute/GemmInt8Executor.cpp @@ -13,11 +13,42 @@ #include "core/TensorUtils.hpp" namespace MNN { +static void _makeResource(Backend* backend, std::shared_ptr resource, const MNN::Op *op, std::shared_ptr resourceInt8) { + /* Used to compute weight quant scale and bias and weightKernelSum of type float. */ + auto conv2d = op->main_as_Convolution2D(); + bool quanBuffer = (conv2d->quanParameter() != nullptr && conv2d->quanParameter()->buffer() != nullptr); + MNN_ASSERT(quanBuffer || resourceInt8); + resource->backend = backend; + auto core = static_cast(backend)->functions(); + // common parameters + int outputCount = conv2d->common()->outputCount(); + int LSize = conv2d->common()->inputCount() * conv2d->common()->kernelX() * conv2d->common()->kernelY(); + int ocUp4 = ROUND_UP(outputCount, core->pack); + int8_t* weightOrigin; + + // Save weight quant scale and bias: wf=scale*wi+bias + resource->mDequantize.mScaleBias.reset(Tensor::createDevice({2 * ocUp4 * core->bytes})); + auto success = resource->backend->onAcquireBuffer(resource->mDequantize.mScaleBias.get(), Backend::STATIC); + if (!success) { + MNN_ERROR("Alloc denquant scaleBias memory error\n"); + return; + } + auto alphaPtr = resource->mDequantize.mScaleBias->host(); + auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + ocUp4 * core->bytes); + ::memset(alphaPtr, 0, 2 * ocUp4 * core->bytes); + auto wZero = resourceInt8->mWeightQuantZero->host(); // has packed to outputUp4 + auto wScale = resourceInt8->mOriginScale->host(); + int h = ocUp4; + for (int i=0; i< h; ++i) { + alphaPtr[i] = wScale[i]; + biasPtr[i] = (-1.f) * wZero[i] * wScale[i]; + } +} GemmInt8Executor::GemmInt8Executor(Backend* bn, std::shared_ptr resource, const Op *op, decltype(CoreInt8Functions::Int8GemmKernel) gemmKernel, std::vector bias) : CPUConvolution(op->main_as_Convolution2D()->common(), bn), mResourceInt8(resource), mMutableResource(resource, bn), mGemmKernel(gemmKernel), mQuantBias(bias){ mResource.reset(new Resource); - CPUConvolution::makeResource(bn, mResource, op, mResourceInt8); + _makeResource(bn, mResource, op, mResourceInt8); } GemmInt8Executor::~GemmInt8Executor() { diff --git a/source/backend/cpu/compute/IdstConvolutionInt8.cpp b/source/backend/cpu/compute/IdstConvolutionInt8.cpp index 05a9df338..20ce94af3 100644 --- a/source/backend/cpu/compute/IdstConvolutionInt8.cpp +++ 
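The ConvolutionFloatFactory change above narrows when the low-memory convolution path is taken: under MNN_LOW_MEMORY alone it now requires an explicit Memory_Low hint (plus an added fp16 fallback check), while builds with MNN_CPU_WEIGHT_DEQUANT_GEMM continue to accept anything that is not Memory_High. The decision, pulled out of the hunk into a standalone sketch (the wrapper function itself is illustrative, not MNN API):

    #include <MNN/MNNForwardType.h>   // MNN::BackendConfig

    static bool useLowMemoryConv(MNN::BackendConfig::MemoryMode mode) {
    #ifdef MNN_LOW_MEMORY
        bool lowMemory = (mode == MNN::BackendConfig::Memory_Low);
    #else
        bool lowMemory = false;
    #endif
    #ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM
        lowMemory = lowMemory || (mode != MNN::BackendConfig::Memory_High);
    #endif
        return lowMemory;
    }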
b/source/backend/cpu/compute/IdstConvolutionInt8.cpp @@ -175,7 +175,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector& inputs, con mQuanScale, mQuanScale }; - int8_t zeroPoint = 0; + float zeroPoint = 0; std::vector fakeScale(ocC4 * PackUnit, 1.0f); QuanPostTreatParameters quanParam; @@ -199,7 +199,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector& inputs, con auto srcOrigin = input->host() + input->stride(0) * batchIndex; auto dstOrigin = output->host() + output->stride(0) * batchIndex; - MNNFloat2Int8(srcOrigin, srcCopy, inputTotalSize / 4, quantScale, mAMin, mAMax, zeroPoint); + MNNFloat2Int8(srcOrigin, srcCopy, inputTotalSize / 4, &mQuanScale, mAMin, mAMax, &zeroPoint, 0); int tileCount = UP_DIV(count, DST_XUNIT); threadNumber = std::max(((CPUBackend*)backend())->threadNumber(), 1); diff --git a/source/backend/cpu/compute/ImageProcessFunction.cpp b/source/backend/cpu/compute/ImageProcessFunction.cpp index d84d2f5e6..340b2386d 100644 --- a/source/backend/cpu/compute/ImageProcessFunction.cpp +++ b/source/backend/cpu/compute/ImageProcessFunction.cpp @@ -24,6 +24,23 @@ void MNNSamplerC4NearestOpt(const unsigned char* source, unsigned char* dest, fl void MNNSamplerC1NearestOpt(const unsigned char* source, unsigned char* dest, float* points, size_t count, size_t iw, size_t ih, size_t yStride); void MNNBlitC1ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); void MNNBlitC3ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); +void MNNRGBToBGRC8(const unsigned char* source, unsigned char* dest, size_t count); +void MNNBGRAToBGRC8(const unsigned char* source, unsigned char* dest, size_t count); +void MNNGRAYToC4Fast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNGRAYToC3Fast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNC3ToC4Fast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNBGRAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNRGBToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNRGBAToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNBGRToGRAYFast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNC3ToYUVFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c); +void MNNC3ToXYZFast(const unsigned char* source, unsigned char* dest, size_t count, int32_t* c); +void MNNRGBToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNBGRToBGR555Fast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNBGRToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNRGBToBGR565Fast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNRGBAToBGRAFast(const unsigned char* source, unsigned char* dest, size_t count); +void MNNRGBAToBGRFast(const unsigned char* source, unsigned char* dest, size_t count); } void MNNGRAYToC4(const unsigned char* source, unsigned char* dest, size_t count) { @@ -31,16 +48,7 @@ void MNNGRAYToC4(const unsigned char* source, unsigned char* dest, size_t count) #ifdef MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - for (int i = 0; i < countD8; ++i) { - auto gray = vld1_u8(source + 8 * i); - - uint8x8x4_t rgba; - rgba.val[0] = gray; - rgba.val[1] = gray; - rgba.val[2] = gray; - rgba.val[3] = vdup_n_u8(255); 
- vst4_u8(dest + 32 * i, rgba); - } + MNNGRAYToC4Fast(source, dest, countD8); sta = countD8 * 8; } #endif @@ -57,15 +65,7 @@ void MNNGRAYToC3(const unsigned char* source, unsigned char* dest, size_t count) #ifdef MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - for (int i = 0; i < countD8; ++i) { - auto gray = vld1_u8(source + 8 * i); - - uint8x8x3_t rgba; - rgba.val[0] = gray; - rgba.val[1] = gray; - rgba.val[2] = gray; - vst3_u8(dest + 24 * i, rgba); - } + MNNGRAYToC3Fast(source, dest, countD8); sta = countD8 * 8; } #endif @@ -81,16 +81,7 @@ void MNNC3ToC4(const unsigned char* source, unsigned char* dest, size_t count) { #ifdef MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - for (int i = 0; i < countD8; ++i) { - uint8x8x3_t c3 = vld3_u8(source + 24 * i); - - uint8x8x4_t c4; - c4.val[0] = c3.val[0]; - c4.val[1] = c3.val[1]; - c4.val[2] = c3.val[2]; - c4.val[3] = vdup_n_u8(255); - vst4_u8(dest + 32 * i, c4); - } + MNNC3ToC4Fast(source, dest, countD8); sta = countD8 * 8; } #endif @@ -105,15 +96,9 @@ void MNNC3ToC4(const unsigned char* source, unsigned char* dest, size_t count) { void MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count) { int sta = 0; #ifdef MNN_USE_NEON - int countD8 = (int)count / 8; + auto countD8 = count / 8; if (countD8 > 0) { - for (int i = 0; i < countD8; ++i) { - uint8x8x4_t rgba = vld4_u8(source + 32 * i); - auto t = rgba.val[0]; - rgba.val[0] = rgba.val[2]; - rgba.val[2] = t; - vst4_u8(dest + 32 * i, rgba); - } + MNNRGBAToBGRAFast(source, dest, countD8); sta = countD8 * 8; } #endif @@ -128,17 +113,9 @@ void MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t coun void MNNRGBAToBGR(const unsigned char* source, unsigned char* dest, size_t count) { int sta = 0; #ifdef MNN_USE_NEON - int countD8 = (int)count / 8; + auto countD8 = count / 8; if (countD8 > 0) { - for (int i = 0; i < countD8; ++i) { - uint8x8x4_t rgba = vld4_u8(source + 32 * i); - - uint8x8x3_t bgr; - bgr.val[0] = rgba.val[2]; - bgr.val[1] = rgba.val[1]; - bgr.val[2] = rgba.val[0]; - vst3_u8(dest + 24 * i, bgr); - } + MNNRGBAToBGRFast(source, dest, countD8); sta = countD8 * 8; } #endif @@ -152,18 +129,11 @@ void MNNRGBAToBGR(const unsigned char* source, unsigned char* dest, size_t count void MNNRGBToBGR(const unsigned char* source, unsigned char* dest, size_t count) { int sta = 0; #ifdef MNN_USE_NEON - int countD8 = (int)count / 8; - if (countD8 > 0) { - for (int i = 0; i < countD8; ++i) { - uint8x8x3_t rgba = vld3_u8(source + 24 * i); - uint8x8x3_t bgr; - bgr.val[0] = rgba.val[2]; - bgr.val[1] = rgba.val[1]; - bgr.val[2] = rgba.val[0]; - vst3_u8(dest + 24 * i, bgr); - } + int countD8 = (int)count / 8; + if (countD8 > 0) { + MNNRGBToBGRC8(source, dest, countD8); sta = countD8 * 8; - } + } #endif for (int i = sta; i < count; ++i) { dest[3 * i + 0] = source[3 * i + 2]; @@ -177,15 +147,7 @@ void MNNBGRAToBGR(const unsigned char* source, unsigned char* dest, size_t count #ifdef MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - for (int i = 0; i < countD8; ++i) { - uint8x8x4_t bgra = vld4_u8(source + 32 * i); - - uint8x8x3_t bgr; - bgr.val[0] = bgra.val[0]; - bgr.val[1] = bgra.val[1]; - bgr.val[2] = bgra.val[2]; - vst3_u8(dest + 24 * i, bgr); - } + MNNBGRAToBGRC8(source, dest, countD8); sta = countD8 * 8; } #endif @@ -198,23 +160,13 @@ void MNNBGRAToBGR(const unsigned char* source, unsigned char* dest, size_t count void MNNBGRAToGRAY(const unsigned char* source, unsigned char* dest, size_t count) { int sta = 0; - /* 
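Every converter in this file now follows the same shape: hand count/8 pixel blocks to the new assembly fast path declared at the top of the hunk, then finish the remainder with the existing scalar code. Sketch of that pattern for the GRAY to RGBA case; the scalar tail reproduces what the removed intrinsics did (replicate the gray value, alpha = 255), and MNNGRAYToC4Fast is assumed to be the extern "C" routine from the declarations above.

    #include <cstddef>
    #include <cstdint>

    extern "C" void MNNGRAYToC4Fast(const unsigned char* source, unsigned char* dest, size_t count);

    static void grayToC4(const unsigned char* source, unsigned char* dest, size_t count) {
        size_t sta = 0;
    #ifdef MNN_USE_NEON
        size_t countD8 = count / 8;
        if (countD8 > 0) {
            MNNGRAYToC4Fast(source, dest, countD8);   // handles countD8 * 8 pixels
            sta = countD8 * 8;
        }
    #endif
        for (size_t i = sta; i < count; ++i) {        // scalar tail
            dest[4 * i + 0] = source[i];
            dest[4 * i + 1] = source[i];
            dest[4 * i + 2] = source[i];
            dest[4 * i + 3] = 255;
        }
    }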
-#ifdef MNN_USE_NEON - int countD8 = (int)count / 8; - if (countD8 > 0) { - auto rC = vdup_n_u8(19); - auto gC = vdup_n_u8(38); - auto bC = vdup_n_u8(7); - for (int i = 0; i < countD8; ++i) { - auto rgb = vld4_u8(source + 32 * i); - auto res = vmull_u8(rC, rgb.val[2]) + vmull_u8(gC, rgb.val[1]) + vmull_u8(bC, rgb.val[0]); - auto resU8 = vshrn_n_u16(res, 6); - vst1_u8(dest + 8 * i, resU8); - } + #if defined MNN_USE_NEON + int countD8 = (int)count / 8; + if (countD8 > 0) { + MNNBGRAToGRAYFast(source, dest, countD8); sta = countD8 * 8; - } -#endif - */ + } + #endif for (int i = sta; i < count; ++i) { int r = source[4 * i + 2]; int g = source[4 * i + 1]; @@ -228,23 +180,14 @@ void MNNBGRAToGRAY(const unsigned char* source, unsigned char* dest, size_t coun void MNNRGBAToGRAY(const unsigned char* source, unsigned char* dest, size_t count) { int sta = 0; - /* -#ifdef MNN_USE_NEON + +#if defined MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - auto rC = vdup_n_u8(19); - auto gC = vdup_n_u8(38); - auto bC = vdup_n_u8(7); - for (int i = 0; i < countD8; ++i) { - auto rgb = vld4_u8(source + 32 * i); - auto res = vmull_u8(rC, rgb.val[0]) + vmull_u8(gC, rgb.val[1]) + vmull_u8(bC, rgb.val[2]); - auto resU8 = vshrn_n_u16(res, 6); - vst1_u8(dest + 8 * i, resU8); - } + MNNRGBAToGRAYFast(source, dest, countD8); sta = countD8 * 8; } #endif - */ for (int i = sta; i < count; ++i) { int r = source[4 * i + 0]; @@ -291,28 +234,15 @@ void MNNC3ToYUV(const unsigned char* source, unsigned char* dest, size_t count, C3 = coeffs[r1], C4 = coeffs[g1], C5 = coeffs[b1], C6 = coeffs[r2], C7 = coeffs[g2], C8 = coeffs[b2]; int sta = 0; - /* -#ifdef MNN_USE_NEON + +#if defined MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - auto rC0 = vdup_n_u8(C0), rC1 = vdup_n_u8(C1), rC2 = vdup_n_u8(C2), - rC3 = vdup_n_u8(C3), rC4 = vdup_n_u8(C4), rC5 = vdup_n_u8(C5), - rC6 = vdup_n_u8(C6), rC7 = vdup_n_u8(C7), rC8 = vdup_n_u8(C8); - auto delta = vdup_n_u8(128); - for (int i = 0; i < countD8; ++i) { - auto rgb = vld4_u8(source + 24 * i); - uint8x8x3_t yuv; - yuv.val[0] = CV_MUL_SHIFT(rC0, rC1, rC2, 14); - yuv.val[1] = CV_MUL_SHIFT(rC3, rC4, rC5, 14); - yuv.val[2] = CV_MUL_SHIFT(rC6, rC7, rC8, 14); - yuv.val[1] = vadd_u8(yuv.val[1], delta); - yuv.val[2] = vadd_u8(yuv.val[2], delta); - vst3_u8(dest + 24 * i, yuv); - } + int32_t c[] = {C0, C1, C2, C3, C4, C5, C6, C7, C8}; + MNNC3ToYUVFast(source, dest, countD8, c); sta = countD8 * 8; } #endif - */ for (int i = sta; i < count; ++i) { int r = source[3 * i + 0]; int g = source[3 * i + 1]; @@ -342,25 +272,16 @@ void MNNC3ToXYZ(const unsigned char* source, unsigned char* dest, size_t count, C3 = coeffs[r1], C4 = coeffs[4], C5 = coeffs[b1], C6 = coeffs[r2], C7 = coeffs[7], C8 = coeffs[b2]; int sta = 0; - /* -#ifdef MNN_USE_NEON + +#if defined MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - auto rC0 = vdup_n_u8(C0), rC1 = vdup_n_u8(C1), rC2 = vdup_n_u8(C2), - rC3 = vdup_n_u8(C3), rC4 = vdup_n_u8(C4), rC5 = vdup_n_u8(C5), - rC6 = vdup_n_u8(C6), rC7 = vdup_n_u8(C7), rC8 = vdup_n_u8(C8); - for (int i = 0; i < countD8; ++i) { - auto rgb = vld4_u8(source + 24 * i); - uint8x8x3_t xyz; - xyz.val[0] = CV_MUL_SHIFT(rC0, rC1, rC2, 12); - xyz.val[1] = CV_MUL_SHIFT(rC3, rC4, rC5, 12); - xyz.val[2] = CV_MUL_SHIFT(rC6, rC7, rC8, 12); - vst3_u8(dest + 24 * i, xyz); - } + int32_t c[] = {C0, C1, C2, C3, C4, C5, C6, C7, C8}; + MNNC3ToXYZFast(source, dest, countD8, c); sta = countD8 * 8; } #endif - */ + for (int i = sta; i < count; ++i) { int r = source[3 * i + 0]; int g = 
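The gray conversions replaced in this block keep the fixed-point weighting the previously commented-out intrinsics used: gray = (19*r + 38*g + 7*b) >> 6, roughly the 0.299/0.587/0.114 luma mix scaled by 64 (19 + 38 + 7 = 64). Scalar form of that weighting for reference; whether the new assembly uses exactly these constants is inferred from the removed NEON code, not shown in this hunk.

    #include <cstdint>

    static inline uint8_t rgbToGrayFixedPoint(int r, int g, int b) {
        return uint8_t((19 * r + 38 * g + 7 * b) >> 6);
    }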
source[3 * i + 1]; @@ -403,6 +324,18 @@ void MNNC3ToHSV(const unsigned char* source, unsigned char* dest, size_t count, void MNNC3ToBGR555(const unsigned char* source, unsigned char* dest, size_t count, bool bgr) { int i = 0; + int countD8 = (int)count / 8; +#if defined MNN_USE_NEON + if (countD8 > 0) { + if (bgr) { + MNNBGRToBGR555Fast(source, dest, countD8); + } else { + MNNRGBToBGR555Fast(source, dest, countD8); + } + + i = countD8 * 8; + } +#endif for (; i < count; ++i) { int r = source[3 * i + 0]; int g = source[3 * i + 1]; @@ -414,6 +347,17 @@ void MNNC3ToBGR555(const unsigned char* source, unsigned char* dest, size_t coun void MNNC3ToBGR565(const unsigned char* source, unsigned char* dest, size_t count, bool bgr) { int i = 0; +#if defined MNN_USE_NEON + auto countD8 = count / 8; + if (countD8 > 0) { + if (bgr) { + MNNBGRToBGR565Fast(source, dest, countD8); + } else { + MNNRGBToBGR565Fast(source, dest, countD8); + } + i = countD8 * 8; + } +#endif for (; i < count; ++i) { int r = source[3 * i + 0]; int g = source[3 * i + 1]; @@ -428,15 +372,7 @@ void MNNRGBToGRAY(const unsigned char* source, unsigned char* dest, size_t count #ifdef MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - auto rC = vdup_n_u8(19); - auto gC = vdup_n_u8(38); - auto bC = vdup_n_u8(7); - for (int i = 0; i < countD8; ++i) { - auto rgb = vld3_u8(source + 24 * i); - auto res = vmull_u8(rC, rgb.val[0]) + vmull_u8(gC, rgb.val[1]) + vmull_u8(bC, rgb.val[2]); - auto resU8 = vshrn_n_u16(res, 6); - vst1_u8(dest + 8 * i, resU8); - } + MNNRGBToGRAYFast(source, dest, countD8); sta = countD8 * 8; } #endif @@ -457,15 +393,7 @@ void MNNBRGToGRAY(const unsigned char* source, unsigned char* dest, size_t count #ifdef MNN_USE_NEON int countD8 = (int)count / 8; if (countD8 > 0) { - auto rC = vdup_n_u8(19); - auto gC = vdup_n_u8(38); - auto bC = vdup_n_u8(7); - for (int i = 0; i < countD8; ++i) { - auto rgb = vld3_u8(source + 24 * i); - auto res = vmull_u8(rC, rgb.val[2]) + vmull_u8(gC, rgb.val[1]) + vmull_u8(bC, rgb.val[0]); - auto resU8 = vshrn_n_u16(res, 6); - vst1_u8(dest + 8 * i, resU8); - } + MNNBGRToGRAYFast(source, dest, countD8); sta = countD8 * 8; } #endif @@ -839,7 +767,7 @@ static void _sampleBilinearCommon(const unsigned char* source, unsigned char* de float v = (1.0f - xF) * (1.0f - yF) * c00 + xF * (1.0f - yF) * c01 + yF * (1.0 - xF) * c10 + xF * yF * (c11); v = std::min(std::max(v, 0.0f), 255.0f); - dest[bpp * i + b] = (unsigned char)v; + dest[bpp * i + b] = (unsigned char)roundf(v); } curPoints.fY += dy; curPoints.fX += dx; diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index 50fad7e6a..497ef3bf9 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -37,6 +37,8 @@ void MNNGemmInt8AddBiasScale_ARMV86_Unit(int8_t* dst, const int8_t* src, const i const QuanPostTreatParameters* post, size_t realDstCount); void MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder=nullptr); +void MNNSumByAxisLForMatmul_A_ARM86(float* dest, int8_t* source, const float* dequantScale, ssize_t realDstCount, SumByAxisParams sumParams); +void MNNSumByAxisLForMatmul_A_ARM82(float* dest, int8_t* source, const float* dequantScale, ssize_t realDstCount, SumByAxisParams sumParams); #if defined(MNN_LOW_MEMORY) 
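Two small numeric points in this block: the BGR555/565 converters gain the same NEON-block-plus-scalar-tail dispatch as the other routines, and the bilinear sampler now rounds instead of truncating when writing the interpolated value back to 8 bits, removing the roughly half-LSB downward bias that plain truncation introduces. Sketch of the store change:

    #include <cmath>
    #include <cstdint>

    static inline uint8_t storeBilinear(float v) {
        v = std::fmin(std::fmax(v, 0.0f), 255.0f);
        return uint8_t(std::lround(v));   // was: uint8_t(v), i.e. truncation
    }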
// int4 weight gemmInt8 kernel void MNNGemmInt8AddBiasScale_ARMV82_w4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, @@ -48,7 +50,7 @@ void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src, const // Tools to dynamic-quant fp16-input data. #ifdef MNN_USE_ARMV82 void DynamicQuanInput_ARM82(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, - ssize_t maxValue, ssize_t zeroPoint); + ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec); // int8 weight gemmInt8 kernel to return fp16-output data. void MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDstCount); @@ -59,7 +61,7 @@ void MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16(int8_t* dst, const int8_t* src, co void MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDstCount); void DynamicQuanInputAndReorder_ARM82(const float* src, int8_t* dst, size_t planeSize, const float* scale, ssize_t aMin, - ssize_t aMax, ssize_t zeroPoint, size_t ocQuad, size_t offset); + ssize_t aMax, const float* zeroPoint, size_t ocQuad, size_t offset); #endif #endif #endif // __aarch64__ @@ -1514,8 +1516,8 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src, int w8[64]; // 64=GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT for (int k = 0; k < 32; ++k) { - w8[2 * k] = (weight_sz[k]>>4); - w8[2 * k + 1] = (weight_sz[k] & c); + w8[k] = (weight_sz[k]>>4); + w8[k + 32] = (weight_sz[k] & c); } for (int j = 0; j < GEMM_INT8_UNIT; ++j) { @@ -1642,10 +1644,28 @@ static void MNNLineDepthWiseInt8AddBiasScaleUnit3x3(int8_t* dst, const int8_t* s #ifndef MNN_USE_NEON void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, - ssize_t maxValue, ssize_t zeroPoint) { + ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec) { + // quanParamVec: + // 00: scale is vector + // 10: zero is vector + // 11: both are vector + float scale4[4] = {scalep[0], scalep[0], scalep[0], scalep[0] }; + float zero4[4] = {zeroPoint[0], zeroPoint[0], zeroPoint[0], zeroPoint[0]}; + if (quanParamVec % 2 == 1) { + scale4[0] = scalep[0]; + scale4[1] = scalep[1]; + scale4[2] = scalep[2]; + scale4[3] = scalep[3]; + } + if (quanParamVec >> 1 == 1) { + zero4[0] = zeroPoint[0]; + zero4[1] = zeroPoint[1]; + zero4[2] = zeroPoint[2]; + zero4[3] = zeroPoint[3]; + } for (int i = 0; i < sizeQuad; ++i) { for (int j=0; j<4; ++j) { - int v = (int)roundf(src[4*i+j] * scalep[j]) + zeroPoint; + int v = (int)roundf(src[4*i+j] * scale4[j]) + zero4[j]; if (v > maxValue) { v = maxValue; } @@ -2103,7 +2123,7 @@ static void MNNGetGemmUnit(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { } static void MNNGetGemmUnitSdot(int* UNIT, int* SRC_UNIT, int* DST_XUNIT) { - *UNIT = 4; + *UNIT = 8; *SRC_UNIT = 4; *DST_XUNIT = 12; } @@ -2226,6 +2246,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 4>; // ConvDepthwise gCoreFunc->ConvDepthwise3x3LineInt8_ARM82 = MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3; + core->MNNSumByAxisLForMatmul_A = MNNSumByAxisLForMatmul_A_ARM82; #if defined(MNN_LOW_MEMORY) #ifdef MNN_USE_ARMV82 gCoreFunc->DynamicQuanInput_ARM82 = 
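The reference implementation above fixes the meaning of the new quanParamVec argument: the code checks quanParamVec % 2 to decide whether scalep points at a per-lane vector rather than a single value, and quanParamVec >> 1 to decide the same for zeroPoint. A tiny helper makes call sites read more clearly; it is illustrative, not part of the patch.

    #include <sys/types.h>   // ssize_t

    // bit 0: per-lane scale vector, bit 1: per-lane zero-point vector.
    static inline ssize_t makeQuanParamVec(bool scaleIsVector, bool zeroIsVector) {
        return (scaleIsVector ? 1 : 0) | (zeroIsVector ? 2 : 0);
    }
    // e.g. MNNFloat2Int8(src, dst, sizeQuad, scales, -128, 127, zeros,
    //                    makeQuanParamVec(false, true));   // == 2, as in ConvInt8Winograd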
DynamicQuanInput_ARM82; @@ -2241,6 +2262,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->Int8GemmKernel = MNNGemmInt8AddBiasScale_ARMV86_Unit; gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV86_Unit; gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitI8mm; + core->MNNSumByAxisLForMatmul_A = MNNSumByAxisLForMatmul_A_ARM86; #if defined(MNN_LOW_MEMORY) gCoreFunc->Int8GemmKernel_W4 = MNNGemmInt8AddBiasScale_ARMV86_w4_Unit; #ifdef MNN_USE_ARMV82 @@ -2250,7 +2272,6 @@ void MNNCoreInt8FunctionInit() { #endif // Im2Col gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A<10, 8, 8>; - gCoreFunc->MNNPackC4Int8ForMatMul_A_ARM86FP16 = _ArmBasicMNNPackC4ForMatMul_A<10, 8, 8>; } #endif MNNInt8FunctionInit(); diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.h b/source/backend/cpu/compute/Int8FunctionsOpt.h index da974619c..6860c0643 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.h +++ b/source/backend/cpu/compute/Int8FunctionsOpt.h @@ -48,7 +48,7 @@ struct QuanPostTreatParameters { float* weightQuanBias; float* fp32minmax; ssize_t blockNum = 1; - const int32_t* bias; + const int32_t* bias = nullptr; const float* extraScale = nullptr; const float* extraBias = nullptr; }; @@ -61,7 +61,7 @@ struct QuanPrePostParameters{ ssize_t maxValue; }; void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, - ssize_t maxValue, ssize_t zeroPoint); + ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec); void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); void MNNInt8FunctionInit(); void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap); @@ -84,11 +84,10 @@ struct CoreInt8Functions { void(*Int8GemmKernelFast)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount); void(*MNNGetGemmUnit)(int* UNIT, int* SRC_UNIT, int* DST_XUNIT); void(*MNNPackC4Int8ForMatMul_A)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el); - void(*MNNPackC4Int8ForMatMul_A_ARM86FP16)(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) = nullptr; void(*MNNGemmInt8AddBiasScale_Unit_FP16)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, - const QuanPostTreatParameters* post, size_t realDstCount); + const QuanPostTreatParameters* post, size_t realDstCount) = nullptr; void(*MNNGemmInt8AddBiasScale_w4_Unit_FP16)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, - const QuanPostTreatParameters* post, size_t realDstCount); + const QuanPostTreatParameters* post, size_t realDstCount) = nullptr; void(*Int8GemmKernel_W4)(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDstCount); // sparse @@ -102,9 +101,9 @@ struct CoreInt8Functions { size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder); void(*ConvDepthwise3x3LineInt8_ARM82)(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, 
size_t dilateY_step, int8_t* idxOrder) = nullptr; - void(*DynamicQuanInput_ARM82)(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint) = nullptr; - void (*DynamicQuanInputAndReorder_ARM82)(const float* src, int8_t* dst, size_t planeSize, const float* scale, ssize_t aMin, ssize_t aMax, ssize_t zeroPoint, size_t ocQuad, size_t offset) = nullptr; - void(*MNNFloat2Int8)(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint); + void(*DynamicQuanInput_ARM82)(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec) = nullptr; + void (*DynamicQuanInputAndReorder_ARM82)(const float* src, int8_t* dst, size_t planeSize, const float* scale, ssize_t aMin, ssize_t aMax, const float* zeroPoint, size_t ocQuad, size_t offset) = nullptr; + void(*MNNFloat2Int8)(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec); void(*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); diff --git a/source/backend/cpu/x86_x64/AVX2Backend.cpp b/source/backend/cpu/x86_x64/AVX2Backend.cpp index 167a5f984..ed263e366 100644 --- a/source/backend/cpu/x86_x64/AVX2Backend.cpp +++ b/source/backend/cpu/x86_x64/AVX2Backend.cpp @@ -366,6 +366,7 @@ void AVX2Backend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) CPUBackend::onCopyBuffer(srcTensor, dstTensor); return; } + _resetDynamicMemory(); if (getDataType(srcTensor) != getDataType(dstTensor)) { auto dimType = Tensor::CAFFE; switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) { diff --git a/source/backend/cpu/x86_x64/AVX2Functions.cpp b/source/backend/cpu/x86_x64/AVX2Functions.cpp index e48d00981..3bafc7573 100644 --- a/source/backend/cpu/x86_x64/AVX2Functions.cpp +++ b/source/backend/cpu/x86_x64/AVX2Functions.cpp @@ -39,11 +39,14 @@ bool AVX2Functions::init(int cpuFlags) { coreFunction->MNNPackedMatMul = _AVX_MNNPackedMatMul; coreFunction->MNNPackedMatMulRemain = _AVX_MNNPackedMatMulRemain; -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM coreFunction->MNNPackedMatMul_int4 = _AVX_MNNPackedMatMul_int4; coreFunction->MNNPackedMatMulRemain_int4 = _AVX_MNNPackedMatMulRemain_int4; coreFunction->MNNPackedMatMul_int8 = _AVX_MNNPackedMatMul_int8; coreFunction->MNNPackedMatMulRemain_int8 = _AVX_MNNPackedMatMulRemain_int8; +#endif + +#ifdef MNN_LOW_MEMORY coreFunction->MNNAbsMax = _AVX_MNNAbsMaxFP32; #endif coreFunction->MNNPackC4ForMatMul_A = _AVX_MNNPackC4ForMatMul_A; diff --git a/source/backend/cpu/x86_x64/CMakeLists.txt b/source/backend/cpu/x86_x64/CMakeLists.txt index 631f12069..d9b462266 100644 --- a/source/backend/cpu/x86_x64/CMakeLists.txt +++ b/source/backend/cpu/x86_x64/CMakeLists.txt @@ -95,6 +95,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) target_compile_options(MNNAVX PRIVATE -DMNN_LOW_MEMORY) target_compile_options(MNNAVXFMA PRIVATE -DMNN_LOW_MEMORY) endif() + if (MNN_CPU_WEIGHT_DEQUANT_GEMM) + target_compile_options(MNNX8664 PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM) + target_compile_options(MNNSSE PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM) + target_compile_options(MNNAVX PRIVATE 
-DMNN_CPU_WEIGHT_DEQUANT_GEMM) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM) + endif() list(APPEND MNN_OBJECTS_TO_LINK $ $ $ $) if (MSVC AND WIN_USE_ASM) target_compile_options(MNNAVX PRIVATE -DMNN_X86_USE_ASM) diff --git a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp index ca87c0464..54effc2cb 100644 --- a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp +++ b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp @@ -50,11 +50,14 @@ void MNNFunctionInit() { coreFunction->MNNGetMatMulPackMode = _SSEMNNGetMatMulPackMode; coreFunction->MNNPackedMatMul = _SSE_MNNPackedMatMul; coreFunction->MNNPackedMatMulRemain = _SSE_MNNPackedMatMulRemain; -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM coreFunction->MNNPackedMatMul_int4 = _SSE_MNNPackedMatMul_int4; coreFunction->MNNPackedMatMulRemain_int4 = _SSE_MNNPackedMatMulRemain_int4; coreFunction->MNNPackedMatMul_int8 = _SSE_MNNPackedMatMul_int8; coreFunction->MNNPackedMatMulRemain_int8 = _SSE_MNNPackedMatMulRemain_int8; +#endif + +#ifdef MNN_LOW_MEMORY coreFunction->MNNAbsMax = _SSE_MNNAbsMaxFP32; #endif coreFunction->MNNPackC4ForMatMul_A = _SSE_MNNPackC4ForMatMul_A; diff --git a/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp b/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp index 214010c6f..c21411b48 100644 --- a/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp @@ -37,7 +37,7 @@ void _AVX_MNNPackedMatMul(float* C, const float* A, const float* B, const size_t const float* postParameters, const float* bias, const float* k, const float* b); void _AVX_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM void _AVX_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); void _AVX_MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, @@ -46,13 +46,16 @@ void _AVX_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const s const float* postParameters, const float* bias, const float* k, const float* b); void _AVX_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); +#endif + +#ifdef MNN_LOW_MEMORY void _AVX_MNNAbsMaxFP32(const float* source, float* absmax, size_t src_depth_quad, size_t realSize, int pack); #endif void _AVX_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); void _AVX_MNNExpC8(float* dest, const float* source, float* offset, const float* parameters, size_t countC8); void _AVX_MNNSoftmax(float* dest, const float* source, size_t size); -void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, ssize_t zeroPoint); +void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, const float* zeroPoint, ssize_t quanParamVec); void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, ssize_t zeroPoint); void _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, 
const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder); void _AVX_MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); diff --git a/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp b/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp index d19863b14..516f247cc 100644 --- a/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmAVX2.cpp @@ -31,7 +31,7 @@ void _AVX_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); } -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM void _AVX_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { _AVX_MNNPackedMatMul_Main_int4(C, A, B, parameter, k, b); @@ -60,17 +60,9 @@ void _AVX_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, s AVX2GemmPostTreat(C, eSize, parameter, postParameters, bias); } } -static __m128i _load_int4_to_int8(const uint8_t* src) { - uint8_t c = 0xf; - uint8_t temp[16]; - for (int i = 0; i < 8; ++i) { - temp[2 * i] = (src[i] >> 4); - temp[2 * i +1] = (src[i] & c); - } - auto int8_tx16 = _mm_loadu_si128((const __m128i*)temp); - return int8_tx16; -} +#endif +#ifdef MNN_LOW_MEMORY void _AVX_MNNAbsMaxFP32(const float* source, float* absmax, size_t src_depth_quad, size_t realSize, int pack) { // source: (ic/8, N, 8) auto srcStep = pack * realSize; diff --git a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp index bf299722c..8a3accc18 100644 --- a/source/backend/cpu/x86_x64/avx/GemmFunction.hpp +++ b/source/backend/cpu/x86_x64/avx/GemmFunction.hpp @@ -816,7 +816,7 @@ static void _AVX_MNNPackednMatMulRemainCommon(TYPE* C, const TYPE* A, const TYPE } } -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM //----------------------- MatMul(float, int4) Functions ---------------------------// #define LOAD_WEIGHT_ALPHA_BIAS_int4x4 \ diff --git a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp index 1a6b60746..450714416 100644 --- a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp @@ -53,10 +53,8 @@ D##u##v = _mm256_add_epi32(D##u##v, _mm256_madd_epi16(W##u, S##v)); #define LOAD_INT4_TO_INT8 \ auto w_int4 = _mm_loadu_si128((__m128i const*)weight_sz);\ -auto w_int4_high = _mm_and_si128(mask, _mm_srli_epi16(w_int4, 4));\ -auto w_int4_low = _mm_and_si128(mask, w_int4);\ -auto w_0 = _mm_unpacklo_epi8(w_int4_high, w_int4_low);\ -auto w_1 = _mm_unpackhi_epi8(w_int4_high, w_int4_low); +auto w_0 = _mm_and_si128(mask, _mm_srli_epi16(w_int4, 4));\ +auto w_1 = _mm_and_si128(mask, w_int4); void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { MNN_ASSERT(post->useInt8==0); @@ -1316,15 +1314,22 @@ void _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, } } -void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, ssize_t zeroPoint) { +void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, 
ssize_t maxV, const float* zeroPoint, ssize_t quanParamVec) { auto zero = _mm256_set1_epi32(0); auto minValue = _mm256_set1_ps(minV); auto maxValue = _mm256_set1_ps(maxV); - auto zeroPointValue = _mm256_set1_ps(zeroPoint); + auto zeroPointValue = _mm256_set1_ps(zeroPoint[0]); auto offset = _mm256_set1_epi32(128); auto plus = _mm256_set1_ps(0.5f); auto minus = _mm256_set1_ps(-0.5f); - auto scaleValue = _mm256_loadu_ps(scalep); + auto scaleValue = _mm256_set1_ps(scalep[0]); + + if (quanParamVec & 1) { + scaleValue = _mm256_loadu_ps(scalep); + } + if (quanParamVec >> 1) { + zeroPointValue = _mm256_loadu_ps(zeroPoint); + } for (int i = 0; i < sizeQuad; ++i) { auto f0 = _mm256_loadu_ps(src + 8 * i); diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp index 6eb8a5379..fd80b6dc8 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp @@ -201,16 +201,23 @@ void _AVX512_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* sr src += src_w_step; } } -void _AVX512_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, ssize_t zeroPoint) { +void _AVX512_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, const float* zeroPoint, ssize_t quanParamVec) { auto zero = _mm256_set1_epi32(0); auto minValue = _mm256_set1_ps(minV); auto maxValue = _mm256_set1_ps(maxV); - auto zeroPointValue = _mm256_set1_ps(zeroPoint); + auto zeroPointValue = _mm256_set1_ps(zeroPoint[0]); auto offset = _mm256_set1_epi32(128); auto plus = _mm256_set1_ps(0.5f); auto minus = _mm256_set1_ps(-0.5f); - auto scaleValue0 = _mm256_loadu_ps(scalep); - auto scaleValue1 = _mm256_loadu_ps(scalep + 8); + auto scaleValue0 = _mm256_set1_ps(scalep[0]); + auto scaleValue1 = scaleValue0; + if (quanParamVec & 1) { + scaleValue0 = _mm256_loadu_ps(scalep); + scaleValue1 = _mm256_loadu_ps(scalep + 8); + } + if (quanParamVec >> 1) { + zeroPointValue = _mm256_loadu_ps(zeroPoint); + } for (int i = 0; i < sizeQuad; ++i) { auto f0 = _mm256_loadu_ps(src + PACK_UNIT * i); diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index 4f1525087..7e8fff748 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -50,7 +50,7 @@ void _SSE_MNNPackedMatMul(float* C, const float* A, const float* B, const size_t const float* postParameters, const float* bias, const float* k, const float* b); void _SSE_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM void _SSE_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); void _SSE_MNNPackedMatMulRemain_int4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, @@ -59,6 +59,8 @@ void _SSE_MNNPackedMatMul_int8(float* C, const float* A, const float* B, const s const float* postParameters, const float* bias, const float* k, const float* b); void _SSE_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); 
+#endif +#ifdef MNN_LOW_MEMORY void _SSE_MNNAbsMaxFP32(const float* source, float* absmax, size_t src_depth_quad, size_t realSize, int pack); void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); @@ -71,7 +73,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); void _SSE_MNNExpC8(float* dest, const float* source, float* offset, const float* parameters, size_t countC8); void _SSE_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose); -void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, ssize_t zeroPoint); +void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec); void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); void _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder=nullptr); diff --git a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp index 77702c2d4..f1fb9b338 100644 --- a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp @@ -300,14 +300,14 @@ auto d##i##j = _mm_add_epi32(_mm_madd_epi16(S##i##j##0, W##i##j##0), _mm_madd_ep } } } +#define LOAD_INT4_TO_INT8 \ + auto w0_int4 = _mm_loadu_si128(reinterpret_cast(weight_sz));\ + auto w1_int4 = _mm_loadu_si128(reinterpret_cast(weight_sz + 16));\ + auto w0 = _mm_and_si128(mask, _mm_srli_epi16(w0_int4, 4));\ + auto w1 = _mm_and_si128(mask, _mm_srli_epi16(w1_int4, 4));\ + auto w2 = _mm_and_si128(mask, w0_int4);\ + auto w3 = _mm_and_si128(mask, w1_int4); -static inline void _load_int4_to_int8(const uint8_t* src, int8_t* dst) { - uint8_t c = 0xf; - for (int i = 0; i < 32; ++i) { - dst[2 * i] = (src[i] >> 4); - dst[2 * i +1] = (src[i] & c); - } -} void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) { MNN_ASSERT(post->useInt8 == 0); @@ -335,6 +335,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const __m128 kernelSum1 = _mm_setzero_ps(); __m128 kernelSum2 = _mm_setzero_ps(); __m128 kernelSum3 = _mm_setzero_ps(); + const auto mask = _mm_set1_epi8(0xf); if (GEMM_INT8_DST_XUNIT == realDst) { kernelSum0 = _mm_load_ps1(post->srcKernelSum); kernelSum1 = _mm_load_ps1(post->srcKernelSum + 1); @@ -402,13 +403,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const const auto weight_sz = weight_dz + weight_step_Y * sz; const auto src_z = src_x + sz * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT; - int8_t tmp_w[64]; - _load_int4_to_int8((uint8_t*)weight_sz, tmp_w); - - auto w0 = _mm_loadu_si128((__m128i*)(tmp_w + GEMM_INT8_SRC_UNIT * 0)); - auto w1 = _mm_loadu_si128((__m128i*)(tmp_w + GEMM_INT8_SRC_UNIT * 1)); - auto w2 = _mm_loadu_si128((__m128i*)(tmp_w + GEMM_INT8_SRC_UNIT * 2)); - auto w3 = 
_mm_loadu_si128((__m128i*)(tmp_w + GEMM_INT8_SRC_UNIT * 3)); + LOAD_INT4_TO_INT8; auto s0 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 0)); auto s1 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 1)); @@ -480,12 +475,6 @@ auto d##i##j = _mm_add_epi32(_mm_madd_epi16(S##i##j##0, W##i##j##0), _mm_madd_ep E1 = _mm_hadd_epi32(E2, E3); d3 = _mm_hadd_epi32(E0, E1); auto scaleValue = _mm_loadu_ps(scale_dz); - // auto biasValue = _mm_loadu_si128((__m128i*)(bias_dz)); - // d0 = _mm_add_epi32(d0, biasValue); - // d1 = _mm_add_epi32(d1, biasValue); - // d2 = _mm_add_epi32(d2, biasValue); - // d3 = _mm_add_epi32(d3, biasValue); - //auto biasValue = _mm_loadu_ps((float*)(bias_dz)); auto weightBiasValue = _mm_loadu_ps((float*)weightBias_dz); __m128 f0 = _mm_cvtepi32_ps(d0); __m128 f1 = _mm_cvtepi32_ps(d1); @@ -584,14 +573,20 @@ void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size, ssize_t zeroP } // require SSE 4.1 -void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, ssize_t zeroPoint) { +void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, const float* zeroPoint, ssize_t quanParamVec) { __m128i zero = _mm_set1_epi32(0); __m128 minValue = _mm_set1_ps(minV); __m128 maxValue = _mm_set1_ps(maxV); - __m128 zeroPointValue = _mm_set1_ps(zeroPoint); + __m128 zeroPointValue = _mm_set1_ps(zeroPoint[0]); __m128 plus = _mm_set1_ps(0.5f); __m128 minus = _mm_set1_ps(-0.5f); - __m128 scaleValue = _mm_loadu_ps(scalep); + __m128 scaleValue = _mm_set1_ps(scalep[0]); + if (quanParamVec & 1) { + scaleValue = _mm_loadu_ps(scalep); + } + if (quanParamVec >> 1) { + zeroPointValue = _mm_loadu_ps(zeroPoint); + } auto offset = _mm_set1_epi32(128); for (int i = 0; i < sizeQuad; ++i) { diff --git a/source/backend/cpu/x86_x64/sse/GemmSSE.cpp b/source/backend/cpu/x86_x64/sse/GemmSSE.cpp index 8e5a32896..336019603 100644 --- a/source/backend/cpu/x86_x64/sse/GemmSSE.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmSSE.cpp @@ -27,7 +27,7 @@ void _SSE_MNNPackedMatMulRemain(float* C, const float* A, const float* B, size_t _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); } -#ifdef MNN_LOW_MEMORY +#ifdef MNN_CPU_WEIGHT_DEQUANT_GEMM //----------------------- MatMul(float, int4) Functions ---------------------------// void _SSE_MNNPackedMatMul_int4(float* C, const float* A, const float* B, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b) { @@ -66,7 +66,9 @@ void _SSE_MNNPackedMatMulRemain_int8(float* C, const float* A, const float* B, s _SSE_GemmPostTreat(C, eSize, parameter, postParameters, bias); } } +#endif +#ifdef MNN_LOW_MEMORY // Dynamic quant void _SSE_MNNAbsMaxFP32(const float* source, float* absmax, size_t src_depth_quad, size_t realSize, int pack) { // source: (ic/4, N, 4) diff --git a/source/backend/cuda/core/CUDABackend.cpp b/source/backend/cuda/core/CUDABackend.cpp index 1cefb8a2b..e72155724 100644 --- a/source/backend/cuda/core/CUDABackend.cpp +++ b/source/backend/cuda/core/CUDABackend.cpp @@ -79,7 +79,7 @@ bool CUDARuntimeWrapper::onSetCache(const void* buffer, size_t size) {//set Cach return mCUDARuntime->setCache(std::make_pair(buffer, size)); } -Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const { +Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config, Backend* origin) const { #ifdef LOG_VERBOSE MNN_PRINT("cudaruntime:%p, create CUDABackend\n", this); 
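The reworked LOAD_INT4_TO_INT8 macros (AVX and SSE above) rely on the matching change in the weight packing: a 64-weight int4 block now stores all high nibbles first and all low nibbles second (the w8[k] / w8[k + 32] ordering in the scalar 16x4 kernel), so the SIMD side expands 32 packed bytes with one shift and one mask per register instead of interleaving bytes, which is why the old _load_int4_to_int8 helpers could be deleted. A self-contained sketch of that unpacking with SSE2 intrinsics; the helper name is illustrative:

#include <emmintrin.h>   // SSE2
#include <cstdint>

static void unpackInt4Block(const uint8_t* packed /* 32 bytes = 64 weights */, __m128i out[4]) {
    const __m128i mask = _mm_set1_epi8(0x0f);
    __m128i w0_int4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(packed));
    __m128i w1_int4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(packed + 16));
    out[0] = _mm_and_si128(mask, _mm_srli_epi16(w0_int4, 4));  // high nibbles of bytes 0..15
    out[1] = _mm_and_si128(mask, _mm_srli_epi16(w1_int4, 4));  // high nibbles of bytes 16..31
    out[2] = _mm_and_si128(mask, w0_int4);                     // low nibbles of bytes 0..15
    out[3] = _mm_and_si128(mask, w1_int4);                     // low nibbles of bytes 16..31
}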
#endif diff --git a/source/backend/cuda/core/CUDABackend.hpp b/source/backend/cuda/core/CUDABackend.hpp index 3c3fb2402..03737a10b 100644 --- a/source/backend/cuda/core/CUDABackend.hpp +++ b/source/backend/cuda/core/CUDABackend.hpp @@ -31,7 +31,7 @@ class MNN_PUBLIC CUDARuntimeWrapper : public Runtime { public: CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power, BackendConfig::MemoryMode memory, int deviceId = 0); virtual ~CUDARuntimeWrapper(); - virtual Backend *onCreate(const BackendConfig* config) const override; + virtual Backend *onCreate(const BackendConfig* config, Backend* origin) const override; virtual void onGabageCollect(int level) override; bool isCreateError() const { return mIsCreateError; diff --git a/source/backend/hiai/backend/NPUBackend.cpp b/source/backend/hiai/backend/NPUBackend.cpp index 1b4f45fca..33159aa67 100644 --- a/source/backend/hiai/backend/NPUBackend.cpp +++ b/source/backend/hiai/backend/NPUBackend.cpp @@ -552,7 +552,7 @@ namespace MNN { NPURuntime::~NPURuntime() {} - Backend* NPURuntime::onCreate(const BackendConfig* config) const { + Backend* NPURuntime::onCreate(const BackendConfig* config, Backend* origin) const { return new NPUBackend(this); } diff --git a/source/backend/hiai/backend/NPUBackend.hpp b/source/backend/hiai/backend/NPUBackend.hpp index 4ee14a513..cfada3d13 100644 --- a/source/backend/hiai/backend/NPUBackend.hpp +++ b/source/backend/hiai/backend/NPUBackend.hpp @@ -251,7 +251,7 @@ namespace MNN { NPURuntime(const Backend::Info& info); virtual ~NPURuntime(); virtual CompilerType onGetCompilerType() const override; - virtual Backend* onCreate(const BackendConfig* conf) const override; + virtual Backend* onCreate(const BackendConfig* conf, Backend* origin) const override; virtual void onGabageCollect(int level) override; // If buffer is not nullptr, try copy cache, else delete cache virtual bool onSetCache(const void* buffer, size_t size) override { diff --git a/source/backend/metal/MetalAttention.mm b/source/backend/metal/MetalAttention.mm index 2c6eed591..e1d1ef28f 100644 --- a/source/backend/metal/MetalAttention.mm +++ b/source/backend/metal/MetalAttention.mm @@ -239,21 +239,11 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { auto context = (__bridge MNNMetalContext *)mtbn->context(); mParamQKV = [context newDeviceBuffer:sizeof(Param) access:CPUWriteOnly]; mParamSoftmax = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly]; - + mTempQK.reset(Tensor::createDevice({0, 0})); + mTempSoftMax.reset(Tensor::createDevice({0, 0})); } void AttentionBufExecution::reallocKVCache() { - if (mCache->mPastLength < mCache->mMaxLength || nullptr == mTempQK || (!mIsDecode)) { - if (mIsDecode) { - mTempQK.reset(Tensor::createDevice({mNumHead, mCache->mMaxLength})); - mTempSoftMax.reset(Tensor::createDevice({mNumHead, mCache->mMaxLength})); - } else { - mTempQK.reset(Tensor::createDevice({mNumHead, mCache->mPastLength, mCache->mPastLength})); - mTempSoftMax.reset(Tensor::createDevice({mNumHead, mCache->mPastLength, mCache->mPastLength})); - } - backend()->onAcquireBuffer(mTempQK.get(), Backend::STATIC); - backend()->onAcquireBuffer(mTempSoftMax.get(), Backend::STATIC); - } if (!mKVCache || mCache->mPastLength < mCache->mMaxLength) { return; } @@ -378,6 +368,31 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { int group_size = mNumHead / mKvNumHead; reallocKVCache(); + bool needMalloc = mTempQK->length(0) != mNumHead; + if (mIsDecode) { + if 
(mTempQK->length(1) != mCache->mMaxLength) { + needMalloc = true; + } + mTempQK->setLength(0, mNumHead); + mTempQK->setLength(1, mCache->mMaxLength); + mTempSoftMax->setLength(0, mNumHead); + mTempSoftMax->setLength(1, mCache->mMaxLength); + } else { + if (mTempQK->length(1) != mCache->mPastLength * mCache->mPastLength) { + needMalloc = true; + } + mTempQK->setLength(0, mNumHead); + mTempQK->setLength(1, mCache->mPastLength * mCache->mPastLength); + mTempSoftMax->setLength(0, mNumHead); + mTempSoftMax->setLength(1, mCache->mPastLength * mCache->mPastLength); + } + if (needMalloc) { + auto res = backend()->onAcquireBuffer(mTempQK.get(), Backend::STATIC) && backend()->onAcquireBuffer(mTempSoftMax.get(), Backend::STATIC); + if (!res) { + MNN_ERROR("MNN::Metal: OUT_OF_MEMORY when execute attention metal\n"); + return; + } + } // Update Parameters { @@ -456,7 +471,6 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { mCache->mPastLength += 1; mCache->mKv_seq_len = mCache->mPastLength + 1; } - return; } diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index e01913a38..22eee335f 100644 --- a/source/backend/metal/MetalBackend.hpp +++ b/source/backend/metal/MetalBackend.hpp @@ -56,7 +56,7 @@ class MetalRuntime : public Runtime { std::map>, std::tuple, std::vector, uint32_t>>& getTunedThreadGroup() { return mTunedThreadGroup; }; - virtual Backend *onCreate(const BackendConfig* config) const override; + virtual Backend *onCreate(const BackendConfig* config, Backend* origin) const override; virtual void onGabageCollect(int level) override; virtual CompilerType onGetCompilerType() const override { return Compiler_Loop; @@ -71,10 +71,16 @@ class MetalRuntime : public Runtime { const MNN::Op* op) override; virtual bool onMeasure(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Runtime::OpInfo& dstInfo) const override; + SingleBufferWithAllocator* buffer(int index) const { + return &mDynamic[index]; + } + BufferAllocator* createDynamicAllocator(int index, bool secondResize) const; private: MetalRuntime(void* context); void* mContext = nullptr; - std::shared_ptr mStatic; + mutable std::shared_ptr mStatic; + mutable std::shared_ptr mStaticCache; + mutable std::vector mDynamic; MetalTuneLevel mTuneLevel = Wide; std::map>, std::tuple, std::vector, uint32_t>> mTunedThreadGroup; @@ -226,8 +232,6 @@ class MetalBackend : public Backend { id _commandQueue; const MetalRuntime* mRuntime; - id mShapeH2D; - id mShapeD2H; mutable NSUInteger mEncoderCount = 0; mutable bool mOpEncoderSet = false;//whether has set encoder mutable bool mSupportDeferEncode = true; @@ -240,6 +244,7 @@ class MetalBackend : public Backend { std::shared_ptr mStaticBufferPool; private: + void _resetDynamicMemory() const; CopyPipeline _makeCopyInfo(const Tensor *src, const Tensor *dst, id shape, int castType) const; mutable id mHostBuffer = nullptr; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index 6f73629bb..268db6fde 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -10,6 +10,7 @@ #define MNN_METAL #import #define METAL_CONST_BUFFER_LIMIT 128 +#define METAL_SEPERATE_MAX_COUNT 2 #if MNN_METAL_ENABLED #import "backend/metal/MNNMetalContext.h" #import "core/Macro.h" @@ -35,12 +36,16 @@ static void _MetalApplyTensor(uint8_t* host, size_t offset, Tensor* t) { auto des = TensorUtils::getDescribe(t); des->extra.offset = offset; } -static BufferAllocator* 
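The Metal attention change above replaces the old allocate-on-realloc path with two persistent temporaries (mTempQK, mTempSoftMax) created empty and resized per execution: decode uses [numHead, maxLength], prefill uses [numHead, pastLength * pastLength], and device memory is re-acquired only when the required shape actually changed. A minimal sketch of that reuse decision, with placeholder types rather than MNN's Tensor/Backend API:

#include <array>
#include <cstdio>

struct DemoTempTensor {
    std::array<int, 2> shape {0, 0};
    bool acquireDeviceMemory() { return true; } // stands in for backend()->onAcquireBuffer(..., STATIC)
    bool resizeIfNeeded(int dim0, int dim1) {
        if (shape[0] == dim0 && shape[1] == dim1) {
            return true; // shape unchanged: keep the existing allocation
        }
        shape = {dim0, dim1};
        if (!acquireDeviceMemory()) {
            std::fprintf(stderr, "OUT_OF_MEMORY when resizing attention temp buffer\n");
            return false;
        }
        return true;
    }
};

In the patch this decision is carried by needMalloc plus setLength on the two tensors; the sketch only shows the reuse check.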
_createBufferAllocator(const Runtime* runtime, BufferAllocator* origin, bool secondResize) { - if (runtime->hint().memoryAllocatorType == Runtime::Allocator_Defer && secondResize) { - return new DeferBufferAllocator(BufferAllocator::Allocator::createRecurse(origin), 1024, _MetalApplyTensor); +BufferAllocator* MetalRuntime::createDynamicAllocator(int index, bool secondResize) const { + if (hint().memoryAllocatorType == Runtime::Allocator_Defer && secondResize) { + return new DeferBufferAllocator(buffer(index), 1024, _MetalApplyTensor); } - return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(origin), 1024); + if (mStaticCache.get() != nullptr) { + return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStaticCache.get()), 1024); + } + return new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(mStatic.get()), 1024); } + struct TunedInfo { std::vector> mInfos; }; @@ -70,11 +75,9 @@ static void _MetalApplyTensor(uint8_t* host, size_t offset, Tensor* t) { { mRuntime = runtime; auto ctx = (__bridge MNNMetalContext *)runtime->context(); - mBufferPool.reset(_createBufferAllocator(runtime, staticMem.get(), false)); + mBufferPool.reset(runtime->createDynamicAllocator(0, false)); mCurrentAllocator = mBufferPool.get(); mStaticBufferPool = staticMem; - mShapeH2D = getConstBuffer(4 * sizeof(int)); - mShapeD2H = getConstBuffer(4 * sizeof(int)); mUseFloatAsFp16 = usefp16AsFp32; mIsIphone = ctx.isIphone; if (runtime->getCommandQueue() == nil) { @@ -207,6 +210,9 @@ MemChunk chunk() override { bool MetalBackend::onClearBuffer() { mCurrentAllocator->release(true); + if (nullptr != mRuntime->mStaticCache.get()) { + mStaticBufferPool = mRuntime->mStaticCache; + } return true; } @@ -238,8 +244,15 @@ MemChunk chunk() override { mComputeEncoder = nil; } } +void MetalBackend::_resetDynamicMemory() const { + mCurrentAllocator->apply(); + if (nullptr != mBufferPoolShapeImmutable.get()) { + mBufferPoolShapeImmutable->apply(); + } +} void MetalBackend::onExecuteBegin() const { + _resetDynamicMemory(); mEncoderCount = 0; } void MetalBackend::onExecuteEnd() const { @@ -263,8 +276,8 @@ MemChunk chunk() override { return false; } if (maxIndex == 2 && mBufferPoolShapeImmutable.get() == nullptr) { - mBufferPoolShapeImmutable.reset(_createBufferAllocator(mRuntime, mStaticBufferPool.get(), true)); - mBufferPool.reset(_createBufferAllocator(mRuntime, mStaticBufferPool.get(), true)); + mBufferPoolShapeImmutable.reset(mRuntime->createDynamicAllocator(1, true)); + mBufferPool.reset(mRuntime->createDynamicAllocator(0, true)); } if (1 == index) { mCurrentAllocator = mBufferPoolShapeImmutable.get(); @@ -315,9 +328,7 @@ MemChunk chunk() override { } id MetalBackend::getHostBuffer(size_t size) const { - if (size < METAL_CONST_BUFFER_LIMIT) { - size = METAL_CONST_BUFFER_LIMIT; - } + size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT; // reuse if (nullptr != mHostBuffer && mHostBuffer.length >= size) { return mHostBuffer; @@ -703,7 +714,7 @@ static void _execute(id encoder, const MetalBackend::C if(!mFrameEncodeCache) { commit_net(); } - + _resetDynamicMemory(); onCopyBuffer(src, dst, nil, nil); } @@ -983,6 +994,10 @@ static void _execute(id encoder, const MetalBackend::C auto ctx = (__bridge MNNMetalContext *)mContext; std::shared_ptr allocator(new MetalRuntimeAllocator([ctx device])); mStatic.reset(new EagerBufferAllocator(allocator)); + mDynamic.resize(METAL_SEPERATE_MAX_COUNT); + for (auto& buf : mDynamic) { + buf.root = allocator; + } mTunedInfo = new 
TunedInfo; } @@ -1067,7 +1082,11 @@ static void _execute(id encoder, const MetalBackend::C float MetalRuntime::onGetMemoryInMB() { auto staticMemoryInMB = mStatic->totalSize() / 1024.0f / 1024.0f; - return staticMemoryInMB; + float dynamicMemoryInMB = 0.0f; + for (auto& buf : mDynamic) { + dynamicMemoryInMB += buf.currentSize / 1024.0f / 1024.0f; + } + return staticMemoryInMB + dynamicMemoryInMB; } void MetalRuntime::onMaskOpReady(const std::vector& inputs, const std::vector& outputs, @@ -1153,7 +1172,36 @@ static bool _checkTensorInfo(const MetalCache::TensorInfoT* dst, const Tensor* s return true; } -Backend* MetalRuntime::onCreate(const BackendConfig* config) const { +class MetalWrapAllocator : public BufferAllocator::Allocator { +private: + std::shared_ptr mOrigin; + id mDevice; +public: + MetalWrapAllocator(std::shared_ptr origin, id device) : mOrigin(origin), mDevice(device) {} + virtual ~ MetalWrapAllocator() { + // Do nothing + } + virtual MemChunk onAlloc(size_t size, size_t align) override { + auto mem = mOrigin->onAlloc(size, align); + MNN_ASSERT(mem.second == 0); + id buffer = [mDevice newBufferWithBytesNoCopy:mem.first length:size options:MTLResourceStorageModeShared deallocator:nil]; + auto wrap = new MetalRuntimeAllocator::MetalBufferAlloc(buffer); + return MemChunk((void *)wrap, 0); + } + virtual void onRelease(MemChunk chunk) override { + auto mem = (MetalRuntimeAllocator::MetalBufferAlloc *)chunk.first; + mOrigin->onRelease(MemChunk(mem->getBuffer().contents)); + delete mem; + } +}; +Backend* MetalRuntime::onCreate(const BackendConfig* config, Backend* origin) const { + if (hint().weightMemoryPath.size() > 0 && mStaticCache.get() == nullptr) { + auto ctx = (__bridge MNNMetalContext *)mContext; + auto mmap = BufferAllocator::Allocator::createMmap(hint().weightMemoryPath.c_str(), "metal.weight"); + std::shared_ptr mmapMem(new MetalWrapAllocator(mmap, [ctx device])); + mStaticCache = mStatic; + mStatic.reset(new EagerBufferAllocator(mmapMem, 32, 1024 * 1024 * 1024)); + } BackendConfig::PrecisionMode precision = mDefaultConfig.precision; if (nullptr != config) { precision = config->precision; @@ -1164,6 +1212,11 @@ static bool _checkTensorInfo(const MetalCache::TensorInfoT* dst, const Tensor* s void MetalRuntime::onGabageCollect(int level) { mStatic->release(false); + if (level >= 100) { + for (auto& buf : mDynamic) { + buf.release(); + } + } } std::pair MetalRuntime::onGetCache() {//make Cache diff --git a/source/backend/metal/MetalBinary.mm b/source/backend/metal/MetalBinary.mm index b9482ce84..6854a5578 100755 --- a/source/backend/metal/MetalBinary.mm +++ b/source/backend/metal/MetalBinary.mm @@ -40,9 +40,9 @@ void MetalBinary::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { auto input0 = inputs[0], input1 = inputs[1], output = outputs[0]; [encoder setComputePipelineState:mPipeline]; - [encoder setBuffer:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input0->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input0)->extra.offset atIndex:0]; - [encoder setBuffer:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input1->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input1)->extra.offset atIndex:1]; - [encoder setBuffer:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:2]; + MetalBackend::setTensor(input0, encoder, 0); + MetalBackend::setTensor(input1, encoder, 1); + MetalBackend::setTensor(output, encoder, 2); [encoder 
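MetalWrapAllocator above lets the static weight pool draw its bytes from a memory-mapped file when hint().weightMemoryPath is set: the wrapper delegates the raw allocation to the mmap allocator and hands back a chunk that the backend wraps into a device buffer (on Metal via newBufferWithBytesNoCopy on the mapped pointer). A stripped-down sketch of the wrapper pattern; the types below are placeholders, not MNN's BufferAllocator interfaces:

#include <cstddef>
#include <memory>
#include <utility>

struct DemoChunk {
    void*  ptr    = nullptr;
    size_t offset = 0;
};

struct DemoAllocator {
    virtual ~DemoAllocator() = default;
    virtual DemoChunk onAlloc(size_t size, size_t align) = 0;
    virtual void onRelease(DemoChunk chunk) = 0;
};

// Delegates the raw bytes to an inner (e.g. file-mmap) allocator and wraps the result.
struct DemoWrapAllocator : DemoAllocator {
    explicit DemoWrapAllocator(std::shared_ptr<DemoAllocator> origin) : mOrigin(std::move(origin)) {}
    DemoChunk onAlloc(size_t size, size_t align) override {
        DemoChunk raw = mOrigin->onAlloc(size, align);
        // A real backend would wrap raw.ptr into a device handle here
        // (Metal does this with newBufferWithBytesNoCopy on the mapped pointer).
        return raw;
    }
    void onRelease(DemoChunk chunk) override { mOrigin->onRelease(chunk); }
    std::shared_ptr<DemoAllocator> mOrigin;
};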
setBuffer:mConstBuffer offset:0 atIndex:3]; [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second]; } diff --git a/source/backend/metal/MetalConvolutionDepthwise.mm b/source/backend/metal/MetalConvolutionDepthwise.mm index 85b17c88f..cb3036225 100755 --- a/source/backend/metal/MetalConvolutionDepthwise.mm +++ b/source/backend/metal/MetalConvolutionDepthwise.mm @@ -73,7 +73,13 @@ mConstBuffer, (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(), ((MetalRuntimeAllocator::MetalBufferAlloc *)mBias->deviceId())->getBuffer(), nil]; const Tensor* weight = mWeight.get(); const Tensor* bias = mBias.get(); - int buffer_offset[] = {TensorUtils::getDescribe(input)->extra.offset, TensorUtils::getDescribe(output)->extra.offset, TensorUtils::getDescribe(weight)->extra.offset, TensorUtils::getDescribe(bias)->extra.offset, 0}; + int buffer_offset[] = { + TensorUtils::getDescribe(input)->extra.offset, + TensorUtils::getDescribe(output)->extra.offset, + 0, + TensorUtils::getDescribe(weight)->extra.offset, + TensorUtils::getDescribe(bias)->extra.offset + }; std::string name = "conv_depthwise"; MetalRuntime *rt = (MetalRuntime *)backend->runtime(); diff --git a/source/backend/metal/MetalUnary.mm b/source/backend/metal/MetalUnary.mm index bc66f77a2..72b91f874 100755 --- a/source/backend/metal/MetalUnary.mm +++ b/source/backend/metal/MetalUnary.mm @@ -122,8 +122,8 @@ kernel void main0(const device T *in [[buffer(0)]], \ void MetalUnary::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { auto input = inputs[0], output = outputs[0]; [encoder setComputePipelineState:mPipeline]; - [encoder setBuffer:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer() offset:TensorUtils::getDescribe(input)->extra.offset atIndex:0]; - [encoder setBuffer:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId())->getBuffer() offset:TensorUtils::getDescribe(output)->extra.offset atIndex:1]; + MetalBackend::setTensor(input, encoder, 0); + MetalBackend::setTensor(output, encoder, 1); [encoder setBuffer:mConstBuffer offset:0 atIndex:2]; [encoder dispatchThreadgroups:mThreads.first threadsPerThreadgroup:mThreads.second]; } diff --git a/source/backend/nnapi/backend/NNAPIBackend.cpp b/source/backend/nnapi/backend/NNAPIBackend.cpp index 81ac425e0..a60b0b90f 100644 --- a/source/backend/nnapi/backend/NNAPIBackend.cpp +++ b/source/backend/nnapi/backend/NNAPIBackend.cpp @@ -549,7 +549,7 @@ namespace MNN { NNAPIRuntime::~NNAPIRuntime() {} - Backend* NNAPIRuntime::onCreate(const BackendConfig* config) const { + Backend* NNAPIRuntime::onCreate(const BackendConfig* config, Backend* origin) const { return new NNAPIBackend(this); } diff --git a/source/backend/nnapi/backend/NNAPIBackend.hpp b/source/backend/nnapi/backend/NNAPIBackend.hpp index 17e947973..ac6b462b3 100644 --- a/source/backend/nnapi/backend/NNAPIBackend.hpp +++ b/source/backend/nnapi/backend/NNAPIBackend.hpp @@ -50,7 +50,7 @@ namespace MNN { NNAPIRuntime(const Backend::Info& info); virtual ~NNAPIRuntime(); virtual CompilerType onGetCompilerType() const override; - virtual Backend* onCreate(const BackendConfig* conf) const override; + virtual Backend* onCreate(const BackendConfig* conf, Backend* origin) const override; virtual void onGabageCollect(int level) override; virtual std::pair onGetCache() override { return std::make_pair(mCacheBuffer, mCacheSize); diff --git a/source/backend/opencl/core/BufferConvertor.cpp b/source/backend/opencl/core/BufferConvertor.cpp 
index 1d649a0b8..1f6abd82b 100644 --- a/source/backend/opencl/core/BufferConvertor.cpp +++ b/source/backend/opencl/core/BufferConvertor.cpp @@ -170,82 +170,6 @@ bool converNCHWOrNHWCBufferToNC4HW4OrNC16HW16Buffer(const Tensor *input, Tensor return true; } -bool convertNC4HW4BufferToNC4HW4Buffer(const Tensor *input, Tensor *output, OpenCLRuntime *runtime, TransType formatTrans, bool needWait, bool svmFlag, bool srcswap, bool dstswap) { - std::vector outputShape = tensorShapeFormat(input); - uint32_t outputGlobalWorkSize[2] = {static_cast(UP_DIV(outputShape[3], 4) * outputShape[2]), - static_cast(outputShape[0] * outputShape[1])}; - std::set buildOptions; - std::string kernelName = "nc4hw4_buffer_to_nc4hw4_buffer"; - switch (formatTrans) { - case InpTrans: - AddBuildOptionOfDataType(input, output, buildOptions, runtime->isSupportedFP16(), true, false); - break; - case OutTrans: - AddBuildOptionOfDataType(input, output, buildOptions, runtime->isSupportedFP16(), false, true); - break; - default: - AddBuildOptionOfDataType(input, output, buildOptions, runtime->isSupportedFP16(), true, true); - break; - } - auto convertBufferKernelW = runtime->buildKernelWithCache("buffer_convert_buf", kernelName, buildOptions); - auto convertBufferKernel = convertBufferKernelW->get(); - uint32_t idx = 0; - int outputImageShape[2] = {input->height(), input->width()}; - int channelC4 = UP_DIV(input->channel(), 4); - int batch = input->batch(); - int srcStride[2] = { - channelC4, - 1 - }; - int dstStride[2] = { - channelC4, - 1 - }; - if (srcswap) { - srcStride[0] = 1; - srcStride[1] = batch; - } - if (dstswap) { - dstStride[0] = 1; - dstStride[1] = batch; - } - cl_int ret = CL_SUCCESS; - ret |= convertBufferKernel.setArg(idx++, outputGlobalWorkSize[0]); - ret |= convertBufferKernel.setArg(idx++, outputGlobalWorkSize[1]); -#ifdef MNN_OPENCL_SVM_ENABLE - if(svmFlag == true) - { - ret |= clSetKernelArgSVMPointer(convertBufferKernel.get(), idx++, (const void *)input->buffer().device); - } - else -#endif - { - ret |= convertBufferKernel.setArg(idx++, openCLBuffer(input)); - } - ret |= convertBufferKernel.setArg(idx++, sizeof(outputImageShape), outputImageShape); - ret |= convertBufferKernel.setArg(idx++, sizeof(srcStride), srcStride); - ret |= convertBufferKernel.setArg(idx++, sizeof(dstStride), dstStride); - ret |= convertBufferKernel.setArg(idx++, openCLBuffer(output)); - MNN_CHECK_CL_SUCCESS(ret, "setArg convertNC4HW4BufferToNC4HW4Buffer"); - - const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(convertBufferKernelW)); - const std::vector lws = {16, std::max((uint32_t)1, maxWorkGroupSize / 16)}; - cl::Event event; - cl_int res; - std::vector roundUpGroupWorkSize(lws.size()); - for (size_t i = 0; i < lws.size(); ++i) { - roundUpGroupWorkSize[i] = ROUND_UP(outputGlobalWorkSize[i], lws[i]); - } - res = runtime->commandQueue().enqueueNDRangeKernel(convertBufferKernel, cl::NullRange, - cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1]), - cl::NDRange(lws[0], lws[1]), nullptr, &event); - MNN_CHECK_CL_SUCCESS(res, "nc4hw4_buffer_to_nc4hw4_buffer"); - if (true == needWait) { - event.wait(); - } - return true; -} - #ifdef MNN_SUPPORT_INTEL_SUBGROUP bool convertNC4HW4BufferBetweenNC16HW16Buffer(const Tensor *input, Tensor *output, const std::string Name, OpenCLRuntime *runtime, TransType formatTrans, bool needWait, bool svmFlag, @@ -511,6 +435,145 @@ bool BufferConvertor::convertToNC4HW4Buffer(const Tensor *buffer, const OpenCLBu #endif return true; } + +bool convertBufferToBuffer(Tensor 
*input, Tensor *output, OpenCLRuntime *runtime, bool toDevice, bool toHost, bool needWait, bool svmFlag) { + std::vector outputShape = tensorShapeFormat(input); + int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W + auto srcDimensionFormat = TensorUtils::getDescribe(input)->dimensionFormat; + auto dstDimensionFormat = TensorUtils::getDescribe(output)->dimensionFormat; + if (MNN_DATA_FORMAT_NC4HW4 == dstDimensionFormat && srcDimensionFormat != dstDimensionFormat && (outputShape[3] % 4) != 0){ + int region[] = {outputShape[0], ROUND_UP(outputShape[3], 4), outputShape[1], outputShape[2]};//nchw + + auto kernelW = runtime->buildKernelWithCache("raster_buf", "buffer_set_zero", {}, output, output); + auto kernel = kernelW->get(); + uint32_t lws[2] = {8, 8}; + uint32_t gws[2] = {(uint32_t)UP_DIV((region[2] * region[3]), 8)*8, (uint32_t)UP_DIV((region[0] * region[1]), 8)*8}; + + int global_dim0 = region[2] * region[3]; + int global_dim1 = region[0] * region[1]; + + uint32_t idx = 0; + cl_int res = CL_SUCCESS; + res |= kernel.setArg(idx++, global_dim0); + res |= kernel.setArg(idx++, global_dim1); + res |= kernel.setArg(idx++, openCLBuffer(output)); + MNN_CHECK_CL_SUCCESS(res, "setArg buffer_set_zero"); + + res = runtime->commandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1]), + cl::NDRange(lws[0], lws[1]), nullptr, nullptr); + MNN_CHECK_CL_SUCCESS(res, "buffer_set_zero"); + } + if (srcDimensionFormat == dstDimensionFormat && MNN_DATA_FORMAT_NC4HW4 != dstDimensionFormat){ + int size = outputShape[0] * outputShape[1] * outputShape[2] * outputShape[3]; + uint32_t gws[2] = {static_cast(UP_DIV(size, 4)), static_cast(1)}; + std::set buildOptions; + if(size % 4 != 0){ + buildOptions.emplace("-DPACK_LEAVE"); + } + AddBuildOptionOfDataType(input, output, buildOptions, runtime->isSupportedFP16(), toDevice, toHost); + auto convertBufferKernelW = runtime->buildKernelWithCache("buffer_convert_buf", "buffer_copy_to_buffer", buildOptions); + auto convertBufferKernel = convertBufferKernelW->get(); + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= convertBufferKernel.setArg(idx++, gws[0]); + ret |= convertBufferKernel.setArg(idx++, gws[1]); +#ifdef MNN_OPENCL_SVM_ENABLE + if(svmFlag == true && toDevice) { + ret |= clSetKernelArgSVMPointer(convertBufferKernel.get(), idx++, (const void *)input->deviceId()); + } + else +#endif + { + ret |= convertBufferKernel.setArg(idx++, openCLBuffer(input)); + } +#ifdef MNN_OPENCL_SVM_ENABLE + if(svmFlag == true && toHost) { + ret |= clSetKernelArgSVMPointer(convertBufferKernel.get(), idx++, (const void *)output->deviceId()); + } + else +#endif + { + ret |= convertBufferKernel.setArg(idx++, openCLBuffer(output)); + } + ret |= convertBufferKernel.setArg(idx++, size); + MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer"); + + const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(convertBufferKernelW)); + const std::vector lws = {16, std::max((uint32_t)1, maxWorkGroupSize / 16)}; + cl::Event event; + cl_int res; + std::vector roundUpGroupWorkSize(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundUpGroupWorkSize[i] = ROUND_UP(gws[i], lws[i]); + } + + res = runtime->commandQueue().enqueueNDRangeKernel(convertBufferKernel, cl::NullRange, + cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1]), + cl::NDRange(lws[0], lws[1]), nullptr, &event); + MNN_CHECK_CL_SUCCESS(res, "buffer_convert_to_buffer"); + + if (true == needWait) { + event.wait(); + } + } else{ + 
uint32_t gws[3] = {static_cast(shape[2] * shape[3]), + static_cast(shape[1]), + static_cast(shape[0])}; + std::set buildOptions; + buildOptions.emplace("-DINPUT_FORMAT=" + std::to_string(srcDimensionFormat)); + buildOptions.emplace("-DOUTPUT_FORMAT=" + std::to_string(dstDimensionFormat)); + AddBuildOptionOfDataType(input, output, buildOptions, runtime->isSupportedFP16(), toDevice, toHost); + auto convertBufferKernelW = runtime->buildKernelWithCache("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions); + auto convertBufferKernel = convertBufferKernelW->get(); + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= convertBufferKernel.setArg(idx++, gws[0]); + ret |= convertBufferKernel.setArg(idx++, gws[1]); + ret |= convertBufferKernel.setArg(idx++, gws[2]); +#ifdef MNN_OPENCL_SVM_ENABLE + if(svmFlag == true && toDevice) { + ret |= clSetKernelArgSVMPointer(convertBufferKernel.get(), idx++, (const void *)input->deviceId()); + } + else +#endif + { + ret |= convertBufferKernel.setArg(idx++, openCLBuffer(input)); + } + + ret |= convertBufferKernel.setArg(idx++, sizeof(shape), shape); +#ifdef MNN_OPENCL_SVM_ENABLE + if(svmFlag == true && toHost) { + ret |= clSetKernelArgSVMPointer(convertBufferKernel.get(), idx++, (const void *)output->deviceId()); + } + else +#endif + { + ret |= convertBufferKernel.setArg(idx++, openCLBuffer(output)); + } + MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer"); + + const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(convertBufferKernelW)); + const std::vector lws = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1}; + cl::Event event; + cl_int res; + std::vector roundUpGroupWorkSize(lws.size()); + for (size_t i = 0; i < lws.size(); ++i) { + roundUpGroupWorkSize[i] = ROUND_UP(gws[i], lws[i]); + } + + res = runtime->commandQueue().enqueueNDRangeKernel(convertBufferKernel, cl::NullRange, + cl::NDRange(roundUpGroupWorkSize[0], roundUpGroupWorkSize[1], roundUpGroupWorkSize[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + MNN_CHECK_CL_SUCCESS(res, "buffer_convert_to_buffer"); + + if (true == needWait) { + event.wait(); + } + } + return true; +} + } // namespace OpenCL } // namespace MNN #endif /* MNN_OPENCL_BUFFER_CLOSED */ diff --git a/source/backend/opencl/core/BufferConvertor.hpp b/source/backend/opencl/core/BufferConvertor.hpp index 71514acbf..b1843226e 100644 --- a/source/backend/opencl/core/BufferConvertor.hpp +++ b/source/backend/opencl/core/BufferConvertor.hpp @@ -26,14 +26,13 @@ bool convertNC4HW4OrNC16HW16BufferToNCHWOrNHWCBuffer(const Tensor *input, Tensor enum TransType {InpTrans = 0, OutTrans = 1, NoTrans = 2}; -bool convertNC4HW4BufferToNC4HW4Buffer(const Tensor *input, Tensor *output, - OpenCLRuntime *runtime, TransType formatTrans = NoTrans, bool needWait = false, bool svmFlag = false, bool srcswap = false, bool dstswap = false); - #ifdef MNN_SUPPORT_INTEL_SUBGROUP bool convertNC4HW4BufferBetweenNC16HW16Buffer(const Tensor *input, Tensor *output, const std::string Name, OpenCLRuntime *runtime, TransType formatTrans = NoTrans, bool needWait = false, bool svmFlag = false, bool srcswap = false, bool dstswap = false); #endif + +bool convertBufferToBuffer(Tensor *input, Tensor *output, OpenCLRuntime *runtime, bool toDevice, bool toHost, bool needWait = false, bool svmFlag = false); class BufferConvertor { public: diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp index f05c0b2e5..67e0a1a81 100644 --- 
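Both branches of convertBufferToBuffer above enqueue their kernels with the global work size rounded up to a multiple of the chosen local size. Assuming the usual integer definitions of UP_DIV and ROUND_UP, the rounding amounts to:

#include <cstddef>
#include <cstdint>
#include <vector>

static inline uint32_t upDiv(uint32_t x, uint32_t y)   { return (x + y - 1) / y; }
static inline uint32_t roundUp(uint32_t x, uint32_t y) { return upDiv(x, y) * y; }

static std::vector<uint32_t> roundUpGlobalWorkSize(const std::vector<uint32_t>& gws,
                                                   const std::vector<uint32_t>& lws) {
    std::vector<uint32_t> rounded(gws.size());
    for (size_t i = 0; i < gws.size(); ++i) {
        rounded[i] = roundUp(gws[i], lws[i]); // e.g. gws = 18, lws = 16 -> 32
    }
    return rounded;
}

The unpadded sizes are still passed to the kernels as arguments, so the padded tail can be skipped on the device side.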
a/source/backend/opencl/core/OpenCLBackend.cpp +++ b/source/backend/opencl/core/OpenCLBackend.cpp @@ -191,7 +191,7 @@ std::pair CLRuntime::onGetCache() { return mOpenCLRuntime->makeCache(mTunedInfo); } -Backend* CLRuntime::onCreate(const BackendConfig* config) const { +Backend* CLRuntime::onCreate(const BackendConfig* config, Backend* origin) const { // FIXME: Use config info return new OpenCLBackend(mImagePool, mBufferPool, this); } @@ -413,6 +413,9 @@ Backend::MemObj* OpenCLBackend::onAcquire(const Tensor* nativeTensor, StorageTyp } bool OpenCLBackend::onSelectDynamicAllocator(int index, int maxIndex) { + if (mUseRecordQueue && false == mDevideOpRecord){ + return false; + } if (maxIndex > 2) { return false; } @@ -702,25 +705,7 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens } } else #endif - { - switch (data_format) { - case MNN_DATA_FORMAT_NHWC: - OpenCL::convertNC4HW4OrNC16HW16BufferToNCHWOrNHWCBuffer(srcTensor, const_cast(dstTensor), - "nc4hw4_buffer_to_nhwc_buffer", mOpenCLRuntime.get(), true, false, svmFlag); - break; - case MNN_DATA_FORMAT_NCHW: - OpenCL::convertNC4HW4OrNC16HW16BufferToNCHWOrNHWCBuffer(srcTensor, const_cast(dstTensor), - "nc4hw4_buffer_to_nchw_buffer", mOpenCLRuntime.get(), true, false, svmFlag); - break; - case MNN_DATA_FORMAT_NC4HW4: - OpenCL::convertNC4HW4BufferToNC4HW4Buffer(srcTensor, const_cast(dstTensor), - mOpenCLRuntime.get(), OutTrans, false, svmFlag, false, true); - break; - default: - MNN_PRINT("output data format not support!\n"); - break; - } - } + OpenCL::convertBufferToBuffer(const_cast(srcTensor), const_cast(dstTensor), mOpenCLRuntime.get(), false, true, true, svmFlag); } else #endif /* MNN_OPENCL_BUFFER_CLOSED */ @@ -755,18 +740,41 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens void OpenCLBackend::copyFromDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{ auto needSize = dstTensor->size(); - + auto shape = tensorShapeFormat(srcTensor); + auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat; + auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + auto memType = dstTensor->buffer().flags; + bool directCopy = BUFFER == mOpenCLRuntime->getGpuMemType() + && (srcDimensionFormat == dstDimensionFormat || srcTensor->dimensions() <= 1) + && MNN::MNN_DATA_FORMAT_NC4HW4 != dstDimensionFormat && MNN_DATA_FORMAT_NC4HW4 != srcDimensionFormat + && (getDataType(srcTensor) == getDataType(dstTensor)) + && memType != MNN_FORWARD_OPENCL + && memType != MNN_FORWARD_OPENGL; + if (mOpenCLRuntime->isSupportedFP16()) { // Fp16 + if (dstTensor->getType().code == halide_type_float) { + directCopy = false; + } + } + if(mOpenCLRuntime->isSupportedIntelSubgroup()){ + int cPack = TensorUtils::getTensorChannelPack(srcTensor); + if (cPack == 16){ + directCopy = false; + } + } void* hostPtr = dstTensor->host(); + if(directCopy){ + mOpenCLRuntime->commandQueue().enqueueReadBuffer(openCLBuffer(srcTensor), CL_TRUE, 0, needSize, hostPtr); + return; + } _allocHostBuffer(needSize, dstTensor); MNN::Tensor interTensor(dstTensor, dstTensor->getDimensionType(), false); interTensor.buffer().device = (uint64_t)mHostBuffer.second.get(); - - MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + TensorUtils::getDescribe(&interTensor)->dimensionFormat = dstDimensionFormat; //Convert format - mCLRuntime->convertFromDevice(srcTensor, (const Tensor*)&interTensor, data_format, false); + mCLRuntime->convertFromDevice(srcTensor, 
(const Tensor*)&interTensor, dstDimensionFormat, false); mOpenCLRuntime->printEventTime(); cl_int res; @@ -808,18 +816,7 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor } }else #endif - { - if (MNN_DATA_FORMAT_NHWC == data_format) { - OpenCL::converNCHWOrNHWCBufferToNC4HW4OrNC16HW16Buffer(srcTensor, const_cast(dstTensor), "nhwc_buffer_to_nc4hw4_buffer",mOpenCLRuntime.get(), true, false, svmFlag); - } else if (MNN_DATA_FORMAT_NCHW == data_format) { - OpenCL::converNCHWOrNHWCBufferToNC4HW4OrNC16HW16Buffer(srcTensor, const_cast(dstTensor), "nchw_buffer_to_nc4hw4_buffer",mOpenCLRuntime.get(), true, false, svmFlag); - } else if (MNN_DATA_FORMAT_NC4HW4 == data_format) { - OpenCL::convertNC4HW4BufferToNC4HW4Buffer(srcTensor, const_cast(dstTensor), mOpenCLRuntime.get(), InpTrans, false, svmFlag, true, false); - } else { - MNN_PRINT("input data format not support\n"); - MNN_ASSERT(false); - } - } + OpenCL::convertBufferToBuffer(const_cast(srcTensor), const_cast(dstTensor), mOpenCLRuntime.get(), true, false, false, svmFlag); } else #endif /* MNN_OPENCL_BUFFER_CLOSED */ @@ -853,28 +850,47 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTensor) const{ auto needSize = srcTensor->size(); auto shape = tensorShapeFormat(srcTensor); + auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat; + auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + auto memType = srcTensor->buffer().flags; + void* hostPtr = srcTensor->host(); // 1*1*1*1 don't need convert - if(BUFFER == mOpenCLRuntime->getGpuMemType() && 1 == shape[0] * shape[1] * shape[2] * shape[3]){ - void *tmpPtr; - void *hostPtr = srcTensor->host(); - if(srcTensor->getType().code == halide_type_float && mOpenCLRuntime->isSupportedFP16()) { - needSize /= 2; - void *tmpPtr = malloc(needSize); - ((half_float::half*)tmpPtr)[0] = (half_float::half)(((float*)hostPtr)[0]); - mOpenCLRuntime->commandQueue().enqueueWriteBuffer(openCLBuffer(dstTensor), CL_TRUE, 0, needSize, tmpPtr); - free(tmpPtr); - } else { - mOpenCLRuntime->commandQueue().enqueueWriteBuffer(openCLBuffer(dstTensor), CL_TRUE, 0, needSize, hostPtr); + if(srcTensor->getType().code == halide_type_float && mOpenCLRuntime->isSupportedFP16() && 1 == shape[0] * shape[1] * shape[2] * shape[3]){ + needSize /= 2; + void *tmpPtr = malloc(needSize); + ((half_float::half*)tmpPtr)[0] = (half_float::half)(((float*)hostPtr)[0]); + mOpenCLRuntime->commandQueue().enqueueWriteBuffer(openCLBuffer(dstTensor), CL_TRUE, 0, needSize, tmpPtr); + free(tmpPtr); + return; + } + + bool directCopy = BUFFER == mOpenCLRuntime->getGpuMemType() + && (srcDimensionFormat == dstDimensionFormat || srcTensor->dimensions() <= 1) + && MNN_DATA_FORMAT_NC4HW4 != dstDimensionFormat && MNN_DATA_FORMAT_NC4HW4 != srcDimensionFormat + && (getDataType(srcTensor) == getDataType(dstTensor)) + && memType != MNN_FORWARD_OPENCL + && memType != MNN_FORWARD_OPENGL; + if (mOpenCLRuntime->isSupportedFP16()) { // Fp16 + if (dstTensor->getType().code == halide_type_float) { + directCopy = false; + } + } + if(mOpenCLRuntime->isSupportedIntelSubgroup()){ + int cPack = TensorUtils::getTensorChannelPack(dstTensor); + if (cPack == 16){ + directCopy = false; } + } + if(directCopy){ + mOpenCLRuntime->commandQueue().enqueueWriteBuffer(openCLBuffer(dstTensor), CL_TRUE, 0, needSize, hostPtr); return; } - void* hostPtr = srcTensor->host(); - _allocHostBuffer(needSize, 
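The directCopy fast path added to copyFromDevice / copyToDevice above skips the staging buffer and the format-convert kernel entirely, issuing a plain enqueueReadBuffer / enqueueWriteBuffer. Restated as a predicate (enum and parameter names here are illustrative), the copy is only taken when no layout, type, or precision change is required:

enum DemoFormat { DEMO_NHWC, DEMO_NCHW, DEMO_NC4HW4 };

static bool canCopyDirectly(DemoFormat srcFormat, DemoFormat dstFormat, int dimensions,
                            bool sameDataType, bool fp16DeviceWithFloatHost,
                            bool intelSubgroupPack16, bool foreignDeviceMemory) {
    bool sameLayout = (srcFormat == dstFormat) || dimensions <= 1;  // <=1D tensors need no reorder
    return sameLayout
        && srcFormat != DEMO_NC4HW4 && dstFormat != DEMO_NC4HW4     // packed layout always needs a kernel
        && sameDataType                                             // no quant/dequant on the way
        && !fp16DeviceWithFloatHost                                 // fp16 device data must be widened first
        && !intelSubgroupPack16                                     // C16-packed tensors need conversion
        && !foreignDeviceMemory;                                    // OpenCL/OpenGL shared memory goes through staging
}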
srcTensor); MNN::Tensor interTensor(srcTensor, srcTensor->getDimensionType(), false); interTensor.buffer().device = (uint64_t)mHostBuffer.second.get(); + TensorUtils::getDescribe(&interTensor)->dimensionFormat = srcDimensionFormat; #ifdef ENABLE_OPENCL_TIME_PROFILER mOpenCLRuntime->commandQueue().finish(); @@ -891,8 +907,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso #endif //Covert format - MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(srcTensor)->dimensionFormat; - mCLRuntime->convertToDevice((const Tensor*)&interTensor, dstTensor, data_format, false); + mCLRuntime->convertToDevice((const Tensor*)&interTensor, dstTensor, srcDimensionFormat, false); return; } @@ -904,6 +919,7 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst mCLRuntime->copyBetweenDevice(srcTensor, dstTensor); } else { const Tensor* copyTensor = MNN_FORWARD_CPU != srcMemtype ? srcTensor : dstTensor; + MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(copyTensor)->dimensionFormat; int memType = MNN_FORWARD_CPU != srcMemtype ? srcMemtype : dstMemtype; if(MNN_FORWARD_OPENCL != memType && MNN_FORWARD_OPENGL != memType){ MNN_PRINT("Unsupport ForwardType %d for OpenCL backend!\n", memType); @@ -916,6 +932,7 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst _allocHostBuffer(0, copyTensor); MNN::Tensor interTensor(copyTensor, copyTensor->getDimensionType(), false); + TensorUtils::getDescribe(&interTensor)->dimensionFormat = data_format; if(MNN_FORWARD_OPENCL == memType ){ interTensor.buffer().device = (uint64_t)mDeviceBuffer; }else if(MNN_FORWARD_OPENGL == memType){ @@ -924,7 +941,6 @@ void OpenCLBackend::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dst interTensor.buffer().device = (uint64_t)mHostBuffer.second.get(); } //Covert format - MNN_DATA_FORMAT data_format = TensorUtils::getDescribe(copyTensor)->dimensionFormat; if(MNN_FORWARD_CPU != srcMemtype){ mCLRuntime->convertToDevice((const Tensor*)&interTensor, dstTensor, data_format, false, srcMemtype); }else{ @@ -937,7 +953,7 @@ void CLRuntime::copyBetweenDevice(const Tensor* srcTensor, const Tensor* dstTens #ifndef MNN_OPENCL_BUFFER_CLOSED if(mOpenCLRuntime->getGpuMemType() == BUFFER) { - OpenCL::convertNC4HW4BufferToNC4HW4Buffer(srcTensor, const_cast(dstTensor), mOpenCLRuntime.get(), NoTrans); + OpenCL::convertBufferToBuffer(const_cast(srcTensor), const_cast(dstTensor), mOpenCLRuntime.get(), true, true); } else #endif /* MNN_OPENCL_BUFFER_CLOSED */ @@ -1166,7 +1182,7 @@ class CLRuntimeCreator : public RuntimeCreator { } }; -DataType OpenCLBackend::getDataType(const Tensor* tensor) { +DataType OpenCLBackend::getDataType(const Tensor* tensor) const{ auto des = TensorUtils::getDescribe(tensor); if (nullptr == des->quantAttr.get()) { return DataType_DT_FLOAT; diff --git a/source/backend/opencl/core/OpenCLBackend.hpp b/source/backend/opencl/core/OpenCLBackend.hpp index 6e50d25ba..3f0abcefb 100644 --- a/source/backend/opencl/core/OpenCLBackend.hpp +++ b/source/backend/opencl/core/OpenCLBackend.hpp @@ -48,7 +48,7 @@ class CLRuntime : public Runtime { CLRuntime(const Backend::Info& info, int platformSize, int platformId, int deviceId = 0, void *contextPtr = nullptr, void *glshared = nullptr); virtual ~CLRuntime(); - virtual Backend* onCreate(const BackendConfig* config) const override; + virtual Backend* onCreate(const BackendConfig* config, Backend* origin) const override; virtual void onReset(int numberThread, const BackendConfig* config, bool full) 
override; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; @@ -122,7 +122,7 @@ class OpenCLBackend : public Backend { } float getBytes(const Tensor* tensor); - DataType getDataType(const Tensor* tensor); + DataType getDataType(const Tensor* tensor) const; cl_channel_type fpType(); int fpBytes(); diff --git a/source/backend/opencl/core/OpenCLGemmTune.cpp b/source/backend/opencl/core/OpenCLGemmTune.cpp index 388fba6f0..8f1b63c50 100644 --- a/source/backend/opencl/core/OpenCLGemmTune.cpp +++ b/source/backend/opencl/core/OpenCLGemmTune.cpp @@ -127,24 +127,86 @@ static bool isCandidateValid(uint32_t kwg, uint32_t kwi, uint32_t mwg, uint32_t return true; } + +static bool GemmlocalWSTune(const std::map, std::pair, uint32_t>>>> &tuneMap, const std::vector &gemmSize, std::vector& res, OpenCLRuntime *runtime){ + auto iter = tuneMap.find("Xgemm_tune"); + if(iter == tuneMap.end()){ + return false; + } + auto gwsAndLws = iter->second; + uint32_t minPoint = UINT_MAX; + int index = -1; + for(int i = 0; i < gwsAndLws.size(); ++i){ + // Layout+Precision, Batch, Bias+GroupSize must equall + if(gemmSize[3] != gwsAndLws[i].first[3] || gemmSize[4] != gwsAndLws[i].first[4] || gemmSize[5] != gwsAndLws[i].first[5]){ + continue; + } + auto combinations = gwsAndLws[i].second.first; + uint32_t kwg = combinations[0]; + uint32_t kwi = combinations[1]; + uint32_t mdima = combinations[2]; + uint32_t mdimc = combinations[3]; + uint32_t mwg = combinations[4]; + uint32_t ndimb = combinations[5]; + uint32_t ndimc = combinations[6]; + uint32_t nwg = combinations[7]; + uint32_t sa = combinations[8]; + uint32_t sb = combinations[9]; + uint32_t strm = combinations[10]; + uint32_t strn = combinations[11]; + uint32_t vwm = combinations[12]; + uint32_t vwn = combinations[13]; + + if(!isCandidateValid(kwg, kwi, mwg, mdimc, vwm, nwg, ndimc, vwn, mdima, ndimb, sa, sb, runtime, gemmSize)) { + continue; + } + uint32_t point = 0; + for(int j = 0; j < 3; ++j){ + point += std::abs(static_cast(gemmSize[j]) - static_cast(gwsAndLws[i].first[j])); + } + + if(point < minPoint){ + index = i; + minPoint = point; + } + } + if(index != -1){ + res = gwsAndLws[index].second.first; + } else{ + return false; + } + return true; +} std::vector getGemmParams(const std::vector &gemmSize, const std::vector tensorMemory, OpenCLRuntime *runtime) { - MNN_ASSERT(gemmSize.size() == 6); // M, N, K, Layout, Batch, Bias + MNN_ASSERT(gemmSize.size() == 6); // M, N, K, Layout+Precision, Batch, Bias+GroupSize MNN_ASSERT(gemmSize[0] % 16 == 0); MNN_ASSERT(gemmSize[1] % 16 == 0); MNN_ASSERT(gemmSize[2] % 4 == 0); - MNN_ASSERT((gemmSize[5] == 0 && tensorMemory.size() == 3) || (gemmSize[5] >= 1 && tensorMemory.size() == 4)); + int layoutType = gemmSize[3] % 10; + int mixPrecision = gemmSize[3] / 10; + int biasType = gemmSize[5] % 10; + int groupSize = gemmSize[5] / 10 + 1; + MNN_ASSERT((biasType == 0 && tensorMemory.size() == 3) || (biasType >= 1 && tensorMemory.size() == 4)); auto& tunedGemmParams = runtime->tunedGemmParamsMap(); + auto& tuneLws = runtime->getTuneLwsMap(); std::vector info(gemmSize); - uint32_t isFp16 = runtime->isSupportedFP16(); - info.emplace_back(isFp16); + uint32_t precisionType = runtime->getPrecisionLevel(); + if(precisionType == 2 && mixPrecision > 0) { + precisionType = 0; + } + info.emplace_back(precisionType); if (tunedGemmParams.find(info) != tunedGemmParams.end()) { return tunedGemmParams[info]; } + std::vector tuneLwsRes; + if(GemmlocalWSTune(tuneLws, gemmSize, tuneLwsRes, runtime)){ + return 
tuneLwsRes; + } if(runtime->getCLTuneLevel() == None) { auto getMaxDivisor = [](uint32_t num) -> uint32_t { @@ -201,6 +263,8 @@ std::vector getGemmParams(const std::vector &gemmSize, const totalCombinations.push_back({16, 2, 16, 16, 128, 8 , 8 , 64 , 0, 0, 1, 1, 2, 8});//2 totalCombinations.push_back({16, 2, 16, 16, 128, 8 , 8 , 128, 0, 0, 0, 0, 8, 8}); totalCombinations.push_back({16, 2, 8 , 8 , 16 , 8 , 8 , 128, 0, 0, 0, 0, 2, 8}); + totalCombinations.push_back({16, 2, 4, 4, 32, 8, 8, 32, 0, 0, 0, 0, 8, 2}); + totalCombinations.push_back({16, 2, 4, 4, 16, 8, 8, 32, 0, 0, 0, 0, 4, 2}); if(runtime->getCLTuneLevel() < Fast) { totalCombinations.push_back({16, 2, 16, 16, 128, 8 , 8 , 64 , 0, 0, 1, 0, 8, 8});//4 @@ -226,14 +290,17 @@ std::vector getGemmParams(const std::vector &gemmSize, const totalCombinations.push_back({16, 2, 8, 8, 32, 8, 8, 32, 0, 0, 1, 0, 2, 4}); totalCombinations.push_back({16, 2, 8, 8, 16, 8, 8, 32, 0, 0, 1, 1, 2, 4}); + totalCombinations.push_back({16, 2, 4, 4, 16, 8, 8, 64, 0, 0, 0, 0, 2, 8}); + totalCombinations.push_back({16, 2, 4, 4, 64, 8, 8, 32, 0, 0, 1, 0, 4, 4}); + totalCombinations.push_back({16, 2, 4, 4, 32, 8, 8, 64, 0, 0, 0, 1, 2, 4}); } } else { // get all combinations std::vector> candidates = { {16, 32}, // KWG {2}, // KWI - {8, 16}, // MDIMA - {8, 16}, // MDIMC + {4, 8, 16}, // MDIMA + {4, 8, 16}, // MDIMC {16, 32, 64, 128}, // MWG {8, 16}, // NDIMB {8, 16}, // NDIMC @@ -284,7 +351,7 @@ std::vector getGemmParams(const std::vector &gemmSize, const buildOptions.emplace("-DVWM=" + std::to_string(vwm)); buildOptions.emplace("-DVWN=" + std::to_string(vwn)); - if(gemmSize[3] >= 4) { + if(layoutType >= 4) { buildOptions.emplace(" -DOUTPUTMN"); } if(runtime->getGpuType() == GpuType::ADRENO) { @@ -292,24 +359,29 @@ std::vector getGemmParams(const std::vector &gemmSize, const buildOptions.emplace(" -DRELAX_WORKGROUP_SIZE=1"); } - if(gemmSize[5] >= 1) { - buildOptions.emplace(" -DBIAS_TYPE=" + std::to_string((int)gemmSize[5])); + if(biasType >= 1) { + buildOptions.emplace(" -DBIAS_TYPE=" + std::to_string((int)biasType)); + } + if(mixPrecision > 0) { + buildOptions.emplace("-DPRECISION_COMPUTE=float -DCONVERT_PRECISION_COMPUTE=convert_float"); + buildOptions.emplace("-DPRECISION_COMPUTE2=float2 -DCONVERT_PRECISION_COMPUTE2=convert_float2"); + buildOptions.emplace("-DPRECISION_COMPUTE4=float4 -DCONVERT_PRECISION_COMPUTE4=convert_float4"); + buildOptions.emplace("-DPRECISION_COMPUTE8=float8 -DCONVERT_PRECISION_COMPUTE8=convert_float8"); + buildOptions.emplace("-DPRECISION_COMPUTE16=float16 -DCONVERT_PRECISION_COMPUTE16=convert_float16"); } int localM = mdimc; int localN = ndimc; - std::shared_ptr kernel = runtime->buildKernel("matmul_params_buf", "Xgemm", buildOptions); - if(kernel == nullptr) { - continue; - } + std::shared_ptr kernel; if(gemmSize[4] > 1) { kernel = runtime->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); - if(kernel == nullptr) { - continue; - } + } else { + kernel = runtime->buildKernel("matmul_params_buf", "Xgemm", buildOptions); + } + if(kernel == nullptr) { + continue; } - if(localM * localN > runtime->getMaxWorkGroupSize(kernel)) { continue; } @@ -326,52 +398,56 @@ std::vector getGemmParams(const std::vector &gemmSize, const // A: [n, l, e] // B: [n, l, h] - cl::Event event; - int idx = 0; + int cost_time; + int idx = 0; cl_int ret = CL_SUCCESS; ret |= kernel->get().setArg(idx++, static_cast(gemmSize[0])); ret |= kernel->get().setArg(idx++, static_cast(gemmSize[1])); ret |= kernel->get().setArg(idx++, static_cast(gemmSize[2])); 
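// Editor's note (annotation, not part of the patch): gemmSize[3] and gemmSize[5] are now packed
// fields -- layoutType/mixPrecision and biasType/groupSize are recovered above with % 10 and / 10.
// A minimal sketch of the matching encode/decode pair, using hypothetical helper names; it assumes
// layoutType and biasType stay below 10, as in the callers visible in this patch.
#include <cstdint>
struct GemmTuneExtra { uint32_t layoutType, mixPrecision, biasType, groupSize; };
static inline uint32_t packLayout(uint32_t layoutType, uint32_t mixPrecision) {
    return layoutType + 10u * mixPrecision;     // e.g. 4 + 10*1 -> 14, the value used by the attention path
}
static inline uint32_t packBias(uint32_t biasType, uint32_t groupSize) {
    return biasType + 10u * (groupSize - 1u);   // groupSize >= 1
}
static inline GemmTuneExtra unpackGemmExtra(uint32_t packedLayout, uint32_t packedBias) {
    return { packedLayout % 10u, packedLayout / 10u, packedBias % 10u, packedBias / 10u + 1u };
}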
ret |= kernel->get().setArg(idx++, alpha); ret |= kernel->get().setArg(idx++, beta); + + int stride[4] = {(int)gemmSize[0], (int)gemmSize[1], (int)gemmSize[1], (int)gemmSize[1]}; + if(layoutType < 4) { + stride[2] = gemmSize[0]; // output: [N, M] + } if(gemmSize[4] > 1) { int batch_offset_a = gemmSize[0] * gemmSize[2]; int batch_offset_b = gemmSize[1] * gemmSize[2]; int batch_offset_c = gemmSize[0] * gemmSize[1]; + int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; + int group[4] = {1, (int)groupSize, 1, (int)gemmSize[4]}; ret |= kernel->get().setArg(idx++, tensorMemory[0]); - ret |= kernel->get().setArg(idx++, batch_offset_a); ret |= kernel->get().setArg(idx++, tensorMemory[1]); - ret |= kernel->get().setArg(idx++, batch_offset_b); - if(gemmSize[5] == 1) { + if(biasType > 0) { ret |= kernel->get().setArg(idx++, tensorMemory[3]); - ret |= kernel->get().setArg(idx++, gemmSize[1]); - } else if(gemmSize[5] > 1) { - MNN_ERROR("BatchGemm with bias type > 1 (elementwise) not supported! please check\n"); } ret |= kernel->get().setArg(idx++, tensorMemory[2]); - ret |= kernel->get().setArg(idx++, batch_offset_c); - + ret |= kernel->get().setArg(idx++, sizeof(batch_offset), batch_offset); + ret |= kernel->get().setArg(idx++, sizeof(stride), stride); + ret |= kernel->get().setArg(idx++, sizeof(group), group); + MNN_CHECK_CL_SUCCESS(ret, "setArg getGemmParams XgemmBatchhed Kernel"); + cl::Event event; auto res = CL_SUCCESS; res = runtime->commandQueue().enqueueNDRangeKernel(kernel->get(), cl::NullRange, {globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]}, {localWorkSize[0], localWorkSize[1], localWorkSize[2]}, nullptr, &event); if (res != CL_SUCCESS) { MNN_PRINT("XgemmBatched params tune error: %d\n", res); continue; } + + cost_time = (int)runtime->getCostTime(&event); } else { int offset_a = 0; int offset_b = 0; int offset_c = 0; int offset[4] = {0, 0, 0, 0}; - int stride[4] = {(int)gemmSize[0], (int)gemmSize[1], (int)gemmSize[1], (int)gemmSize[1]}; - if(gemmSize[3] < 4) { - stride[2] = gemmSize[0]; // output: [N, M] - } + ret |= kernel->get().setArg(idx++, tensorMemory[0]); ret |= kernel->get().setArg(idx++, tensorMemory[1]); - if(gemmSize[5] >= 1) { + if(biasType >= 1) { ret |= kernel->get().setArg(idx++, tensorMemory[3]); } ret |= kernel->get().setArg(idx++, tensorMemory[2]); @@ -380,17 +456,17 @@ std::vector getGemmParams(const std::vector &gemmSize, const MNN_CHECK_CL_SUCCESS(ret, "setArg getGemmParams Xgemm Kernel"); + cl::Event event; auto res = CL_SUCCESS; res = runtime->commandQueue().enqueueNDRangeKernel(kernel->get(), cl::NullRange, {globalWorkSize[0], globalWorkSize[1]}, {localWorkSize[0], localWorkSize[1]}, nullptr, &event); if (res != CL_SUCCESS) { MNN_PRINT("Xgemm params tune error: %d\n", res); continue; } + cost_time = (int)runtime->getCostTime(&event); } - - int cost_time = (int)runtime->getCostTime(&event); - if(cost_time < min_cost) { + if(cost_time > 0 && cost_time < min_cost) { min_cost = cost_time; params_prefer[0] = kwg; params_prefer[1] = kwi; diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp index 51ba62619..d42961c1e 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp @@ -499,7 +499,9 @@ uint32_t OpenCLRuntime::MaxThreadsPerDevice() const { uint32_t OpenCLRuntime::MaxWorkGroupSize() const { return mMaxWorkGroupSize; } - +uint32_t OpenCLRuntime::getPrecisionLevel() const { + return mPrecisionLevel; +} 
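// Editor's note (annotation, not part of the patch): getPrecisionLevel() is exposed so the GEMM
// tuner can key its cache on the runtime precision level rather than a bare fp16 flag, and a
// mixed-precision request downgrades level 2 (fp16 memory + fp16 compute) to level 0 (fp16 memory
// + fp32 compute) before the lookup, as getGemmParams() does above. A minimal sketch of that
// mapping, with an illustrative helper name:
#include <cstdint>
static uint32_t effectiveTunePrecision(uint32_t precisionLevel, bool mixPrecision) {
    if (precisionLevel == 2 && mixPrecision) {
        return 0;   // keep fp16 storage but tune and compute in fp32
    }
    return precisionLevel;   // 0: fp16 mem + fp32 compute, 2: fp16 mem + fp16 compute, otherwise fp32/fp32
}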
uint32_t OpenCLRuntime::maxFreq() const { return mMaxFreq; } @@ -548,11 +550,11 @@ std::shared_ptr OpenCLRuntime::buildKernelWithCache(const std::strin const std::set &buildOptions, const Tensor *input, const Tensor *output, bool useCache) { std::string buildOptionsStr; if (mPrecisionLevel == 2) {// Fp16 Memory and fp16 compute - buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=half -DCOMPUTE_FLOAT2=half2 -DCOMPUTE_FLOAT3=half3 -DCOMPUTE_FLOAT4=half4 -DCOMPUTE_FLOAT8=half8 -DCOMPUTE_FLOAT16=half16 -DCONVERT_COMPUTE_FLOAT2=convert_half2 -DCONVERT_COMPUTE_FLOAT4=convert_half4 -DCONVERT_COMPUTE_FLOAT8=convert_half8 -DCONVERT_COMPUTE_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DMNN_SUPPORT_FP16"; + buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=half -DCOMPUTE_FLOAT2=half2 -DCOMPUTE_FLOAT3=half3 -DCOMPUTE_FLOAT4=half4 -DCOMPUTE_FLOAT8=half8 -DCOMPUTE_FLOAT16=half16 -DCONVERT_COMPUTE_FLOAT=convert_half -DCONVERT_COMPUTE_FLOAT2=convert_half2 -DCONVERT_COMPUTE_FLOAT4=convert_half4 -DCONVERT_COMPUTE_FLOAT8=convert_half8 -DCONVERT_COMPUTE_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DCONVERT_FLOAT=convert_half -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT3=convert_half3 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DMNN_SUPPORT_FP16"; } else if (mPrecisionLevel == 0) {// Fp16 Memory and fp32 compute - buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=float -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DMNN_SUPPORT_FP16"; + buildOptionsStr = "-DFLOAT=half -DFLOAT2=half2 -DFLOAT3=half3 -DFLOAT4=half4 -DFLOAT8=half8 -DFLOAT16=half16 -DCOMPUTE_FLOAT=float -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT=convert_float -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DCONVERT_FLOAT=convert_half -DCONVERT_FLOAT2=convert_half2 -DCONVERT_FLOAT3=convert_half3 -DCONVERT_FLOAT4=convert_half4 -DCONVERT_FLOAT8=convert_half8 -DCONVERT_FLOAT16=convert_half16 -DRI_F=read_imageh -DWI_F=write_imageh -DMNN_SUPPORT_FP16"; } else {// Fp32 Memory and fp32 compute - buildOptionsStr = "-DFLOAT=float -DFLOAT2=float2 -DFLOAT3=float3 -DFLOAT4=float4 -DFLOAT8=float8 -DFLOAT16=float16 -DCOMPUTE_FLOAT=float -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT2=convert_float2 
-DCONVERT_FLOAT4=convert_float4 -DCONVERT_FLOAT8=convert_float8 -DCONVERT_FLOAT16=convert_float16"; + buildOptionsStr = "-DFLOAT=float -DFLOAT2=float2 -DFLOAT3=float3 -DFLOAT4=float4 -DFLOAT8=float8 -DFLOAT16=float16 -DCOMPUTE_FLOAT=float -DCOMPUTE_FLOAT2=float2 -DCOMPUTE_FLOAT3=float3 -DCOMPUTE_FLOAT4=float4 -DCOMPUTE_FLOAT8=float8 -DCOMPUTE_FLOAT16=float16 -DCONVERT_COMPUTE_FLOAT=convert_float -DCONVERT_COMPUTE_FLOAT2=convert_float2 -DCONVERT_COMPUTE_FLOAT4=convert_float4 -DCONVERT_COMPUTE_FLOAT8=convert_float8 -DCONVERT_COMPUTE_FLOAT16=convert_float16 -DRI_F=read_imagef -DFLOAT16=float16 -DWI_F=write_imagef -DCONVERT_FLOAT=convert_float -DCONVERT_FLOAT2=convert_float2 -DCONVERT_FLOAT3=convert_float3 -DCONVERT_FLOAT4=convert_float4 -DCONVERT_FLOAT8=convert_float8 -DCONVERT_FLOAT16=convert_float16"; } if(nullptr != input){ @@ -975,6 +977,7 @@ bool OpenCLRuntime::setCache(std::pair cache) { params[v] = tun->paramInfo()->data()[v]; } mTunedGemmParams.insert(std::make_pair(info, params)); + mTuneLws["Xgemm_tune"].push_back(std::make_pair(info, std::make_pair(params, 0))); } } @@ -1026,6 +1029,8 @@ void OpenCLRuntime::printEventTime(){ conv_time += kernel_time; } else if (mEvents[i].first.length() >= 11 && mEvents[i].first.substr(0, 11) == "Convolution") { conv_time += kernel_time; + } else if (mEvents[i].first.length() >= 8 && mEvents[i].first.substr(0, 8) == "Strassen") { + conv_time += kernel_time; } if((mEvents[i].first.length() >= 10 && mEvents[i].first.substr(0, 10) == "While-gemm")) { loop_bg_time += kernel_time; @@ -1043,6 +1048,10 @@ void OpenCLRuntime::printEventTime(){ wino_gemm_time += kernel_time; conv_time += kernel_time; } + if((mEvents[i].first.length() >= 6 && mEvents[i].first.substr(0, 6) == "Raster")) { + raster_num++; + raster_time += kernel_time; + } kernels[i] = std::make_pair(mEvents[i].first, kernel_time); } @@ -1063,7 +1072,7 @@ void OpenCLRuntime::printEventTime(){ MNN_PRINT("kernel time = %d us %s\n", kernels[i].second, kernels[i].first.c_str()); } mEvents.clear(); - MNN_PRINT("total kernel time = %d us, conv time = %d us (gemm2:%d us, gemm1:%d us, 1x1:%d us, ori:%d us, wino: %d us, other: %d us), while gemm time = %d us (core gemm time: %d us, softmax:%d us), ori softmax: %d us\n", mKernelTime, conv_time, conv_gemm2_buf_time, conv_gemm1_buf_time, conv_1x1_buf_time, conv_ori_buf_time, wino_gemm_time, conv_time-conv_gemm2_buf_time-conv_gemm1_buf_time-conv_1x1_buf_time-conv_ori_buf_time-wino_gemm_time, loop_bg_time, loop_bg_gemm_time, loop_softmax_time, ori_softmax_time); + MNN_PRINT("total kernel time = %d us, conv time = %d us (gemm2:%d us, gemm1:%d us, 1x1:%d us, ori:%d us, wino: %d us, other: %d us), while gemm time = %d us (core gemm time: %d us, softmax:%d us), ori softmax: %d us, raster[%d] time: %d us\n", mKernelTime, conv_time, conv_gemm2_buf_time, conv_gemm1_buf_time, conv_1x1_buf_time, conv_ori_buf_time, wino_gemm_time, conv_time-conv_gemm2_buf_time-conv_gemm1_buf_time-conv_1x1_buf_time-conv_ori_buf_time-wino_gemm_time, loop_bg_time, loop_bg_gemm_time, loop_softmax_time, ori_softmax_time, raster_num, raster_time); #endif } } // namespace MNN diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp index 2586a6559..b5dfa5918 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.hpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.hpp @@ -109,6 +109,7 @@ class OpenCLRuntime { float getCLVersion() { return mCLVersion; } + uint32_t getPrecisionLevel() const; bool isSupportGL(){ return 
mIsSupportGL; } diff --git a/source/backend/opencl/execution/buffer/ArgMaxBufExecution.cpp b/source/backend/opencl/execution/buffer/ArgMaxBufExecution.cpp index d1faba8eb..d2676ddcb 100644 --- a/source/backend/opencl/execution/buffer/ArgMaxBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ArgMaxBufExecution.cpp @@ -19,7 +19,7 @@ ArgMaxBufExecution::ArgMaxBufExecution(const std::string &compute, const MNN::Op mOpenCLBackend = static_cast(backend); std::set buildOptions = mBuildOptions; buildOptions.emplace("-DARGMAX_LOCAL_SIZE=512"); - auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("argmax_buf", "argmax_channel_buf", buildOptions); + auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("argmax_buf", "argmax_buf", buildOptions); mMaxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel)); } @@ -32,12 +32,30 @@ int ArgMaxBufExecution::getLocalSize(int size, int maxGroupSize){ } ErrorCode ArgMaxBufExecution::onEncode(const std::vector& inputs, const std::vector& outputs) { - mUnits.resize(1); - auto &unit = mUnits[0]; + mUnits.clear(); auto runtime = mOpenCLBackend->getOpenCLRuntime(); auto MaxLocalSize = std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize); auto input = inputs[0]; auto output = outputs[0]; + + const auto layout = TensorUtils::getDescribe(input)->dimensionFormat; + mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4; + if (mNeedUnpackC4) { + int inputTotalSize = 1, outputTotalSize = 1; + for (int i = 1; i < input->dimensions(); ++i) { + inputTotalSize *= input->length(i); + } + for (int i = 1; i < output->dimensions(); ++i) { + outputTotalSize *= output->length(i); + } + mTempInputTensor.reset(Tensor::createDevice({inputTotalSize})); + mTempOutputTensor.reset(Tensor::createDevice({outputTotalSize})); + mOpenCLBackend->onAcquireBuffer(mTempInputTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempOutputTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempInputTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempOutputTensor.get(), Backend::DYNAMIC); + + } if(mAxis < 0){ mAxis = input->dimensions() + mAxis; } @@ -51,74 +69,111 @@ ErrorCode ArgMaxBufExecution::onEncode(const std::vector& inputs, const } int dim = input->length(mAxis); - std::vector inputShape = tensorShapeFormat(input); - std::vector outputShape = tensorShapeFormat(output); + // NC4HW4 -> NCHW + if(mNeedUnpackC4){ + Unit unit; + std::vector outputShape = tensorShapeFormat(input); + int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W + std::set buildOptions; + buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4"); + buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NCHW"); + unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, input, output); + mGlobalWorkSize = {static_cast(shape[2] * shape[3]), static_cast(shape[1]), static_cast(shape[0])}; + cl_int ret = CL_SUCCESS; + uint32_t idx = 0; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); + ret |= unit.kernel->get().setArg(idx++, sizeof(shape), shape); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTempInputTensor.get())); + MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer"); - int batch = inputShape.at(0); - int 
inputHeight = inputShape.at(1); - int inputWidth = inputShape.at(2); - int inputChannels = inputShape.at(3); - int inputChannelBlocks = (inputChannels + 3) / 4; - int outputBatch = outputShape.at(0); - int outputHeight = outputShape.at(1); - int outputWidth = outputShape.at(2); - int outputChannels = outputShape.at(3); - int outputChannelBlocks = (outputChannels + 3) / 4; - - int localSize = getLocalSize(dim, MaxLocalSize); - if(localSize < 4){ - localSize = 1; + const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mLocalSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1}; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; + mUnits.emplace_back(unit); } - std::set buildOptions = mBuildOptions; - buildOptions.emplace("-DARGMAX_LOCAL_SIZE=" + std::to_string(localSize)); - std::string kernelName; - if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){ - kernelName = "argmax_width_buf"; - unit.kernel = runtime->buildKernel("argmax_buf", kernelName, buildOptions); - mGlobalWorkSize = {static_cast(localSize), static_cast(outputHeight), static_cast(outputBatch * outputChannelBlocks)}; - }else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){ - kernelName = "argmax_height_buf"; - unit.kernel = runtime->buildKernel("argmax_buf", kernelName, buildOptions); - mGlobalWorkSize = {static_cast(localSize), static_cast(outputWidth), static_cast(outputBatch * outputChannelBlocks)}; - }else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){ - if(output->buffer().dimensions == 1){ - buildOptions.emplace("-DARGMAX_CHANNEL_DIM1"); + + // Argmax + { + Unit unit; + int localSize = getLocalSize(dim, MaxLocalSize); + if(localSize < 4){ + localSize = 1; } - kernelName = "argmax_channel_buf"; - unit.kernel = runtime->buildKernel("argmax_buf", kernelName, buildOptions); - mGlobalWorkSize = {static_cast(localSize), static_cast(outputWidth * outputHeight), static_cast(outputBatch * outputChannels)}; - }else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){ - kernelName = "argmax_batch_buf"; - unit.kernel = runtime->buildKernel("argmax_buf", kernelName, buildOptions); - mGlobalWorkSize = {static_cast(localSize), static_cast(outputWidth * outputHeight), static_cast(outputChannelBlocks)}; + std::set buildOptions = mBuildOptions; + buildOptions.emplace("-DARGMAX_LOCAL_SIZE=" + std::to_string(localSize)); + std::string kernelName; + if(inside % 4 == 0){ + kernelName = "argmax_v4_buf"; + unit.kernel = runtime->buildKernel("argmax_buf", kernelName, buildOptions); + mGlobalWorkSize = {static_cast(localSize), static_cast(UP_DIV(inside, 4)), static_cast(outside)}; + }else { + kernelName = "argmax_buf"; + unit.kernel = runtime->buildKernel("argmax_buf", kernelName, buildOptions); + mGlobalWorkSize = {static_cast(localSize), static_cast(inside), static_cast(outside)}; + } + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mLocalSize = {(uint32_t)(localSize), 1, 1}; + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + if(mNeedUnpackC4){ + ret |= 
unit.kernel->get().setArg(idx++, openCLBuffer(mTempInputTensor.get())); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTempOutputTensor.get())); + }else{ + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); + } + ret |= unit.kernel->get().setArg(idx++, inside); + ret |= unit.kernel->get().setArg(idx++, outside); + ret |= unit.kernel->get().setArg(idx++, dim); + MNN_CHECK_CL_SUCCESS(ret, "setArg ArgMaxBufExecution"); + + if(localSize == 1){ + mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; + } + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; + mUnits.emplace_back(unit); } - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); - mLocalSize = {(uint32_t)(localSize), 1, 1}; - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, inputWidth); - ret |= unit.kernel->get().setArg(idx++, inputHeight); - ret |= unit.kernel->get().setArg(idx++, inputChannels); - ret |= unit.kernel->get().setArg(idx++, batch); - ret |= unit.kernel->get().setArg(idx++, inputChannelBlocks); - ret |= unit.kernel->get().setArg(idx++, outputWidth); - ret |= unit.kernel->get().setArg(idx++, outputHeight); - ret |= unit.kernel->get().setArg(idx++, outputChannels); - ret |= unit.kernel->get().setArg(idx++, outputChannelBlocks); - MNN_CHECK_CL_SUCCESS(ret, "setArg ArgMaxBufExecution"); + + // NCHW -> NC4HW4 + if(mNeedUnpackC4){ + Unit unit; + std::vector outputShape = tensorShapeFormat(output); + int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W + std::set buildOptions; + buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NCHW"); + buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4"); + unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, input, output); + mGlobalWorkSize = {static_cast(shape[2] * shape[3]), static_cast(shape[1]), static_cast(shape[0])}; + cl_int ret = CL_SUCCESS; + uint32_t idx = 0; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTempOutputTensor.get())); + ret |= unit.kernel->get().setArg(idx++, sizeof(shape), shape); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); + MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer"); - if(localSize == 1){ - mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; + const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mLocalSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1}; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], 
mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; + mUnits.emplace_back(unit); } - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; return NO_ERROR; } diff --git a/source/backend/opencl/execution/buffer/ArgMaxBufExecution.hpp b/source/backend/opencl/execution/buffer/ArgMaxBufExecution.hpp index 760f909ce..9ce5cd79a 100644 --- a/source/backend/opencl/execution/buffer/ArgMaxBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/ArgMaxBufExecution.hpp @@ -29,6 +29,9 @@ class ArgMaxBufExecution : public CommonExecution { std::set mBuildOptions; int mAxis; OpenCLBackend *mOpenCLBackend; + std::shared_ptr mTempInputTensor; + std::shared_ptr mTempOutputTensor; + bool mNeedUnpackC4; }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp b/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp index 2ca359ecf..2302714cc 100644 --- a/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp @@ -13,17 +13,17 @@ namespace MNN { namespace OpenCL { -AttentionBufImpl::AttentionBufImpl(const MNN::Op *op, Backend *backend, bool kv_cahce) - : mKVCache(kv_cahce){ +KVCacheCLManager::KVCacheCLManager(Backend *backend, bool kv_cahce) : mKVCache(kv_cahce){ mOpenCLBackend = static_cast(backend); - auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("softmax_buf", "softmax_channel", {"-DSOFTMAX_LOCAL_SIZE=512"}); - mMaxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel)); } -void AttentionBufImpl::allocKVCache() { +void KVCacheCLManager::allocKVCache() { if (!mKVCache || mPastLength < mMaxLength) { return; } + if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()){ + mByte = 2; + } mMaxLength = mPastLength + mExpandChunk; size_t buffer_size = UP_DIV(mMaxLength, 4) * mKvNumHead * mHeadDim * 4 * mByte; // past_key: [1, numhead, headdim, maxlen] @@ -32,9 +32,9 @@ void AttentionBufImpl::allocKVCache() { mPastValue.reset(new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size)); } -void AttentionBufImpl::reallocKVCache() { +bool KVCacheCLManager::reallocKVCache() { if (!mKVCache || mPastLength < mMaxLength) { - return; + return false; } size_t old_size = mKvNumHead * UP_DIV(mMaxLength, 4) * mHeadDim * 4 * mByte; @@ -70,40 +70,47 @@ void AttentionBufImpl::reallocKVCache() { mPastKey.reset(new_key); mPastValue.reset(new_value); - mTempQK.reset(Tensor::createDevice({UP_DIV(mMaxLength, 4) * mNumHead * 4})); - mTempSoftMax.reset(Tensor::createDevice({UP_DIV(mMaxLength, 4) * mNumHead * 4})); + return true; +} + +int AttentionBufExecution::getLocalSize(int size, int maxGroupSize){ + int local_size = 1; + while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){ + local_size *= 2; + } + return local_size; +} + +void AttentionBufExecution::reallocKVCache() { + int maxLength = mKVCacheCLManager->maxLength(); + int numHead = mKVCacheCLManager->numHead(); + mTempQK.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); + mTempSoftMax.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::STATIC); mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::STATIC); // reset 
memory for args if(mOpenCLBackend->isUseRecordQueue()){ mQkUpdateInfo.update_kernel_args[1].arg_value = &openCLBuffer(mTempQK.get())(); - mQkUpdateInfo.update_kernel_args[2].arg_value = &(*(mPastKey.get()))(); + mQkUpdateInfo.update_kernel_args[2].arg_value = &(*(mKVCacheCLManager->key()))(); mSoftMaxUpdateInfo.update_kernel_args[0].arg_value = &openCLBuffer(mTempQK.get())(); mSoftMaxUpdateInfo.update_kernel_args[1].arg_value = &openCLBuffer(mTempSoftMax.get())(); mQkvUpdateInfo.update_kernel_args[0].arg_value = &openCLBuffer(mTempSoftMax.get())(); - mQkvUpdateInfo.update_kernel_args[1].arg_value = &(*(mPastValue.get()))(); + mQkvUpdateInfo.update_kernel_args[1].arg_value = &(*(mKVCacheCLManager->value()))(); }else{ cl_int ret = CL_SUCCESS; ret |= mKernel_qk->get().setArg(5, openCLBuffer(mTempQK.get())); - ret |= mKernel_qk->get().setArg(6, *mPastKey.get()); + ret |= mKernel_qk->get().setArg(6, *mKVCacheCLManager->key()); ret |= mKernel_softmax->get().setArg(3, openCLBuffer(mTempQK.get())); ret |= mKernel_softmax->get().setArg(4, openCLBuffer(mTempSoftMax.get())); ret |= mKernel_qkv->get().setArg(3, openCLBuffer(mTempSoftMax.get())); - ret |= mKernel_qkv->get().setArg(6, *mPastValue.get()); + ret |= mKernel_qkv->get().setArg(6, *mKVCacheCLManager->value()); MNN_CHECK_CL_SUCCESS(ret, "reset memory arg for AttentionBufExecution"); } + mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::STATIC); + mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::STATIC); } -int AttentionBufImpl::getLocalSize(int size, int maxGroupSize){ - int local_size = 1; - while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){ - local_size *= 2; - } - return local_size; -} - -ErrorCode AttentionBufImpl::onResize(Backend *backend, const std::vector &inputs, const std::vector &outputs) { - mOpenCLBackend = static_cast(backend); +ErrorCode AttentionBufExecution::onResize(const std::vector &inputs, const std::vector &outputs) { mOpenCLBackend->startRecord(mRecording); //clear update arg vector, if prefill and decode use the same one mOpRecordUpdateInfo.clear(); @@ -124,195 +131,605 @@ ErrorCode AttentionBufImpl::onResize(Backend *backend, const std::vectorgetOpenCLRuntime(); auto shape = query->shape(); + int batch = shape[0]; int seq_len = shape[1]; - mNumHead = shape[2]; - mKvNumHead = key->shape()[2]; - int group_size = mNumHead / mKvNumHead; - mHeadDim = shape[3]; - mScale = 1.0 / sqrt(mHeadDim); + int numHead = shape[2]; + int kvNumHead = key->shape()[2]; + int headDim = shape[3]; + int group_size = numHead / kvNumHead; + float scale = 1.0 / sqrt(headDim); mIsDecode = seq_len == 1; - mIsFirstDecode = true; - if (mPastLength == 0 || seq_len > 1) { - mPastLength = seq_len; - } - mKv_seq_len = mPastLength; - if(mIsDecode){ - mKv_seq_len = mPastLength + 1; - } - if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()){ - mByte = 2; + mIsAddMask = (mask->getType() == halide_type_of()); + mLongPrefill = false; + if(false == mIsDecode){ + mKVCacheCLManager->setArgs(seq_len, numHead, kvNumHead, headDim); + mKVCacheCLManager->allocKVCache(); + + if(seq_len > 512) { + mLongPrefill = true; + mAlignQ = 128; + mAlignKV = 128; + mAlignHDK = 4; + mAlignHDN = 128; + + mTempQ.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(headDim, mAlignHDK) * batch * numHead})); + mTempK.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignKV) * ROUND_UP(headDim, mAlignHDK) * batch * numHead})); + mTempV.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignKV) * ROUND_UP(headDim, mAlignHDN) * batch * 
numHead})); + if(mIsAddMask) { + mTempMask.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch})); + } else { + mTempMask.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch})); + } + mTempQK.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch * numHead})); + mTempSoftMax.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch * numHead})); + mTempQKV.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(headDim, mAlignHDN) * batch * numHead})); + + } else { + mTempQK.reset(Tensor::createDevice({UP_DIV(seq_len, 4) * seq_len * numHead * 4})); + mTempSoftMax.reset(Tensor::createDevice({UP_DIV(seq_len, 4) * seq_len * numHead * 4})); + } + mKv_seq_len = mKVCacheCLManager->kvLength(); + } else { + mKv_seq_len = mKVCacheCLManager->kvLength() + 1; + int maxLength = mKVCacheCLManager->maxLength(); + mTempQK.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); + mTempSoftMax.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); } - allocKVCache(); - if (mIsDecode) { - mTempQK.reset(Tensor::createDevice({UP_DIV(mMaxLength, 4) * mNumHead * 4})); - mTempSoftMax.reset(Tensor::createDevice({UP_DIV(mMaxLength, 4) * mNumHead * 4})); + + if(mLongPrefill) { + mOpenCLBackend->onAcquireBuffer(mTempQ.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempK.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempV.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempMask.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); + + mOpenCLBackend->onReleaseBuffer(mTempQ.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempK.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempMask.get(), Backend::DYNAMIC); + + mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC); + + mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); + + mOpenCLBackend->onAcquireBuffer(mTempQKV.get(), Backend::DYNAMIC); + + mOpenCLBackend->onReleaseBuffer(mTempV.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempQKV.get(), Backend::DYNAMIC); + } else { - mTempQK.reset(Tensor::createDevice({UP_DIV(mPastLength, 4) * mPastLength * mNumHead * 4})); - mTempSoftMax.reset(Tensor::createDevice({UP_DIV(mPastLength, 4) * mPastLength * mNumHead * 4})); + mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC); } - mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); - mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC); - // query * key -> div -> select - { - std::set buildOption; - if(!mIsDecode){ - buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); + + if(mLongPrefill) { + // query: [batch, seqLenQ, headNum, headDim] -> mTempQ: [batch*headNum, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenQ, mAlignQ)] + // key: [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] -> mTempK: [batch*headNum/group, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenKV, mAlignKV)] + // value: [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] -> mTempV: [batch*headNum/group, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(headDim, mAlignHDK] + // key & value -> pastKey & pastValue (copy) + { + std::set buildOption; + if((headDim % 4) != 0){ + 
buildOption.emplace("-DHEADDIM_LEAVE"); + } + if((seq_len % 4) != 0){ + buildOption.emplace("-DSEQLEN_LEAVE"); + } + + int seq_len_pack_q = ROUND_UP(seq_len, mAlignQ); + int seq_len_pack_kv = ROUND_UP(mKv_seq_len, mAlignKV); + + int head_dim_pack_qk = ROUND_UP(headDim, mAlignHDK); + int head_dim_pack_v = ROUND_UP(headDim, mAlignHDN); + + int tile[4] = {mAlignQ, mAlignKV, mAlignHDK, mAlignHDN}; + int shape[4] = {seq_len, mKv_seq_len, numHead, headDim}; + int param[4] = {group_size, batch, 0, 0}; + mKernel_rearrange = runtime->buildKernel("attention_buf", "rearrange_qkv", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_rearrange)); + + mGlobalWorkSizeRearrg = {static_cast(ALIMAX(UP_DIV(seq_len_pack_q, 4), UP_DIV(seq_len_pack_kv, 4))), \ + static_cast(ALIMAX(UP_DIV(head_dim_pack_qk, 4), UP_DIV(head_dim_pack_v, 4))), \ + static_cast(batch*numHead)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[0]); + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[1]); + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[2]); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(query)); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(key)); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(value)); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempQ.get())); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempK.get())); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempV.get())); + ret |= mKernel_rearrange->get().setArg(index++, *mKVCacheCLManager->key()); + ret |= mKernel_rearrange->get().setArg(index++, *mKVCacheCLManager->value()); + ret |= mKernel_rearrange->get().setArg(index++, tile); + ret |= mKernel_rearrange->get().setArg(index++, shape); + ret |= mKernel_rearrange->get().setArg(index++, param); + + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_qkv"); + mLocalWorkSizeRearrg = localWS3DDefault(mGlobalWorkSizeRearrg, maxWorkGroupSize, runtime, "rearrange_qkv", mKernel_rearrange).first; + mGlobalWorkSizeRearrg[0] = ROUND_UP(mGlobalWorkSizeRearrg[0], std::max((uint32_t)1, mLocalWorkSizeRearrg[0])); + mGlobalWorkSizeRearrg[1] = ROUND_UP(mGlobalWorkSizeRearrg[1], std::max((uint32_t)1, mLocalWorkSizeRearrg[1])); + mGlobalWorkSizeRearrg[2] = ROUND_UP(mGlobalWorkSizeRearrg[2], std::max((uint32_t)1, mLocalWorkSizeRearrg[2])); + mOpenCLBackend->recordKernel3d(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg); } - if((mHeadDim % 4) != 0){ - buildOption.emplace("-DHEADDIM_LEAVE"); + + // mask rearaange + { + std::set buildOption; + + int seq_len_pack_q = ROUND_UP(seq_len, mAlignQ); + int seq_len_pack_kv = ROUND_UP(mKv_seq_len, mAlignKV); + int shape[4] = {seq_len, mKv_seq_len, mAlignQ, mAlignKV}; + + mKernel_mask = runtime->buildKernel("attention_buf", "rearrange_mask", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_mask)); + + mGlobalWorkSizeMask = {static_cast(UP_DIV(seq_len_pack_q, 4)), \ + static_cast(UP_DIV(seq_len_pack_kv, 4)), \ + static_cast(batch)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[0]); + ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[1]); + ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[2]); + ret |= mKernel_mask->get().setArg(index++, 
openCLBuffer(mask)); + ret |= mKernel_mask->get().setArg(index++, openCLBuffer(mTempMask.get())); + ret |= mKernel_mask->get().setArg(index++, shape); + + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_mask"); + mLocalWorkSizeMask = localWS3DDefault(mGlobalWorkSizeMask, maxWorkGroupSize, runtime, "rearrange_mask", mKernel_mask).first; + mGlobalWorkSizeMask[0] = ROUND_UP(mGlobalWorkSizeMask[0], std::max((uint32_t)1, mLocalWorkSizeMask[0])); + mGlobalWorkSizeMask[1] = ROUND_UP(mGlobalWorkSizeMask[1], std::max((uint32_t)1, mLocalWorkSizeMask[1])); + mGlobalWorkSizeMask[2] = ROUND_UP(mGlobalWorkSizeMask[2], std::max((uint32_t)1, mLocalWorkSizeMask[2])); + mOpenCLBackend->recordKernel3d(mKernel_mask, mGlobalWorkSizeMask, mLocalWorkSizeMask); } - if(mask->getType() == halide_type_of()){ - buildOption.emplace("-DADD_MASK"); + + { + // Q : [batch*headNum, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenQ, mAlignQ)] -> [B, K, M] + // K : [batch*headNum/group, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenKV, mAlignKV)] -> [B, K, N] + // QV: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] -> [B, M, N] + int loop = batch * numHead; + int e_pack = ROUND_UP(seq_len, mAlignQ); + int h_pack = ROUND_UP(mKv_seq_len, mAlignKV); + int l_pack = ROUND_UP(headDim, mAlignHDK); + + std::set buildOptions; + + int biasType = 5;// int value mask + if(mIsAddMask) { + biasType = 2; + } + uint32_t layout = 14; // 10 means mix-precision, 4 means layput + auto param = getGemmParams({(uint32_t)e_pack, (uint32_t)h_pack, (uint32_t)l_pack, layout, (uint32_t)loop, (uint32_t)(biasType + 10*(group_size-1))}, {openCLBuffer(mTempQ.get()), openCLBuffer(mTempK.get()), openCLBuffer(mTempQK.get()), openCLBuffer(mTempMask.get())}, mOpenCLBackend->getOpenCLRuntime()); + + int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; + buildOptions.emplace("-DKWG=" + std::to_string(KWG)); + buildOptions.emplace("-DKWI=" + std::to_string(KWI)); + buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); + buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); + buildOptions.emplace("-DMWG=" + std::to_string(MWG)); + buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); + buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); + buildOptions.emplace("-DNWG=" + std::to_string(NWG)); + buildOptions.emplace("-DSA=" + std::to_string(SA)); + buildOptions.emplace("-DSB=" + std::to_string(SB)); + buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); + buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); + buildOptions.emplace("-DVWM=" + std::to_string(VWM)); + buildOptions.emplace("-DVWN=" + std::to_string(VWN)); + if(layout >= 4) { + buildOptions.emplace("-DOUTPUTMN"); + } + + int tileM = MWG; + int tileN = NWG; + int localM = MDIMC; + int localN = NDIMC; + + if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { + buildOptions.emplace("-DUSE_CL_MAD=1"); + buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); + } + buildOptions.emplace("-DONLY_HAVE_ALPHA"); + buildOptions.emplace("-DBIAS_TYPE=" + std::to_string(biasType)); + + buildOptions.emplace("-DPRECISION_COMPUTE=float -DCONVERT_PRECISION_COMPUTE=convert_float"); + buildOptions.emplace("-DPRECISION_COMPUTE2=float2 -DCONVERT_PRECISION_COMPUTE2=convert_float2"); + buildOptions.emplace("-DPRECISION_COMPUTE4=float4 -DCONVERT_PRECISION_COMPUTE4=convert_float4"); + 
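// Editor's note (annotation, not part of the patch): these PRECISION_COMPUTE defines make the QK
// matmul accumulate in fp32 even when the tensors are stored as fp16, and the 1/sqrt(headDim)
// attention scale is passed as the GEMM alpha ("alpha = scale" below), so no separate scaling
// kernel is needed. A minimal sketch of that scale, with an illustrative helper name:
#include <cmath>
static inline float attentionScale(int headDim) {
    return 1.0f / std::sqrt((float)headDim);   // e.g. headDim = 128 -> ~0.0884
}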
buildOptions.emplace("-DPRECISION_COMPUTE8=float8 -DCONVERT_PRECISION_COMPUTE8=convert_float8"); + buildOptions.emplace("-DPRECISION_COMPUTE16=float16 -DCONVERT_PRECISION_COMPUTE16=convert_float16"); + + mKernel_qk = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); + + int out_per_thread_m = tileM / localM; + int out_per_thread_n = tileN / localN; + + mGlobalWorkSizeQk = {static_cast(e_pack/out_per_thread_m), static_cast(h_pack/out_per_thread_n), static_cast(loop)}; + mLocalWorkSizeQk = {static_cast(localM), static_cast(localN), 1}; + + float alpha = scale; + float beta = 0.0f; + int batch_offset_a = e_pack * l_pack; + int batch_offset_b = h_pack * l_pack; + int batch_offset_c = e_pack * h_pack; + + int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; + int stride[4] = {e_pack, h_pack, h_pack, h_pack}; + int group[4] = {1, group_size, 1, numHead}; + + int idx = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_qk->get().setArg(idx++, static_cast(e_pack)); + ret |= mKernel_qk->get().setArg(idx++, static_cast(h_pack)); + ret |= mKernel_qk->get().setArg(idx++, static_cast(l_pack)); + ret |= mKernel_qk->get().setArg(idx++, alpha); + ret |= mKernel_qk->get().setArg(idx++, beta); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempQ.get())); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempK.get())); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempMask.get())); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempQK.get())); + ret |= mKernel_qk->get().setArg(idx++, batch_offset); + ret |= mKernel_qk->get().setArg(idx++, stride); + ret |= mKernel_qk->get().setArg(idx++, group); + MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qk Kernel"); + mOpenCLBackend->recordKernel3d(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk); } - buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); - mKernel_qk = runtime->buildKernel("attention_buf", "matmul_qk_div_mask", buildOption, inputs[0], outputs[0]); - mGlobalWorkSizeQk = {static_cast(UP_DIV(seq_len, 4)), static_cast(mNumHead), static_cast(UP_DIV(mKv_seq_len, 4))}; - auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qk)); - mGlobalWorkSizeQk2 = UP_DIV(mKv_seq_len, 4); - uint32_t index = 0; - cl_int ret = CL_SUCCESS; - ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[0]); - ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[1]); - ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk2); - ret |= mKernel_qk->get().setArg(index++, openCLBuffer(query)); - ret |= mKernel_qk->get().setArg(index++, openCLBuffer(key)); - ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mTempQK.get())); - ret |= mKernel_qk->get().setArg(index++, *mPastKey.get()); - ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mask)); - ret |= mKernel_qk->get().setArg(index++, mScale); - ret |= mKernel_qk->get().setArg(index++, seq_len); - ret |= mKernel_qk->get().setArg(index++, mKv_seq_len); - ret |= mKernel_qk->get().setArg(index++, mNumHead); - ret |= mKernel_qk->get().setArg(index++, mKvNumHead); - ret |= mKernel_qk->get().setArg(index++, mHeadDim); - MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qk_div_mask"); - - mLocalWorkSizeQk = localWS3DDefault(mGlobalWorkSizeQk, maxWorkGroupSize, runtime, "matmul_qk_div_mask", mKernel_qk).first; - mGlobalWorkSizeQk[0] = ROUND_UP(mGlobalWorkSizeQk[0], std::max((uint32_t)1, mLocalWorkSizeQk[0])); - mGlobalWorkSizeQk[1] = ROUND_UP(mGlobalWorkSizeQk[1], 
std::max((uint32_t)1, mLocalWorkSizeQk[1])); - mGlobalWorkSizeQk[2] = ROUND_UP(mGlobalWorkSizeQk[2], std::max((uint32_t)1, mLocalWorkSizeQk[2])); - mQkUpdateInfo.update_kernel_args.push_back({0, 2, sizeof(mGlobalWorkSizeQk2), &mGlobalWorkSizeQk2}); - mQkUpdateInfo.update_kernel_args.push_back({0, 5, sizeof(cl_mem), &openCLBuffer(mTempQK.get())()}); - mQkUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(cl_mem), &(*(mPastKey.get()))()}); - mQkUpdateInfo.update_kernel_args.push_back({0, 10, sizeof(mKv_seq_len), &mKv_seq_len}); - mQkGlobal_size[0] = mGlobalWorkSizeQk[0]; - mQkGlobal_size[1] = mGlobalWorkSizeQk[1]; - mQkGlobal_size[2] = mGlobalWorkSizeQk[2]; - mQkUpdateInfo.update_global_size.push_back({0, mQkGlobal_size}); - mOpRecordUpdateInfo.emplace_back(&mQkUpdateInfo); - mOpenCLBackend->recordKernel3d(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, &mQkUpdateInfo); - } - - // softmax - { - auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast(512)); - int localSize = getLocalSize(mKv_seq_len, MaxLocalSize); - if(localSize < 4){ - localSize = 1; + // softmax + { + // QV: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] + // Sotmax: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] + // axis : 2 (last dim) + int softmaxShape[4]; + softmaxShape[0] = batch*numHead; + softmaxShape[1] = ROUND_UP(seq_len, mAlignQ); + softmaxShape[2] = ROUND_UP(mKv_seq_len, mAlignKV); + + auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast(256)); + int localSize = getLocalSize(softmaxShape[2], MaxLocalSize); + if(localSize < 4){ + localSize = 1; + } + + std::set buildOption; + buildOption.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize)); + + mKernel_softmax = runtime->buildKernel("self_attention_buf", "softmax_inside", buildOption, inputs[0], outputs[0]); + mGlobalWorkSizeSoftMax = {static_cast(localSize), static_cast(softmaxShape[1]), static_cast(softmaxShape[0])}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[0]); + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[1]); + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[2]); + ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempQK.get())); + ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= mKernel_softmax->get().setArg(index++, mKv_seq_len); + ret |= mKernel_softmax->get().setArg(index++, softmaxShape); + MNN_CHECK_CL_SUCCESS(ret, "setArg Attention softmax"); + + mLocalWorkSizeSoftMax = {static_cast(localSize), 1, 1}; + mOpenCLBackend->recordKernel3d(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax); + } + { + // Sotmax: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] + // Trans: [Batch * numHead, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(seqLenQ, mAlignQ)] + int loop = batch * numHead; + int transDimW = ROUND_UP(seq_len, mAlignQ); + int transDimH = ROUND_UP(mKv_seq_len, mAlignKV); + + std::set buildOptions; + mKernel_trans = runtime->buildKernel("self_attention_buf", "trans_3d_buf", buildOptions, inputs[0], outputs[0]); + uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mKernel_trans)); + + mGlobalWorkSizeTrans = {(uint32_t)transDimW/8, (uint32_t)transDimH/8, (uint32_t)(loop)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= 
mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[0]); + ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[1]); + ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[2]); + ret |= mKernel_trans->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= mKernel_trans->get().setArg(index++, openCLBuffer(mTempQK.get())); + ret |= mKernel_trans->get().setArg(index++, loop); + ret |= mKernel_trans->get().setArg(index++, transDimW); + ret |= mKernel_trans->get().setArg(index++, transDimH); + MNN_CHECK_CL_SUCCESS(ret, "setArg Attention transpose"); + mLocalWorkSizeTrans = localWS3DDefault(mGlobalWorkSizeTrans, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "trans_3d_buf", mKernel_trans).first; + + mGlobalWorkSizeTrans[0] = ROUND_UP(mGlobalWorkSizeTrans[0], std::max((uint32_t)1, mLocalWorkSizeTrans[0])); + mGlobalWorkSizeTrans[1] = ROUND_UP(mGlobalWorkSizeTrans[1], std::max((uint32_t)1, mLocalWorkSizeTrans[1])); + mGlobalWorkSizeTrans[2] = ROUND_UP(mGlobalWorkSizeTrans[2], std::max((uint32_t)1, mLocalWorkSizeTrans[2])); + + mOpenCLBackend->recordKernel3d(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans); } - int past_len4 = UP_DIV(mKv_seq_len, 4); - mSoftMaxRemainChannels = past_len4 * 4 - mKv_seq_len; - mSoftmaxShape[0] = mNumHead; - mSoftmaxShape[1] = past_len4; - mSoftmaxShape[2] = 1; - mSoftmaxShape[3] = mPastLength; - std::set buildOption; - buildOption.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize)); - if(!mIsDecode){ - mKernel_softmax = runtime->buildKernel("softmax_buf", "softmax_width", buildOption, inputs[0], outputs[0]); - mGlobalWorkSizeSoftMax = {static_cast(localSize), static_cast(past_len4), static_cast(mNumHead)}; - } else{ - mKernel_softmax = runtime->buildKernel("softmax_buf", "softmax_channel", buildOption, inputs[0], outputs[0]); - mSoftmaxShape[3] = 1; - mGlobalWorkSizeSoftMax = {static_cast(localSize), static_cast(1), static_cast(mNumHead)}; + + // qk * value + { + // Trans: [Batch * numHead, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(seqLenQ, mAlignQ)] -> [B, K, M] + // V : [Batch * numHead / group, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(headDim, mAlignHDN)] -> [B, K, N] + // QKV : [Batch * numHead, ROUND_UP(headDim, mAlignHDN), ROUND_UP(seqLenQ, mAlignQ)] -> [B, N, M] + + int loop = batch * numHead; + int e_pack = ROUND_UP(seq_len, mAlignQ); + int l_pack = ROUND_UP(mKv_seq_len, mAlignKV); + int h_pack = ROUND_UP(headDim, mAlignHDN); + + std::set buildOptions; + + uint32_t layout = 0; + auto param = getGemmParams({(uint32_t)e_pack, (uint32_t)h_pack, (uint32_t)l_pack, layout, (uint32_t)loop, (uint32_t)0}, {openCLBuffer(mTempQK.get()), openCLBuffer(mTempV.get()), openCLBuffer(mTempQKV.get())}, mOpenCLBackend->getOpenCLRuntime()); + + int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; + buildOptions.emplace("-DKWG=" + std::to_string(KWG)); + buildOptions.emplace("-DKWI=" + std::to_string(KWI)); + buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); + buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); + buildOptions.emplace("-DMWG=" + std::to_string(MWG)); + buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); + buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); + buildOptions.emplace("-DNWG=" + std::to_string(NWG)); + buildOptions.emplace("-DSA=" + std::to_string(SA)); + buildOptions.emplace("-DSB=" + std::to_string(SB)); + 
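// Editor's note (annotation, not part of the patch): both long-prefill batched GEMMs run on shapes
// padded to the alignments chosen in onResize (mAlignQ = mAlignKV = 128, mAlignHDK = 4,
// mAlignHDN = 128). A minimal sketch of that padding arithmetic, with an illustrative helper:
static inline int roundUpTo(int x, int align) { return ((x + align - 1) / align) * align; }
// QK  : M = roundUpTo(seq_len, mAlignQ), N = roundUpTo(kv_seq_len, mAlignKV), K = roundUpTo(headDim, mAlignHDK)
// QKV : M = roundUpTo(seq_len, mAlignQ), N = roundUpTo(headDim, mAlignHDN), K = roundUpTo(kv_seq_len, mAlignKV)
// e.g. seq_len = kv_seq_len = 600, headDim = 128 gives 640 x 640 x 128 for QK and 640 x 128 x 640 for QKV.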
buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); + buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); + buildOptions.emplace("-DVWM=" + std::to_string(VWM)); + buildOptions.emplace("-DVWN=" + std::to_string(VWN)); + if(layout >= 4) { + buildOptions.emplace("-DOUTPUTMN"); + } + + int tileM = MWG; + int tileN = NWG; + int localM = MDIMC; + int localN = NDIMC; + + if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { + buildOptions.emplace("-DUSE_CL_MAD=1"); + buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); + } + + mKernel_qkv = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); + + int out_per_thread_m = tileM / localM; + int out_per_thread_n = tileN / localN; + + mGlobalWorkSizeQkv = {static_cast(e_pack/out_per_thread_m), static_cast(h_pack/out_per_thread_n), static_cast(loop)}; + mLocalWorkSizeQkv = {static_cast(localM), static_cast(localN), 1}; + + float alpha = 1.0f; + float beta = 0.0f; + int batch_offset_a = e_pack * l_pack; + int batch_offset_b = h_pack * l_pack; + int batch_offset_c = e_pack * h_pack; + int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; + int stride[4] = {e_pack, h_pack, e_pack, h_pack}; + int group[4] = {1, group_size, 1, numHead}; + + int idx = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_qkv->get().setArg(idx++, static_cast(e_pack)); + ret |= mKernel_qkv->get().setArg(idx++, static_cast(h_pack)); + ret |= mKernel_qkv->get().setArg(idx++, static_cast(l_pack)); + ret |= mKernel_qkv->get().setArg(idx++, alpha); + ret |= mKernel_qkv->get().setArg(idx++, beta); + ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempQK.get())); + ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempV.get())); + ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempQKV.get())); + ret |= mKernel_qkv->get().setArg(idx++, batch_offset); + ret |= mKernel_qkv->get().setArg(idx++, stride); + ret |= mKernel_qkv->get().setArg(idx++, group); + MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qkv Kernel"); + mOpenCLBackend->recordKernel3d(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv); } - auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_softmax)); - uint32_t index = 0; - cl_int ret = CL_SUCCESS; - ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[0]); - ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[1]); - ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[2]); - ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempQK.get())); - ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); - ret |= mKernel_softmax->get().setArg(index++, mSoftMaxRemainChannels); - ret |= mKernel_softmax->get().setArg(index++, mSoftmaxShape); - MNN_CHECK_CL_SUCCESS(ret, "setArg softmax"); - - mLocalWorkSizeSoftMax = {static_cast(localSize), 1, 1}; - if(localSize == 1){ - mLocalWorkSizeSoftMax = localWS3DDefault(mGlobalWorkSizeSoftMax, maxWorkGroupSize, runtime, "softmax", mKernel_softmax).first; + // transpose to output + { + // QKV : [Batch * numHead, ROUND_UP(headDim, mAlignHDN), ROUND_UP(seqLenQ, mAlignQ)] -> [B, N, M] + // output: [batch, seqLenQ/4, headNum, headDim, seqLenQ_4] + std::set buildOption; + + mKernel_clip = runtime->buildKernel("attention_buf", "qkv_transpose_output", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_clip)); + + mGlobalWorkSizeClip = {static_cast(UP_DIV(seq_len, 4)), 
static_cast(UP_DIV(headDim, 4)), static_cast(batch*numHead)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[0]); + ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[1]); + ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[2]); + ret |= mKernel_clip->get().setArg(index++, openCLBuffer(mTempQKV.get())); + ret |= mKernel_clip->get().setArg(index++, openCLBuffer(outputs[0])); + ret |= mKernel_clip->get().setArg(index++, mAlignQ); + ret |= mKernel_clip->get().setArg(index++, mAlignHDN); + ret |= mKernel_clip->get().setArg(index++, seq_len); + ret |= mKernel_clip->get().setArg(index++, numHead); + ret |= mKernel_clip->get().setArg(index++, headDim); + + mLocalWorkSizeClip = localWS3DDefault(mGlobalWorkSizeClip, maxWorkGroupSize, runtime, "qkv_transpose_output", mKernel_clip).first; + mGlobalWorkSizeClip[0] = ROUND_UP(mGlobalWorkSizeClip[0], std::max((uint32_t)1, mLocalWorkSizeClip[0])); + mGlobalWorkSizeClip[1] = ROUND_UP(mGlobalWorkSizeClip[1], std::max((uint32_t)1, mLocalWorkSizeClip[1])); + mGlobalWorkSizeClip[2] = ROUND_UP(mGlobalWorkSizeClip[2], std::max((uint32_t)1, mLocalWorkSizeClip[2])); + + MNN_CHECK_CL_SUCCESS(ret, "setArg qkv_transpose_output"); + mOpenCLBackend->recordKernel3d(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip); } - mGlobalWorkSizeSoftMax[0] = ROUND_UP(mGlobalWorkSizeSoftMax[0], std::max((uint32_t)1, mLocalWorkSizeSoftMax[0])); - mGlobalWorkSizeSoftMax[1] = ROUND_UP(mGlobalWorkSizeSoftMax[1], std::max((uint32_t)1, mLocalWorkSizeSoftMax[1])); - mGlobalWorkSizeSoftMax[2] = ROUND_UP(mGlobalWorkSizeSoftMax[2], std::max((uint32_t)1, mLocalWorkSizeSoftMax[2])); - mSoftMaxUpdateInfo.update_kernel_args.push_back({0, 3, sizeof(cl_mem), &openCLBuffer(mTempQK.get())()}); - mSoftMaxUpdateInfo.update_kernel_args.push_back({0, 4, sizeof(cl_mem), &openCLBuffer(mTempSoftMax.get())()}); - mSoftMaxUpdateInfo.update_kernel_args.push_back({0, 5, sizeof(mSoftMaxRemainChannels), &mSoftMaxRemainChannels}); - mSoftMaxUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(mSoftmaxShape), &mSoftmaxShape}); - mOpRecordUpdateInfo.emplace_back(&mSoftMaxUpdateInfo); - mOpenCLBackend->recordKernel3d(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, &mSoftMaxUpdateInfo); - } - - // qk * value - { - std::set buildOption; - if(!mIsDecode){ - buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); + + } else { + // query * key -> div -> select + { + std::set buildOption; + if(!mIsDecode){ + buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); + } + if((headDim % 4) != 0){ + buildOption.emplace("-DHEADDIM_LEAVE"); + } + if(mask->getType() == halide_type_of()){ + buildOption.emplace("-DADD_MASK"); + } + buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); + mKernel_qk = runtime->buildKernel("attention_buf", "matmul_qk_div_mask", buildOption, inputs[0], outputs[0]); + mGlobalWorkSizeQk = {static_cast(UP_DIV(mKv_seq_len, 4)), static_cast(UP_DIV(seq_len, 4)), static_cast(numHead)}; + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qk)); + mGlobalWorkSizeQk0 = UP_DIV(mKv_seq_len, 4); + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk0); + ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[1]); + ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[2]); + ret |= mKernel_qk->get().setArg(index++, openCLBuffer(query)); + ret |= mKernel_qk->get().setArg(index++, openCLBuffer(key)); + ret 
|= mKernel_qk->get().setArg(index++, openCLBuffer(mTempQK.get())); + ret |= mKernel_qk->get().setArg(index++, *mKVCacheCLManager->key()); + ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mask)); + ret |= mKernel_qk->get().setArg(index++, scale); + ret |= mKernel_qk->get().setArg(index++, seq_len); + ret |= mKernel_qk->get().setArg(index++, mKv_seq_len); + ret |= mKernel_qk->get().setArg(index++, numHead); + ret |= mKernel_qk->get().setArg(index++, kvNumHead); + ret |= mKernel_qk->get().setArg(index++, headDim); + MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qk_div_mask"); + + mLocalWorkSizeQk = localWS3DDefault(mGlobalWorkSizeQk, maxWorkGroupSize, runtime, "matmul_qk_div_mask", mKernel_qk).first; + mGlobalWorkSizeQk[0] = ROUND_UP(mGlobalWorkSizeQk[0], std::max((uint32_t)1, mLocalWorkSizeQk[0])); + mGlobalWorkSizeQk[1] = ROUND_UP(mGlobalWorkSizeQk[1], std::max((uint32_t)1, mLocalWorkSizeQk[1])); + mGlobalWorkSizeQk[2] = ROUND_UP(mGlobalWorkSizeQk[2], std::max((uint32_t)1, mLocalWorkSizeQk[2])); + mQkUpdateInfo.update_kernel_args.push_back({0, 0, sizeof(mGlobalWorkSizeQk0), &mGlobalWorkSizeQk0}); + mQkUpdateInfo.update_kernel_args.push_back({0, 5, sizeof(cl_mem), &openCLBuffer(mTempQK.get())()}); + mQkUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(cl_mem), &(*(mKVCacheCLManager->key()))()}); + mQkUpdateInfo.update_kernel_args.push_back({0, 10, sizeof(mKv_seq_len), &mKv_seq_len}); + mQkGlobal_size[0] = mGlobalWorkSizeQk[0]; + mQkGlobal_size[1] = mGlobalWorkSizeQk[1]; + mQkGlobal_size[2] = mGlobalWorkSizeQk[2]; + mQkUpdateInfo.update_global_size.push_back({0, mQkGlobal_size}); + mOpRecordUpdateInfo.emplace_back(&mQkUpdateInfo); + mOpenCLBackend->recordKernel3d(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, &mQkUpdateInfo); } - if((mHeadDim % 4) != 0){ - buildOption.emplace("-DHEADDIM_LEAVE"); + + // softmax + { + int inside = 1; + int outside = numHead * seq_len; + auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast(256)); + int localSize = getLocalSize(UP_DIV(mKv_seq_len, 4), MaxLocalSize); + if(localSize < 4){ + localSize = 1; + } + + std::set buildOption; + buildOption.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize)); + mKernel_softmax = runtime->buildKernel("softmax_buf", "softmax_in1_buf", buildOption); + mGlobalWorkSizeSoftMax = {static_cast(localSize), static_cast(inside), static_cast(outside)}; + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_softmax)); + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[0]); + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[1]); + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[2]); + ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempQK.get())); + ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= mKernel_softmax->get().setArg(index++, inside); + ret |= mKernel_softmax->get().setArg(index++, outside); + ret |= mKernel_softmax->get().setArg(index++, mKv_seq_len); + MNN_CHECK_CL_SUCCESS(ret, "setArg softmax"); + + mLocalWorkSizeSoftMax = {static_cast(localSize), 1, 1}; + if(localSize == 1){ + mLocalWorkSizeSoftMax = localWS3DDefault(mGlobalWorkSizeSoftMax, maxWorkGroupSize, runtime, "softmax", mKernel_softmax).first; + } + mGlobalWorkSizeSoftMax[0] = ROUND_UP(mGlobalWorkSizeSoftMax[0], std::max((uint32_t)1, mLocalWorkSizeSoftMax[0])); + mGlobalWorkSizeSoftMax[1] = 
ROUND_UP(mGlobalWorkSizeSoftMax[1], std::max((uint32_t)1, mLocalWorkSizeSoftMax[1])); + mGlobalWorkSizeSoftMax[2] = ROUND_UP(mGlobalWorkSizeSoftMax[2], std::max((uint32_t)1, mLocalWorkSizeSoftMax[2])); + mSoftMaxUpdateInfo.update_kernel_args.push_back({0, 3, sizeof(cl_mem), &openCLBuffer(mTempQK.get())()}); + mSoftMaxUpdateInfo.update_kernel_args.push_back({0, 4, sizeof(cl_mem), &openCLBuffer(mTempSoftMax.get())()}); + mSoftMaxUpdateInfo.update_kernel_args.push_back({0, 7, sizeof(mKv_seq_len), &mKv_seq_len}); + mOpRecordUpdateInfo.emplace_back(&mSoftMaxUpdateInfo); + mOpenCLBackend->recordKernel3d(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, &mSoftMaxUpdateInfo); } - buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); - mKernel_qkv = runtime->buildKernel("attention_buf", "matmul_qkv", buildOption, inputs[0], outputs[0]); - auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qkv)); - mGlobalWorkSizeQkv = {static_cast(UP_DIV(seq_len, 4)), static_cast(mNumHead), static_cast(UP_DIV(mHeadDim, 4))}; - uint32_t index = 0; - cl_int ret = CL_SUCCESS; - ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[0]); - ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[1]); - ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[2]); - ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); - ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(value)); - ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(outputs[0])); - ret |= mKernel_qkv->get().setArg(index++, *mPastValue.get()); - ret |= mKernel_qkv->get().setArg(index++, seq_len); - ret |= mKernel_qkv->get().setArg(index++, mKv_seq_len); - ret |= mKernel_qkv->get().setArg(index++, mNumHead); - ret |= mKernel_qkv->get().setArg(index++, mKvNumHead); - ret |= mKernel_qkv->get().setArg(index++, mHeadDim); - MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qkv"); - - mLocalWorkSizeQkv = localWS3DDefault(mGlobalWorkSizeQkv, maxWorkGroupSize, runtime, "matmul_qkv", mKernel_qkv).first; - mGlobalWorkSizeQkv[0] = ROUND_UP(mGlobalWorkSizeQkv[0], std::max((uint32_t)1, mLocalWorkSizeQkv[0])); - mGlobalWorkSizeQkv[1] = ROUND_UP(mGlobalWorkSizeQkv[1], std::max((uint32_t)1, mLocalWorkSizeQkv[1])); - mGlobalWorkSizeQkv[2] = ROUND_UP(mGlobalWorkSizeQkv[2], std::max((uint32_t)1, mLocalWorkSizeQkv[2])); + // qk * value + { + std::set buildOption; + if(!mIsDecode){ + buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); + } + if((headDim % 4) != 0){ + buildOption.emplace("-DHEADDIM_LEAVE"); + } + buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); + mKernel_qkv = runtime->buildKernel("attention_buf", "matmul_qkv", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qkv)); + mGlobalWorkSizeQkv = {static_cast(UP_DIV(headDim, 4)), static_cast(numHead), static_cast(UP_DIV(seq_len, 4))}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[0]); + ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[1]); + ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[2]); + ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(value)); + ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(outputs[0])); + ret |= mKernel_qkv->get().setArg(index++, *mKVCacheCLManager->value()); + ret |= mKernel_qkv->get().setArg(index++, seq_len); + ret |= 
mKernel_qkv->get().setArg(index++, mKv_seq_len); + ret |= mKernel_qkv->get().setArg(index++, numHead); + ret |= mKernel_qkv->get().setArg(index++, kvNumHead); + ret |= mKernel_qkv->get().setArg(index++, headDim); + MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qkv"); + + mLocalWorkSizeQkv = localWS3DDefault(mGlobalWorkSizeQkv, maxWorkGroupSize, runtime, "matmul_qkv", mKernel_qkv).first; + mGlobalWorkSizeQkv[0] = ROUND_UP(mGlobalWorkSizeQkv[0], std::max((uint32_t)1, mLocalWorkSizeQkv[0])); + mGlobalWorkSizeQkv[1] = ROUND_UP(mGlobalWorkSizeQkv[1], std::max((uint32_t)1, mLocalWorkSizeQkv[1])); + mGlobalWorkSizeQkv[2] = ROUND_UP(mGlobalWorkSizeQkv[2], std::max((uint32_t)1, mLocalWorkSizeQkv[2])); + + mQkvUpdateInfo.update_kernel_args.push_back({0, 3, sizeof(cl_mem), &openCLBuffer(mTempSoftMax.get())()}); + mQkvUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(cl_mem), &(*(mKVCacheCLManager->value()))()}); + mQkvUpdateInfo.update_kernel_args.push_back({0, 8, sizeof(mKv_seq_len), &mKv_seq_len}); + mOpRecordUpdateInfo.emplace_back(&mQkvUpdateInfo); + mOpenCLBackend->recordKernel3d(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, &mQkvUpdateInfo); + } - mQkvUpdateInfo.update_kernel_args.push_back({0, 3, sizeof(cl_mem), &openCLBuffer(mTempSoftMax.get())()}); - mQkvUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(cl_mem), &(*(mPastValue.get()))()}); - mQkvUpdateInfo.update_kernel_args.push_back({0, 8, sizeof(mKv_seq_len), &mKv_seq_len}); - mOpRecordUpdateInfo.emplace_back(&mQkvUpdateInfo); - mOpenCLBackend->recordKernel3d(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, &mQkvUpdateInfo); + mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); } - mOpenCLBackend->endRecord(mRecording); - - mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); + return NO_ERROR; } -ErrorCode AttentionBufImpl::onExecute(Backend *backend, const std::vector &inputs, const std::vector &outputs) { +ErrorCode AttentionBufExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { #ifdef LOG_VERBOSE MNN_PRINT("start AttentionBufExecution onExecute !\n"); #endif - mOpenCLBackend = static_cast(backend); - reallocKVCache(); + if(mIsDecode){ + if(mKVCacheCLManager->reallocKVCache()){ + reallocKVCache(); + } + mKv_seq_len = mKVCacheCLManager->kvLength() + 1; + mGlobalWorkSizeQk0 = UP_DIV(mKv_seq_len, 4); + mQkGlobal_size[0] = ROUND_UP(mGlobalWorkSizeQk0, std::max((uint32_t)1, mLocalWorkSizeQk[0])); + mGlobalWorkSizeQk[0] = mQkGlobal_size[0]; + mKVCacheCLManager->addKvLength(); + } #ifdef ENABLE_OPENCL_TIME_PROFILER + if(mLongPrefill) { + cl::Event event0, event1; + run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime(), &event0); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_qkv", event0}); + run3DKernelDefault(mKernel_mask, mGlobalWorkSizeMask, mLocalWorkSizeMask, mOpenCLBackend->getOpenCLRuntime(), &event1); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_mask", event1}); + } { cl::Event event; run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, @@ -327,6 +744,12 @@ ErrorCode AttentionBufImpl::onExecute(Backend *backend, const std::vectorgetOpenCLRuntime()->pushEvent({"softmax", event}); } + if(mLongPrefill) { + cl::Event event; + run3DKernelDefault(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans, 
mOpenCLBackend->getOpenCLRuntime(), &event); + + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"transpose_softmax", event}); + } { cl::Event event; run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, @@ -334,49 +757,45 @@ ErrorCode AttentionBufImpl::onExecute(Backend *backend, const std::vectorgetOpenCLRuntime()->pushEvent({"matmul_qkv", event}); } + if(mLongPrefill) { + cl::Event event; + run3DKernelDefault(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip, mOpenCLBackend->getOpenCLRuntime(), &event); + + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_output", event}); + } #else if(mOpenCLBackend->isUseRecordQueue()){ mOpenCLBackend->addRecord(mRecording, mOpRecordUpdateInfo); - if(mIsDecode){ - if(mIsFirstDecode){ - mIsFirstDecode = false; - }else{ - mPastLength += 1; - mKv_seq_len = mPastLength + 1; - int past_len4 = UP_DIV(mKv_seq_len, 4); - mSoftMaxRemainChannels = past_len4 * 4 - mKv_seq_len; - mSoftmaxShape[1] = past_len4; - mGlobalWorkSizeQk2 = past_len4; - mQkGlobal_size[2] = ROUND_UP(mGlobalWorkSizeQk2, std::max((uint32_t)1, mLocalWorkSizeQk[2])); - } - } #ifdef LOG_VERBOSE MNN_PRINT("End AttentionBufExecution onExecute... \n"); #endif return NO_ERROR; } - run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime()); - run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime()); - run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime()); -#endif // decode if(mIsDecode){ - mPastLength += 1; - mKv_seq_len = mPastLength + 1; - int past_len4 = UP_DIV(mKv_seq_len, 4); - mSoftMaxRemainChannels = past_len4 * 4 - mKv_seq_len; - mSoftmaxShape[1] = past_len4; cl_int ret = CL_SUCCESS; - mGlobalWorkSizeQk2 = past_len4; - mGlobalWorkSizeQk[2] = ROUND_UP(mGlobalWorkSizeQk2, std::max((uint32_t)1, mLocalWorkSizeQk[2])); - ret |= mKernel_qk->get().setArg(2, mGlobalWorkSizeQk2); + ret |= mKernel_qk->get().setArg(0, mGlobalWorkSizeQk0); ret |= mKernel_qk->get().setArg(10, mKv_seq_len); - ret |= mKernel_softmax->get().setArg(5, mSoftMaxRemainChannels); - ret |= mKernel_softmax->get().setArg(6, mSoftmaxShape); + ret |= mKernel_softmax->get().setArg(7, mKv_seq_len); ret |= mKernel_qkv->get().setArg(8, mKv_seq_len); MNN_CHECK_CL_SUCCESS(ret, "reset arg for AttentionBufExecution"); } + if(mLongPrefill) { + run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_mask, mGlobalWorkSizeMask, mLocalWorkSizeMask, mOpenCLBackend->getOpenCLRuntime()); + } + run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime()); + if(mLongPrefill) { + run3DKernelDefault(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans, mOpenCLBackend->getOpenCLRuntime()); + } + run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime()); + if(mLongPrefill) { + run3DKernelDefault(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip, mOpenCLBackend->getOpenCLRuntime()); + } +#endif + #ifdef LOG_VERBOSE MNN_PRINT("end AttentionBufExecution onExecute !\n"); #endif @@ -385,24 +804,23 @@ ErrorCode AttentionBufImpl::onExecute(Backend *backend, const std::vector impl, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op), mImpl(impl) {} - 
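The decode branch above re-derives the matmul_qk_div_mask dispatch from the cached KV length on every generated token. The following is a minimal standalone sketch of that arithmetic; the macro bodies and the concrete numbers are assumptions for illustration, not taken from MNN's headers.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Assumed to mirror MNN's helpers: UP_DIV = ceiling division, ROUND_UP = next multiple.
    #define UP_DIV(x, y)   (((x) + (y) - 1) / (y))
    #define ROUND_UP(x, y) ((((x) + (y) - 1) / (y)) * (y))

    // One decode step: the KV length grows by one token, so the first dimension of the
    // QK dispatch (kv_seq_len packed by 4) is recomputed and re-rounded to the tuned
    // local size before the recorded kernel is replayed.
    int main() {
        uint32_t localWorkSizeQk0 = 64;    // tuned local size, assumed
        int kvLenFromCache        = 330;   // what KVCacheCLManager::kvLength() would return, assumed
        int kvSeqLen              = kvLenFromCache + 1;
        uint32_t gwsQk0           = UP_DIV(kvSeqLen, 4);
        uint32_t paddedGwsQk0     = ROUND_UP(gwsQk0, std::max((uint32_t)1, localWorkSizeQk0));
        printf("kv_seq_len=%d gws0=%u padded=%u\n", kvSeqLen, gwsQk0, paddedGwsQk0);
        return 0;
    }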
-ErrorCode AttentionBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
-    return mImpl->onResize(backend(), inputs, outputs);
+    mKVCacheCLManager.reset(new KVCacheCLManager(backend, kv_cahce));
+    mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
+    auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("softmax_buf", "softmax_buf", {"-DSOFTMAX_LOCAL_SIZE=512"});
+    mMaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel));
 }
 
-ErrorCode AttentionBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
-    return mImpl->onExecute(backend(), inputs, outputs);
+AttentionBufExecution::AttentionBufExecution(std::shared_ptr<KVCacheCLManager> manager, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op), mKVCacheCLManager(manager) {
+    mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
+    auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("softmax_buf", "softmax_buf", {"-DSOFTMAX_LOCAL_SIZE=512"});
+    mMaxWorkGroupSize = static_cast<uint32_t>(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel));
 }
 
 bool AttentionBufExecution::onClone(Backend* bn, const Op* op, Execution** dst) {
     if (nullptr == dst) {
         return true;
     }
-    *dst = new AttentionBufExecution(mImpl, op, bn);
+    *dst = new AttentionBufExecution(mKVCacheCLManager, op, bn);
     return true;
 }
diff --git a/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp b/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp
index cb33dc05d..1292ace2f 100644
--- a/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp
+++ b/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp
@@ -16,33 +16,63 @@ namespace MNN {
 namespace OpenCL {
 
-class AttentionBufImpl {
+class KVCacheCLManager {
 public:
-    AttentionBufImpl(const MNN::Op *op, Backend *backend, bool kv_cache);
+    KVCacheCLManager(Backend *backend, bool kv_cache);
 
-    ~AttentionBufImpl() {
-        if(mRecording != NULL){
-#ifdef MNN_USE_LIB_WRAPPER
-            clReleaseRecordingQCOM(mRecording);
-#endif
-        }
+    ~KVCacheCLManager() = default;
+    void allocKVCache();
+    bool reallocKVCache();
+    void setArgs(int pastLength, int numHead, int kvNumHead, int headDim){
+        mPastLength = pastLength;
+        mNumHead = numHead;
+        mKvNumHead = kvNumHead;
+        mHeadDim = headDim;
+    }
+    int kvLength() {
+        return mPastLength;
+    }
+    void addKvLength(){
+        mPastLength += 1;
+    }
+    int maxLength() {
+        return mMaxLength;
+    }
+    int numHead() {
+        return mNumHead;
+    }
+    const cl::Buffer * key() {
+        return mPastKey.get();
+    }
+    const cl::Buffer * value() {
+        return mPastValue.get();
     }
-    ErrorCode onResize(Backend *backend, const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
-    ErrorCode onExecute(Backend *backend, const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs);
 
 private:
-    int getLocalSize(int size, int maxGroupSize);
-    void allocKVCache();
-    void reallocKVCache();
     bool mKVCache;
-    float mScale;
     const int mExpandChunk = 2048;
-    bool mIsDecode = false;
-    bool mIsFirstDecode = true;
-    int mPastLength = 0, mMaxLength = 0, mKv_seq_len = 0, mSoftMaxRemainChannels = 0;
     std::shared_ptr<cl::Buffer> mPastKey, mPastValue;
-    std::shared_ptr<Tensor> mTempQK, mTempSoftMax;
-    int mNumHead = 0, mKvNumHead = 0, mHeadDim = 0, mValueH = 0;
+    int mPastLength = 0, mMaxLength = 0, mNumHead = 0, mKvNumHead = 0, mHeadDim = 0;
+    OpenCLBackend *mOpenCLBackend;
+    int mByte = 4;
+};
+
+class AttentionBufExecution : public CommonExecution {
+public:
+    AttentionBufExecution(const MNN::Op *op, Backend *backend, bool kv_cache);
+    AttentionBufExecution(std::shared_ptr<KVCacheCLManager> manager, const MNN::Op 
*op, Backend *backend); + + virtual ~AttentionBufExecution() = default; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; + +private: + + int getLocalSize(int size, int maxGroupSize); + void reallocKVCache(); + bool mIsDecode = false; + int mKv_seq_len = 0; std::shared_ptr mKernel_qk; std::shared_ptr mKernel_softmax; std::shared_ptr mKernel_qkv; @@ -57,26 +87,28 @@ class AttentionBufImpl { RecordUpdateInfo mQkUpdateInfo; RecordUpdateInfo mSoftMaxUpdateInfo; RecordUpdateInfo mQkvUpdateInfo; - int mGlobalWorkSizeQk2 = 0; + int mGlobalWorkSizeQk0 = 0; size_t mQkGlobal_size[3]; - int mSoftmaxShape[4]; - cl_recording_qcom mRecording{NULL}; std::vector mOpRecordUpdateInfo; - int mByte = 4; -}; - -class AttentionBufExecution : public CommonExecution { -public: - AttentionBufExecution(const MNN::Op *op, Backend *backend, bool kv_cache); - AttentionBufExecution(std::shared_ptr impl, const MNN::Op *op, Backend *backend); - - virtual ~AttentionBufExecution() = default; - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; - + std::shared_ptr mKVCacheCLManager; + std::shared_ptr mTempQK, mTempSoftMax; private: - std::shared_ptr mImpl; + int mAlignQ, mAlignKV, mAlignHDK, mAlignHDN; + bool mLongPrefill = false; + std::shared_ptr mKernel_rearrange; + std::vector mGlobalWorkSizeRearrg{1, 1, 1}; + std::vector mLocalWorkSizeRearrg{1, 1, 1, 1}; + std::shared_ptr mKernel_mask; + std::vector mGlobalWorkSizeMask{1, 1, 1}; + std::vector mLocalWorkSizeMask{1, 1, 1, 1}; + std::shared_ptr mKernel_trans; + std::vector mGlobalWorkSizeTrans{1, 1, 1}; + std::vector mLocalWorkSizeTrans{1, 1, 1, 1}; + std::shared_ptr mKernel_clip; + std::vector mGlobalWorkSizeClip{1, 1, 1}; + std::vector mLocalWorkSizeClip{1, 1, 1, 1}; + std::shared_ptr mTempQ, mTempK, mTempV, mTempMask, mTempQKV; + bool mIsAddMask = false; }; } // namespace OpenCL } // namespace MNN diff --git a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp index 47b75864f..94db4128e 100644 --- a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp @@ -246,19 +246,22 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector &inputs, cons auto openCLBackend = static_cast(backend()); auto output = outputs[0]; - auto inputShape0 = tensorShapeFormat(inputs[0]); - auto inputShape1 = tensorShapeFormat(inputs[1]); auto outputShape = tensorShapeFormat(output); auto runTime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); #ifdef MNN_SUPPORT_INTEL_SUBGROUP - if (runTime->isSupportedIntelSubgroup()) { + if (runTime->isSupportedIntelSubgroup() && MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(output)->dimensionFormat) { return SubgroupOnResize(inputs, outputs); } #endif /* MNN_SUPPORT_INTEL_SUBGROUP */ - int shape[4] = {outputShape[0], outputShape[1], outputShape[2], UP_DIV(outputShape[3], 4)}; int fullCount[2] = {1, 1}; fullCount[0] = realSize(inputs[0]) == 1 ? 0 : 1; fullCount[1] = realSize(inputs[1]) == 1 ? 
0 : 1; + int totalSize = 0; + if(MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(output)->dimensionFormat){ + totalSize = outputShape[0] * outputShape[1] * outputShape[2] * ROUND_UP(outputShape[3], 4); + }else{ + totalSize = outputShape[0] * outputShape[1] * outputShape[2] * outputShape[3]; + } int activationType = 0; if(mOp->type() == OpType_BinaryOp) { @@ -267,10 +270,8 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector &inputs, cons auto &unit = mUnits[0]; std::set buildOptions = mBuildOptions; - int wh_pack = 1; - if((outputShape[1]*outputShape[2]) % 4 == 0) { - wh_pack = 4; - buildOptions.emplace("-DWH_PACK4"); + if(totalSize % 4 != 0) { + buildOptions.emplace("-DPACK_LEAVE"); } if(fullCount[0] == 0) { buildOptions.emplace("-DA_SINGLE"); @@ -281,9 +282,7 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector &inputs, cons unit.kernel = runTime->buildKernel("binary_buf", "binary_buf", buildOptions, inputs[0], output); mMaxWorkGroupSize = static_cast(runTime->getMaxWorkGroupSize(unit.kernel)); - mGlobalWorkSize = {(uint32_t)UP_DIV(outputShape[3], 4) * outputShape[0], - (uint32_t)UP_DIV(outputShape[1]*outputShape[2], wh_pack)}; - + mGlobalWorkSize = {(uint32_t)UP_DIV(totalSize, 4), (uint32_t)1}; uint32_t index = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(index++, mGlobalWorkSize[0]); @@ -291,13 +290,12 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector &inputs, cons ret |= unit.kernel->get().setArg(index++, openCLBuffer(inputs[0])); ret |= unit.kernel->get().setArg(index++, openCLBuffer(inputs[1])); ret |= unit.kernel->get().setArg(index++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(index++, shape); - ret |= unit.kernel->get().setArg(index++, fullCount); + ret |= unit.kernel->get().setArg(index++, totalSize); ret |= unit.kernel->get().setArg(index++, activationType); MNN_CHECK_CL_SUCCESS(ret, "setArg BinaryBufExecution"); std::string name = "binary_buf"; - mLocalWorkSize = {(uint32_t)16, (uint32_t)16}; + mLocalWorkSize = {(uint32_t)16, (uint32_t)1}; unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; @@ -307,13 +305,6 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector &inputs, cons fullCount[1] = realSize(inputs[i]) == 1 ? 
0 : 1; auto &unit = mUnits[i-1]; - std::set buildOptions = mBuildOptions; - if((outputShape[1]*outputShape[2]) % 4 == 0) { - buildOptions.emplace("-DWH_PACK4"); - } - if(fullCount[1] == 0) { - buildOptions.emplace("-DB_SINGLE"); - } unit.kernel = runTime->buildKernel("binary_buf", "binary_buf", buildOptions, inputs[i], output); uint32_t index = 0; @@ -322,8 +313,7 @@ ErrorCode BinaryBufExecution::onEncode(const std::vector &inputs, cons ret |= unit.kernel->get().setArg(index++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(index++, openCLBuffer(inputs[i])); ret |= unit.kernel->get().setArg(index++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(index++, shape); - ret |= unit.kernel->get().setArg(index++, fullCount); + ret |= unit.kernel->get().setArg(index++, totalSize); ret |= unit.kernel->get().setArg(index++, activationType); MNN_CHECK_CL_SUCCESS(ret, "setArg BinaryBufExecution MultiInput"); @@ -341,7 +331,8 @@ class BinaryBufCreator : public OpenCLBackend::Creator { const MNN::Op *op, Backend *backend) const override { for (int i = 0; i < inputs.size(); ++i) { int channel = inputs[i]->channel(); - if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup()) { + if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup() + && MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(inputs[i])->dimensionFormat) { TensorUtils::setTensorChannelPack(inputs[i], 16); } } diff --git a/source/backend/opencl/execution/buffer/CastBufExecution.cpp b/source/backend/opencl/execution/buffer/CastBufExecution.cpp index dd4debd80..d4ab150bc 100644 --- a/source/backend/opencl/execution/buffer/CastBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/CastBufExecution.cpp @@ -13,54 +13,49 @@ namespace MNN { namespace OpenCL { CastBufExecution::CastBufExecution(const std::vector &inputs, const std::vector &outputs, const std::string& compute, const MNN::Op* op, Backend* backend) : CommonExecution(backend, op) { - mUnits.resize(1); - auto &unit = mUnits[0]; mBuildOptions.emplace(compute); - auto runtime = static_cast(backend)->getOpenCLRuntime(); - unit.kernel = runtime->buildKernel("cast_buf", "cast_buf", mBuildOptions, inputs[0], outputs[0]); - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); } ErrorCode CastBufExecution::onEncode(const std::vector& inputs, const std::vector& outputs) { + mUnits.resize(1); auto &unit = mUnits[0]; Tensor* input = inputs[0]; Tensor* output = outputs[0]; auto openCLBackend = static_cast(backend()); auto runtime = openCLBackend->getOpenCLRuntime(); - std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); - - int batch = outputShape.at(0); - int outputHeight = outputShape.at(1); - int outputWidth = outputShape.at(2); - int channels = outputShape.at(3); - - int channelBlocks = (channels + 3) / 4; - + int totalSize = 0; + if(MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(output)->dimensionFormat){ + totalSize = outputShape[0] * outputShape[1] * outputShape[2] * ROUND_UP(outputShape[3], 4); + }else{ + totalSize = outputShape[0] * outputShape[1] * outputShape[2] * outputShape[3]; + } + std::set buildOptions = mBuildOptions; + if(totalSize % 4 != 0) { + buildOptions.emplace("-DPACK_LEAVE"); + } + unit.kernel = runtime->buildKernel("cast_buf", "cast_buf", mBuildOptions, inputs[0], outputs[0]); + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mGlobalWorkSize = { - 
static_cast(outputWidth), - static_cast(outputHeight), - static_cast(batch * channelBlocks), + static_cast(UP_DIV(totalSize, 4)), + static_cast(1) }; uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, outputWidth); - ret |= unit.kernel->get().setArg(idx++, outputHeight); - ret |= unit.kernel->get().setArg(idx++, channelBlocks); + ret |= unit.kernel->get().setArg(idx++, totalSize); MNN_CHECK_CL_SUCCESS(ret, "setArg CastBufExecution"); std::string kernelName = "cast_buf"; - mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; - openCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; - + mLocalSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; + openCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalSize[0], mLocalSize[1]}; return NO_ERROR; } diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp index ba25bda93..8ba800b26 100644 --- a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp @@ -122,17 +122,26 @@ ConvBufExecution::ConvBufExecution(const std::vector &inputs, const st mPaddings[1] == 0 && mResource->mStrides[0] == 1 && mResource->mStrides[1] == 1); mResource->mConv1x1Opt = isConv1x1; - mResource->mConv1x1C8Opt = mResource->mConv1x1Opt && mResource->mOutputChannel >= 16; + if(mResource->mConv1x1Opt) { + mResource->mAlignK = 4; + mResource->mAlignN = 8; + } bool useConvGemm = isConv1x1 && mResource->mInputChannel > 32 && mResource->mOutputChannel > 64; if (useConvGemm) { - mResource->mConvGemmOptLevel = 2; + mResource->mAlignK = 4; + mResource->mAlignN = 16; + mResource->mConvGemmOptLevel = 1; + if(mResource->mOutputChannel > 1024) { + mResource->mAlignN = 128; + } else if(mResource->mOutputChannel > 512) { + mResource->mAlignN = 64; + } else if(mResource->mOutputChannel > 96) { + mResource->mAlignN = 32; + } } } if (mResource->mConv1x1Opt) { - // Tile Match with mConvGemmOptLevel == 2 - int tileK = 4; - int tileN = 32; - int buffer_size = ROUND_UP(mResource->mOutputChannel, tileN) * ROUND_UP(mResource->mInputChannel, tileK); + int buffer_size = ROUND_UP(mResource->mOutputChannel, mResource->mAlignN) * ROUND_UP(mResource->mInputChannel, mResource->mAlignK); mResource->mFilter.reset( Tensor::createDevice({buffer_size})); mOpenCLBackend->onAcquireBuffer(mResource->mFilter.get(), Backend::STATIC); @@ -153,13 +162,13 @@ ConvBufExecution::ConvBufExecution(const std::vector &inputs, const st // [Ci, Co] ( [K, N] ) for (int o = 0; o < mResource->mOutputChannel; o++) { for (int i = 0; i < mResource->mInputChannel; i++) { - ((half_float::half *)ptrCL)[i * ROUND_UP(mResource->mOutputChannel, tileN) + o] = (half_float::half)(mFilterDataPtr[o * mResource->mInputChannel + 
i]); + ((half_float::half *)ptrCL)[i * ROUND_UP(mResource->mOutputChannel, mResource->mAlignN) + o] = (half_float::half)(mFilterDataPtr[o * mResource->mInputChannel + i]); } } } else { for (int o = 0; o < mResource->mOutputChannel; o++) { for (int i = 0; i < mResource->mInputChannel; i++) { - ((float *)ptrCL)[i * ROUND_UP(mResource->mOutputChannel, tileN) + o] = (mFilterDataPtr[o * mResource->mInputChannel + i]); + ((float *)ptrCL)[i * ROUND_UP(mResource->mOutputChannel, mResource->mAlignN) + o] = (mFilterDataPtr[o * mResource->mInputChannel + i]); } } } @@ -257,6 +266,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const mOpenCLBackend->startRecord(mRecording); std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); + const int batch = outputShape.at(0); const int height = outputShape.at(1); const int width = outputShape.at(2); const int outChannel = outputShape.at(3); @@ -279,50 +289,48 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const int M = outputShape.at(0) * area; int N = outputShape.at(3); int K = inputShape.at(3); - - bool isAlign = (K % 8 == 0 && area == 1 && N % 64 == 0 && M % 64 == 0); - bool isLimitSize = (M * 1.0 / 512 * N / 512 * K / 512 <= 1.0) && (1.0 * M * K / N / N >= 16.0); - if(isAlign && isLimitSize) { - mResource->mConvGemmOptLevel = 1; - } else if(M < 128 || 1.0 * M / 512 * N / 512 * K / 256 < 1.0) { + + if(M < 128 || 1.0 * M / 512 * N / 512 * K / 256 < 1.0) { + mResource->mConvGemmOptLevel = 0; + } + if(1.0 * M * N / K / K > 100.0 || 1.0 * M * K / N / N > 100.0) { mResource->mConvGemmOptLevel = 0; } } - - if (mResource->mConvGemmOptLevel == 2) { - // set large tile - int tileM = 16; - int tileN = 32; - int tileK = 4; - + + if (mResource->mConvGemmOptLevel == 1) { int area = height * width; int M = outputShape.at(0) * area; int N = outputShape.at(3); int K = inputShape.at(3); + // set M Align + float ratio = 1.0 * M / 1024.0 * N / 1024.0 * K / 1024.0; + if(M > 1024 && ratio >= 1.0) { + mAlignM = 128; + } else if(M > 512 && ratio >= 0.1) { + mAlignM = 64; + } else if(M > 96){ + mAlignM = 32; + } else { + mAlignM = 16; + } - int alignM = ROUND_UP(M, tileM); - int alignN = ROUND_UP(N, tileN); - int alignK = ROUND_UP(K, tileK); + int alignM = ROUND_UP(M, mAlignM); + int alignN = ROUND_UP(N, mResource->mAlignN); + int alignK = ROUND_UP(K, mResource->mAlignK); // ReArrange input mConvGemmInpTensor.reset(Tensor::createDevice({alignK * alignM})); mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - - mNeedOutTempTensor = true; mConvGemmOutTensor.reset(Tensor::createDevice({alignN * alignM})); mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); - + mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); + { std::set buildOptions; - - int m_pack = 1; - if(area == 1) { - m_pack = 4; - buildOptions.emplace("-DAREA_EQUAL_1"); - } else if(outputShape.at(0) == 1) { - m_pack = 4; - buildOptions.emplace("-DBATCH_EQUAL_1"); - } + + int m_pack = 4; mPreKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", "transpose_pad", buildOptions); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mPreKernel)); mPreGlobalWorkSize = {static_cast(alignM/m_pack), static_cast(alignK/4)}; @@ -339,14 +347,14 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const ret |= 
mPreKernel->get().setArg(idx++, static_cast(area)); ret |= mPreKernel->get().setArg(idx++, openCLBuffer(input)); ret |= mPreKernel->get().setArg(idx++, openCLBuffer(mConvGemmInpTensor.get())); - MNN_CHECK_CL_SUCCESS(ret, "setArg mConvgemmOptLevel==2 PreKernel"); + MNN_CHECK_CL_SUCCESS(ret, "setArg mConvgemmOptLevel==1 PreKernel"); mPreLocalWorkSize = localWS2DDefault(mPreGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "transpose_pad", mPreKernel).first; mOpenCLBackend->recordKernel2d(mPreKernel, mPreGlobalWorkSize, mPreLocalWorkSize); mPreGlobalWorkSize[0] = ROUND_UP(mPreGlobalWorkSize[0], std::max((uint32_t)1, mPreLocalWorkSize[0])); mPreGlobalWorkSize[1] = ROUND_UP(mPreGlobalWorkSize[1], std::max((uint32_t)1, mPreLocalWorkSize[1])); } - + // call gemm strassen { mStrassenComputor.reset(new StrassenMatrixComputor(backend(), 3)); @@ -355,15 +363,19 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const } // call output transpose - if(mNeedOutTempTensor) { + { std::set buildOptions = mResource->mBuildOptions; - if(area == 1) { - buildOptions.emplace("-DAREA_EQUAL_1"); + int pack_m = 1; + if(M % 8 == 0) { + pack_m = 8; + } else if(M % 4 == 0) { + pack_m = 4; } + buildOptions.emplace("-DM_VEC=" + std::to_string(pack_m)); mPostKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", "transpose_bias", buildOptions); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mPostKernel)); - mPostGlobalWorkSize = {static_cast(M), static_cast(UP_DIV(N, 16))}; + mPostGlobalWorkSize = {static_cast(UP_DIV(M, pack_m)), static_cast(UP_DIV(N, 4))}; int offset = 0; int idx = 0; @@ -379,7 +391,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const ret |= mPostKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); ret |= mPostKernel->get().setArg(idx++, openCLBuffer(output)); - MNN_CHECK_CL_SUCCESS(ret, "setArg mConvgemmOptLevel==2 PostKernel"); + MNN_CHECK_CL_SUCCESS(ret, "setArg mConvgemmOptLevel==1 PostKernel"); mPostLocalWorkSize = localWS2DDefault(mPostGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "transpose_bias", mPostKernel).first; mOpenCLBackend->recordKernel2d(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize); mPostGlobalWorkSize[0] = ROUND_UP(mPostGlobalWorkSize[0], std::max((uint32_t)1, mPostLocalWorkSize[0])); @@ -388,146 +400,132 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const mOpenCLBackend->endRecord(mRecording); } - mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - if(mNeedOutTempTensor) { - mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); - } - return NO_ERROR; - } else if (mResource->mConvGemmOptLevel == 1) { - // set small tile - int tileM = 64; - int tileN = 64; - int tileK = 8; - int localM = 16; - int localN = 16; - int M = outputShape.at(0); - int N = outputShape.at(3); - int K = inputShape.at(3); - - std::set buildOptions = mResource->mBuildOptions;; - buildOptions.emplace(" -DBIAS"); + } else if (mResource->mConv1x1Opt) { + if(inputChannels >= 128 && outputShape[0] * outChannel * width * height <= 64){ + mResource->mConv1x1Local = true; + int local_size = 1; + while(local_size * 2 <= 256 && local_size * 2 <= inputChannelBlocks){ + local_size *= 2; + } + mGlobalWorkSize = {static_cast(local_size), static_cast(UP_DIV(outChannel, 4) * width), static_cast(outputShape[0] * height)}; + mLocalWorkSize = {static_cast(local_size), 1, 1}; + + std::set buildOption 
= mResource->mBuildOptions; + buildOption.emplace("-DCONV_LOCAL_SIZE=" + std::to_string(local_size)); + mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", "conv_2d_1x1_local", buildOption); + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; - if(N % 128 == 0) { - tileN = 128; - buildOptions.emplace(" -DOPWM=64 -DOPWN=128 -DCPWK=8 -DOPTM=4 -DOPTN=8"); + ret |= mKernel->get().setArg(idx++, UP_DIV(width, 1)); + ret |= mKernel->get().setArg(idx++, openCLBuffer(input)); + ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); + ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); + ret |= mKernel->get().setArg(idx++, openCLBuffer(output)); + ret |= mKernel->get().setArg(idx++, static_cast(inputChannelBlocks)); + ret |= mKernel->get().setArg(idx++, batch); + ret |= mKernel->get().setArg(idx++, height); + ret |= mKernel->get().setArg(idx++, width); + ret |= mKernel->get().setArg(idx++, UP_DIV(outChannel, 4)); + ret |= mKernel->get().setArg(idx++, ROUND_UP(outChannel, mResource->mAlignN)); + MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf"); } else { - buildOptions.emplace(" -DOPWM=64 -DOPWN=64 -DCPWK=8 -DOPTM=4 -DOPTN=4"); - } - - - mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_local_buf", "matmul_local_buf", buildOptions); - int out_per_thread_m = tileM / localM; - int out_per_thread_n = tileN / localN; - - mGlobalWorkSize = {static_cast(M/out_per_thread_m), static_cast(N/out_per_thread_n)}; - mLocalWorkSize = {static_cast(localM), static_cast(localN)}; - - int idx = 0; - cl_int ret = CL_SUCCESS; - ret |= mKernel->get().setArg(idx++, static_cast(M)); - ret |= mKernel->get().setArg(idx++, static_cast(N)); - ret |= mKernel->get().setArg(idx++, static_cast(K)); - ret |= mKernel->get().setArg(idx++, openCLBuffer(input)); - ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); - ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); - ret |= mKernel->get().setArg(idx++, openCLBuffer(output)); - - MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf mConvgemmOptLevel==1 Kernel Select"); - } else if (mResource->mConv1x1Opt) { + mResource->mConv1x1Local = false; + // {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1", "conv_2d_1x1_c8h1w4"}; + const int total_kernel = 3; + std::string kernelName[total_kernel] = {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1"}; + int itemC[total_kernel] = {4, 4, 4}; + int itemW[total_kernel] = {4, 2, 1}; + + int M = outputShape.at(0) * outputShape.at(1) * outputShape.at(2); + mResource->mConv1x1C8Opt = (mResource->mOutputChannel >= 16 && M >= 16 && M * mResource->mOutputChannel >= 65536); + + int actual_kernel = total_kernel; + if(mResource->mConv1x1C8Opt) { + actual_kernel = 2; + kernelName[0] = "conv_2d_1x1_c8h1w4"; + itemC[0] = 8; + itemW[0] = 4; + + kernelName[1] = "conv_2d_1x1_c8h1w2"; + itemC[1] = 8; + itemW[1] = 2; + } - int tileN = 32; - // {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1", "conv_2d_1x1_c8h1w4"}; - const int total_kernel = 3; - std::string kernelName[total_kernel] = {"conv_2d_1x1_c4h1w4", "conv_2d_1x1_c4h1w2", "conv_2d_1x1_c4h1w1"}; - int itemC[total_kernel] = {4, 4, 4}; - int itemW[total_kernel] = {4, 2, 1}; + std::shared_ptr kernel[total_kernel]; + std::vector globalWorkSize[total_kernel]; + std::vector localWorkSize[total_kernel]; + std::pair min_cost(INT_MAX, 0);//(min_time, min_index) + for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) { + std::set buildOption = 
mResource->mBuildOptions; + if(outputShape.at(3) % itemC[knl_idx] != 0){ + buildOption.emplace("-DCHANNEL_LEAVE"); + } + if((outputShape.at(2) % itemW[knl_idx]) != 0){ + buildOption.emplace("-DBLOCK_LEAVE"); + } + kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption); + uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + globalWorkSize[knl_idx] = {static_cast(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), static_cast(outputShape.at(0) * outputShape.at(1))}; + + ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]); + ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][1]); + ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(width, itemW[knl_idx])); + ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(input)); + ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); + ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); + ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(output)); + ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(inputChannelBlocks)); + ret |= kernel[knl_idx]->get().setArg(idx++, height); + ret |= kernel[knl_idx]->get().setArg(idx++, width); + ret |= kernel[knl_idx]->get().setArg(idx++, batch); + ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(outChannel, 4)); + ret |= kernel[knl_idx]->get().setArg(idx++, ROUND_UP(outChannel, mResource->mAlignN)); + + MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf Kernel Select"); + + std::pair, int> retTune; + retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx]); + if(min_cost.first > retTune.second) { + min_cost.first = retTune.second; + min_cost.second = knl_idx; + mLocalWorkSize = {retTune.first[0], retTune.first[1]}; + } + } - int actual_kernel = total_kernel; - if(mResource->mConv1x1C8Opt) { - actual_kernel = 2; - kernelName[0] = "conv_2d_1x1_c8h1w4"; - itemC[0] = 8; - itemW[0] = 4; - - kernelName[1] = "conv_2d_1x1_c8h1w2"; - itemC[1] = 8; - itemW[1] = 2; - } + std::shared_ptr quanCommon; + int min_index = min_cost.second; + mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]}; - std::shared_ptr kernel[total_kernel]; - std::vector globalWorkSize[total_kernel]; - std::vector localWorkSize[total_kernel]; - std::pair min_cost(INT_MAX, 0);//(min_time, min_index) - for(int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) { std::set buildOption = mResource->mBuildOptions; - if(outputShape.at(3) % itemC[knl_idx] != 0){ + if(outputShape.at(3) % itemC[min_index] != 0){ buildOption.emplace("-DCHANNEL_LEAVE"); } - if((outputShape.at(2) % itemW[knl_idx]) != 0){ + if((outputShape.at(2) % itemW[min_index]) != 0){ buildOption.emplace("-DBLOCK_LEAVE"); } - kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[knl_idx], buildOption); - uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); - - uint32_t idx = 0; + mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[min_index], buildOption); + uint32_t idx = 0; cl_int ret = CL_SUCCESS; - globalWorkSize[knl_idx] = {static_cast(UP_DIV(outputShape.at(3), itemC[knl_idx]) * UP_DIV(outputShape.at(2), itemW[knl_idx])), 
static_cast(outputShape.at(0) * outputShape.at(1))}; - - ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]); - ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][1]); - ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(width, itemW[knl_idx])); - ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(input)); - ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); - ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); - ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(output)); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(inputChannelBlocks)); - ret |= kernel[knl_idx]->get().setArg(idx++, height); - ret |= kernel[knl_idx]->get().setArg(idx++, width); - ret |= kernel[knl_idx]->get().setArg(idx++, UP_DIV(outChannel, 4)); - ret |= kernel[knl_idx]->get().setArg(idx++, ROUND_UP(outChannel, tileN)); - - MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf Kernel Select"); - - std::pair, int> retTune; - retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[knl_idx] + info, kernel[knl_idx]); - if(min_cost.first > retTune.second) { - min_cost.first = retTune.second; - min_cost.second = knl_idx; - mLocalWorkSize = {retTune.first[0], retTune.first[1]}; - } - } - - std::shared_ptr quanCommon; - int min_index = min_cost.second; - mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]}; - std::set buildOption = mResource->mBuildOptions; - if(outputShape.at(3) % itemC[min_index] != 0){ - buildOption.emplace("-DCHANNEL_LEAVE"); - } - if((outputShape.at(2) % itemW[min_index]) != 0){ - buildOption.emplace("-DBLOCK_LEAVE"); + ret |= mKernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= mKernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= mKernel->get().setArg(idx++, UP_DIV(width, itemW[min_index])); + ret |= mKernel->get().setArg(idx++, openCLBuffer(input)); + ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); + ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); + ret |= mKernel->get().setArg(idx++, openCLBuffer(output)); + ret |= mKernel->get().setArg(idx++, static_cast(inputChannelBlocks)); + ret |= mKernel->get().setArg(idx++, height); + ret |= mKernel->get().setArg(idx++, width); + ret |= mKernel->get().setArg(idx++, batch); + ret |= mKernel->get().setArg(idx++, UP_DIV(outChannel, 4)); + ret |= mKernel->get().setArg(idx++, ROUND_UP(outChannel, mResource->mAlignN)); + MNN_CHECK_CL_SUCCESS(ret, "setArg Conv1x1Buf"); } - mKernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("conv_2d_buf", kernelName[min_index], buildOption); - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - - ret |= mKernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= mKernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= mKernel->get().setArg(idx++, UP_DIV(width, itemW[min_index])); - ret |= mKernel->get().setArg(idx++, openCLBuffer(input)); - ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); - ret |= mKernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); - ret |= mKernel->get().setArg(idx++, openCLBuffer(output)); - ret |= mKernel->get().setArg(idx++, static_cast(inputChannelBlocks)); - ret |= mKernel->get().setArg(idx++, height); - ret |= mKernel->get().setArg(idx++, width); - ret |= mKernel->get().setArg(idx++, UP_DIV(outChannel, 4)); - ret |= mKernel->get().setArg(idx++, ROUND_UP(outChannel, tileN)); - MNN_CHECK_CL_SUCCESS(ret, 
"setArg Conv1x1Buf"); - - //printf("conv1x1 %d, %d %d, %d %d, %d %d\n", min_index, mGlobalWorkSize[0], mGlobalWorkSize[1], mLocalWorkSize[0], mLocalWorkSize[1], outChannel, width); } else { int inputImageShape[2] = {inputHeight, inputWidth}; int outputImageShape[2] = {height, width}; @@ -574,6 +572,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); ret |= kernel[knl_idx]->get().setArg(idx++, inputChannels); ret |= kernel[knl_idx]->get().setArg(idx++, inputChannelBlocks); + ret |= kernel[knl_idx]->get().setArg(idx++, batch); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(strideShape), strideShape); @@ -617,6 +616,7 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const ret |= mKernel->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); ret |= mKernel->get().setArg(idx++, inputChannels); ret |= mKernel->get().setArg(idx++, inputChannelBlocks); + ret |= mKernel->get().setArg(idx++, batch); ret |= mKernel->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= mKernel->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= mKernel->get().setArg(idx++, sizeof(strideShape), strideShape); @@ -630,9 +630,13 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const if (inputs.size() > 1) { backend()->onReleaseBuffer(mResource->mFilter.get(), Backend::DYNAMIC); } - mOpenCLBackend->recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize); - mGlobalWorkSize[0] = ROUND_UP(mGlobalWorkSize[0], std::max((uint32_t)1, mLocalWorkSize[0])); - mGlobalWorkSize[1] = ROUND_UP(mGlobalWorkSize[1], std::max((uint32_t)1, mLocalWorkSize[1])); + if (mResource->mConv1x1Opt && mResource->mConv1x1Local){ + mOpenCLBackend->recordKernel3d(mKernel, mGlobalWorkSize, mLocalWorkSize); + }else{ + mOpenCLBackend->recordKernel2d(mKernel, mGlobalWorkSize, mLocalWorkSize); + mGlobalWorkSize[0] = ROUND_UP(mGlobalWorkSize[0], std::max((uint32_t)1, mLocalWorkSize[0])); + mGlobalWorkSize[1] = ROUND_UP(mGlobalWorkSize[1], std::max((uint32_t)1, mLocalWorkSize[1])); + } mOpenCLBackend->endRecord(mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end ConvExecution onResize !\n"); @@ -663,11 +667,15 @@ ErrorCode ConvBufExecution::onExecute(const std::vector &inputs, const mOpenCLBackend->getOpenCLRuntime()->pushEvent({"ConvBuf2D-gemm2-0", event0}); } - if(mResource->mConvGemmOptLevel == 2) { + if(mResource->mConvGemmOptLevel == 1) { mStrassenComputor->onExecute(); } else { cl::Event event; - runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event); + if (mResource->mConv1x1Opt && mResource->mConv1x1Local){ + run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event); + } else{ + runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event); + } std::string name = "ConvBuf2D"; std::string b = std::to_string(inputs[0]->batch()); std::string ci = std::to_string(inputs[0]->channel()); @@ -708,11 +716,14 @@ ErrorCode ConvBufExecution::onExecute(const std::vector &inputs, const if (mPreKernel) { runKernel2D(mPreKernel, mPreGlobalWorkSize, mPreLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); } - - if(mResource->mConvGemmOptLevel == 2) { + if(mResource->mConvGemmOptLevel == 1) { 
mStrassenComputor->onExecute(); } else { - runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + if (mResource->mConv1x1Opt && mResource->mConv1x1Local){ + run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + } else{ + runKernel2D(mKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); + } } if (mPostKernel) { runKernel2D(mPostKernel, mPostGlobalWorkSize, mPostLocalWorkSize, mOpenCLBackend->getOpenCLRuntime()); @@ -739,7 +750,7 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator { const int outputChannel = outputShape.at(3); const int inputChannels = inputShape.at(3); #ifdef MNN_LOW_MEMORY - { + if (static_cast(backend)->getMemory() == BackendConfig::Memory_Low){ auto conv2dParams = op->main_as_Convolution2D(); if (conv2dParams->quanParameter() != nullptr) { if (((conv2dParams->quanParameter()->type() == 4) || @@ -749,6 +760,12 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator { // Don't support IDST-int8 because of error return nullptr; } + for (int i = 0; i < inputs.size(); ++i) { + TensorUtils::setTensorSupportPack(inputs[i], false); + } + for (int i = 0; i < outputs.size(); ++i) { + TensorUtils::setTensorSupportPack(outputs[i], false); + } return new ConvBufLowMemoryExecution(inputs, outputs, op, backend); } else { //MNN_ERROR("OpenCL Conv buf low memory init error. For Opencl Backend, only support low memory mode of int8 or int4 dequantization currently.\n"); diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.hpp b/source/backend/opencl/execution/buffer/ConvBufExecution.hpp index 96e1ec5aa..b8edea9ef 100644 --- a/source/backend/opencl/execution/buffer/ConvBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/ConvBufExecution.hpp @@ -35,6 +35,7 @@ struct ConvBufResource { std::set mBuildOptions; bool mConv1x1Opt = false; bool mConv1x1C8Opt = false; + bool mConv1x1Local = false; /* 0 -> not use 1 -> use small tile @@ -44,6 +45,8 @@ struct ConvBufResource { std::shared_ptr mRasterExe; bool mUseImage = false; int mNumQuantBit = 0; + int mAlignK = 1; + int mAlignN = 1; }; class ConvBufCommonExecution { @@ -76,7 +79,6 @@ class ConvBufExecution : public ConvBufCommonExecution, public CommonExecution { std::shared_ptr mKernel; std::shared_ptr mConvGemmInpTensor; std::shared_ptr mConvGemmOutTensor; - bool mNeedOutTempTensor = false; std::shared_ptr mPreKernel = nullptr; std::vector mPreGlobalWorkSize{1, 1, 1}; std::vector mPreLocalWorkSize{1, 1, 1, 1}; @@ -84,8 +86,9 @@ class ConvBufExecution : public ConvBufCommonExecution, public CommonExecution { std::vector mPostGlobalWorkSize{1, 1, 1}; std::vector mPostLocalWorkSize{1, 1, 1, 1}; const float* mFilterDataPtr = nullptr; + private: - + int mAlignM = 1; std::shared_ptr mStrassenComputor; }; diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp index c932c0a6c..d31462301 100644 --- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp @@ -166,13 +166,12 @@ void ConvBufLowMemoryExecution::set1x1WeightLowMemory(int packCout, int packCin, mResource->mUseImage = true; } if(mResource->mUseImage) { - size_t w = ROUND_UP(mResource->mOutputChannel, packCout); - size_t h = UP_DIV(mResource->mInputChannel, packCin); if(mResource->mNumQuantBit == 4){ - mResource->mKernelImage.reset(new 
cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_UNSIGNED_INT16), w, h, 0, nullptr, &res)); - }else{ - mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_SIGNED_INT32), w, h, 0, nullptr, &res)); + packCin *= 2; } + size_t w = ROUND_UP(mResource->mOutputChannel, packCout); + size_t h = UP_DIV(mResource->mInputChannel, packCin); + mResource->mKernelImage.reset(new cl::Image2D(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE, cl::ImageFormat(CL_RGBA, CL_SIGNED_INT32), w, h, 0, nullptr, &res)); if (nullptr == mResource->mKernelImage.get() || res != CL_SUCCESS) { MNN_ERROR("Alloc Image %d x %d error, code:%d \n", (int)w, (int)h, (int)res); } @@ -229,9 +228,11 @@ void ConvBufLowMemoryExecution::setGeneralWeightLowMemory(void* filterDataPtr, s } // select the fastest kernel for the general cases by tuning void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor * output) { + mUnits.resize(1); auto &unit = mUnits[0]; std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); + const int batch = outputShape.at(0); const int height = outputShape.at(1); const int width = outputShape.at(2); const int outChannel = outputShape.at(3); @@ -286,6 +287,7 @@ void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); ret |= kernel[knl_idx]->get().setArg(idx++, inputChannels); ret |= kernel[knl_idx]->get().setArg(idx++, inputChannelBlocks); + ret |= kernel[knl_idx]->get().setArg(idx++, batch); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(strideShape), strideShape); @@ -331,6 +333,7 @@ void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor ret |= unit.kernel->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); ret |= unit.kernel->get().setArg(idx++, inputChannels); ret |= unit.kernel->get().setArg(idx++, inputChannelBlocks); + ret |= unit.kernel->get().setArg(idx++, batch); ret |= unit.kernel->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= unit.kernel->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= unit.kernel->get().setArg(idx++, sizeof(strideShape), strideShape); @@ -346,9 +349,171 @@ void ConvBufLowMemoryExecution::tuneGeneralCaseLowMemory(Tensor * input, Tensor unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; return; } -unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * output) { + +// weight inverse quantization, use xgemm opt +void ConvBufLowMemoryExecution::useFPWeightGemmLowMemory(Tensor * input, Tensor * output) { + mUnits.resize(3); + auto runtime = mOpenCLBackend->getOpenCLRuntime(); + std::vector inputShape = tensorShapeFormat(input); + std::vector outputShape = tensorShapeFormat(output); + int channelPack = 16; + if(mResource->mUseImage && mResource->mNumQuantBit == 4){ + channelPack = 32; + } + int area = inputShape.at(1) * inputShape.at(2); + int M = outputShape.at(0) * area; + int N = mResource->mOutputChannel; + int K = mResource->mInputChannel; + int mAlignK = 4; + int mAlignN = 16; + int mAlignM = 64; + + // set M Align and N Align + if(mResource->mOutputChannel > 1024) { 
+ mAlignN = 128; + } else if(mResource->mOutputChannel > 512) { + mAlignN = 64; + } else if(mResource->mOutputChannel > 96) { + mAlignN = 32; + } + float ratio = 1.0 * M / 1024.0 * N / 1024.0 * K / 1024.0; + if(M > 1024 && ratio >= 1.0) { + mAlignM = 128; + } else if(M > 512 && ratio >= 0.1) { + mAlignM = 64; + } else if(M > 96){ + mAlignM = 32; + } else { + mAlignM = 16; + } + int alignM = ROUND_UP(M, mAlignM); + int alignN = ROUND_UP(N, mAlignN); + int alignK = ROUND_UP(K, mAlignK); + int blockDim = mResource->mInputChannel / mResource->mBlockSize; + + // alloc temp bufer + mConvGemmWeightTensor.reset(Tensor::createDevice({ROUND_UP(mResource->mOutputChannel, mAlignN) * ROUND_UP(mResource->mInputChannel, std::max(mAlignK, channelPack))})); + mConvGemmInpTensor.reset(Tensor::createDevice({alignK * alignM})); + mConvGemmOutTensor.reset(Tensor::createDevice({alignN * alignM})); + mOpenCLBackend->onAcquireBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); + + //weight inverse quantization and rearrange + { + auto &unit = mUnits[0]; + int outputChannelAlign = ROUND_UP(mResource->mOutputChannel, alignN); + int outputChannel4Align = ROUND_UP(mResource->mOutputChannel, 4); + std::set buildOption = mResource->mBuildOptions; + if(mResource->mUseImage){ + buildOption.emplace("-DUSE_IMAGE"); + } + mGlobalWorkSize = {static_cast(UP_DIV(mResource->mInputChannel, channelPack)), static_cast(UP_DIV(mResource->mOutputChannel, 4))}; + unit.kernel = runtime->buildKernel("gemm_conv1x1_buf", "inverse_quant_weight", buildOption); + uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + if(mResource->mUseImage){ + ret |= unit.kernel->get().setArg(idx++, *mResource->mKernelImage.get()); + }else{ + ret |= unit.kernel->get().setArg(idx++, *mResource->mKernelBuffer.get()); + } + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mResource->dequantScaleOffset.get())); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mConvGemmWeightTensor.get())); + ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannelAlign)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannel4Align)); + ret |= unit.kernel->get().setArg(idx++, static_cast(blockDim)); + MNN_CHECK_CL_SUCCESS(ret, "setArg inverse_quant_weight"); + + mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, maxWorkGroupSize, runtime, "inverse_quant_weight", unit.kernel).first; + mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; + } + + // rearange input + { + auto &unit = mUnits[1]; + std::set buildOptions = mResource->mBuildOptions; + + int m_pack = 4; + mGlobalWorkSize = {static_cast(alignM/m_pack), static_cast(alignK/4)}; + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_buf", "transpose_pad", buildOptions); + uint32_t maxWorkGroupSize = 
static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); + + int offset = 0; + int idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, static_cast(mGlobalWorkSize[0])); + ret |= unit.kernel->get().setArg(idx++, static_cast(mGlobalWorkSize[1])); + ret |= unit.kernel->get().setArg(idx++, static_cast(alignM)); + ret |= unit.kernel->get().setArg(idx++, static_cast(alignK)); + ret |= unit.kernel->get().setArg(idx++, static_cast(M)); + ret |= unit.kernel->get().setArg(idx++, static_cast(K)); + ret |= unit.kernel->get().setArg(idx++, static_cast(area)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mConvGemmInpTensor.get())); + MNN_CHECK_CL_SUCCESS(ret, "setArg transpose_pad"); + mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, maxWorkGroupSize, runtime, "transpose_pad", unit.kernel).first; + + mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; + } + + // call gemm strassen + { + mStrassenComputor.reset(new StrassenMatrixComputor(backend(), 3)); + mStrassenComputor->onEncode(alignM, alignK, alignN, alignM, alignN, alignN, openCLBuffer(mConvGemmInpTensor.get()), openCLBuffer(mConvGemmWeightTensor.get()), openCLBuffer(mConvGemmOutTensor.get()), false, openCLBuffer(mResource->mBias.get())); + } + + // call output transpose + { + auto &unit = mUnits[2]; + std::set buildOptions = mResource->mBuildOptions; + int pack_m = 1; + if(M % 8 == 0) { + pack_m = 8; + } else if(M % 4 == 0) { + pack_m = 4; + } + buildOptions.emplace("-DM_VEC=" + std::to_string(pack_m)); + unit.kernel = runtime->buildKernel("gemm_buf", "transpose_bias", buildOptions); + uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + + mGlobalWorkSize = {static_cast(UP_DIV(M, pack_m)), static_cast(UP_DIV(N, 4))}; + + int offset = 0; + int idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, static_cast(mGlobalWorkSize[0])); + ret |= unit.kernel->get().setArg(idx++, static_cast(mGlobalWorkSize[1])); + ret |= unit.kernel->get().setArg(idx++, static_cast(alignM)); + ret |= unit.kernel->get().setArg(idx++, static_cast(alignN)); + ret |= unit.kernel->get().setArg(idx++, static_cast(M)); + ret |= unit.kernel->get().setArg(idx++, static_cast(N)); + ret |= unit.kernel->get().setArg(idx++, static_cast(area)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mConvGemmOutTensor.get())); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); + + MNN_CHECK_CL_SUCCESS(ret, "setArg transpose_bias"); + mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, maxWorkGroupSize, runtime, "transpose_bias", unit.kernel).first; + mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; + } + + return; +} +void ConvBufLowMemoryExecution::tuneGemvLowMemory(Tensor * input, Tensor * output) { + mUnits.resize(1); auto &unit = mUnits[0]; - unsigned int total_time = 0; std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); const int outChannel = outputShape.at(3); @@ -361,20 +526,17 @@ unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * 
input, Tensor const int blockNum = mResource->mBlockSize; const int blockDim = mResource->mInputChannel / mResource->mBlockSize; - int global_y = batch * height; - const int total_kernel = 5; - std::string kernelName[total_kernel] = {"gemm_conv_c1_buf", "gemm_conv_c2_buf", "gemm_conv_c4_buf", "gemm_conv_c1_image", "gemm_conv_c2_image"}; - int itemC[total_kernel] = {1, 2, 4, 1, 2}; + int global_y = batch * height * width; + const int total_kernel = 3; + std::string kernelName[total_kernel] = {"gemv_conv_c1_buf", "gemv_conv_c2_buf", "gemv_conv_c4_buf"}; + int itemC[total_kernel] = {1, 2, 4}; int actual_kernel = total_kernel; std::shared_ptr kernel[total_kernel]; std::vector globalWorkSize[total_kernel]; std::vector localWorkSize[total_kernel]; std::pair min_cost(INT_MAX, 0);//(min_time, min_index) std::set buildOption = mResource->mBuildOptions; - if(width == 1 && height == 1){ - buildOption.emplace("-DWIDTH_HEIGHT_1"); - } - + if(blockDim % 16 != 0){ buildOption.emplace("-DINPUT_CHANNEL_LEAVE"); } else if (mResource->mUseImage && mResource->mNumQuantBit == 4 && blockDim % 32 != 0) { @@ -382,22 +544,15 @@ unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor buildOption.emplace("-DINPUT_CHANNEL_LEAVE"); } std::string info = std::to_string(inputChannels) + "_" + std::to_string(outChannel); - if(batch > 1){ - global_y = UP_DIV(batch, 4) * height; - buildOption.emplace("-DBACTH_BLOCK4"); - info += "_BATCH_BLOCK4"; - } - int knl_idx = 0; - actual_kernel = 3; if(mResource->mUseImage){ - knl_idx = 3; - actual_kernel = total_kernel; + buildOption.emplace("-DUSE_IMAGE"); } - for (; knl_idx < actual_kernel; knl_idx++) { - kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemv_conv1x1_buf", kernelName[knl_idx], buildOption); + for (int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) { + auto option = buildOption; + option.emplace("-DTILE_N=" + std::to_string(itemC[knl_idx])); + kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemv_conv1x1_buf", kernelName[knl_idx], option); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); - - globalWorkSize[knl_idx] = {static_cast(UP_DIV(outChannel, itemC[knl_idx]) * width), static_cast(global_y)}; + globalWorkSize[knl_idx] = {static_cast(UP_DIV(outChannel, itemC[knl_idx])), static_cast(global_y)}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]); @@ -414,9 +569,7 @@ unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(outputChannelBlocks)); ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(inputChannelBlocks)); ret |= kernel[knl_idx]->get().setArg(idx++, inputChannels); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(batch)); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(height)); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(width)); + ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(global_y)); ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(blockNum)); ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(blockDim)); MNN_CHECK_CL_SUCCESS(ret, "setArg gemv_conv1x1_buf Kernel Select"); @@ -428,13 +581,11 @@ unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor mLocalWorkSize = {retTune.first[0], retTune.first[1]}; } } - total_time += min_cost.first; int min_index = min_cost.second; mGlobalWorkSize = 
{globalWorkSize[min_index][0], globalWorkSize[min_index][1]}; - - + + buildOption.emplace("-DTILE_N=" + std::to_string(itemC[min_index])); unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemv_conv1x1_buf", kernelName[min_index], buildOption); - //MNN_PRINT("Kernel is %d.\n", min_index); uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); @@ -451,35 +602,37 @@ unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannelBlocks)); ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannelBlocks)); ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannels)); - ret |= unit.kernel->get().setArg(idx++, static_cast(batch)); - ret |= unit.kernel->get().setArg(idx++, static_cast(height)); - ret |= unit.kernel->get().setArg(idx++, static_cast(width)); + ret |= unit.kernel->get().setArg(idx++, static_cast(global_y)); ret |= unit.kernel->get().setArg(idx++, static_cast(blockNum)); ret |= unit.kernel->get().setArg(idx++, static_cast(blockDim)); MNN_CHECK_CL_SUCCESS(ret, "setArg gemv_conv1x1_buf"); mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; - return total_time; + return; } -unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, Tensor * output) { +unsigned int ConvBufLowMemoryExecution::tuneGemmLowMemory(Tensor * input, Tensor * output, std::string option, bool onlyGetTime) { mUnits.resize(3); unsigned int total_time = 0; std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); + int channelPack = 16; + if(mResource->mUseImage && mResource->mNumQuantBit == 4){ + channelPack = 32; + } const int outChannel = outputShape.at(3); const int inputChannels = inputShape.at(3); const int batch = outputShape.at(0); const int width_height = outputShape.at(1) * outputShape.at(2); - const int inputChannelBlocks = UP_DIV(inputChannels, 4); - const int outputChannelBlocks = UP_DIV(outChannel, 4); + const int inputChannelAlign = ROUND_UP(inputChannels, channelPack); + const int outputChannelAlign = ROUND_UP(outChannel, 4); const int blockNum = mResource->mBlockSize; const int blockDim = mResource->mInputChannel / mResource->mBlockSize; - - int global_y = UP_DIV(batch, 4) * width_height; - const int total_kernel = 6; - std::string kernelName[total_kernel] = {"gemm_b4_c1_buf", "gemm_b4_c2_buf", "gemm_b4_c4_buf", "gemm_b4_c1_image", "gemm_b4_c2_image", "gemm_b4_c4_image"}; - int itemC[total_kernel] = {1, 2, 4, 1, 2, 4}; + + int global_y = batch * width_height; + const int total_kernel = 3; + std::string kernelName[total_kernel] = {"gemm_b4_c1_buf", "gemm_b4_c2_buf", "gemm_b4_c4_buf"}; + int itemC[total_kernel] = {1, 2, 4}; int actual_kernel = total_kernel; std::shared_ptr kernel[total_kernel]; std::vector globalWorkSize[total_kernel]; @@ -492,9 +645,13 @@ unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, T // Image weight-int4 use load32 buildOption.emplace("-DINPUT_CHANNEL_LEAVE"); } - std::string info = std::to_string(inputChannels) + "_" + std::to_string(outChannel); + buildOption.emplace(option); + if(mResource->mUseImage){ + buildOption.emplace("-DUSE_IMAGE"); + } + std::string info = std::to_string(inputChannels) + "_" + std::to_string(outChannel) + option; // mResource->mInputChannel ROUND_UP to 
blockDim, avoid gemm overstep - mConvGemmInpTensor.reset(Tensor::createDevice({ROUND_UP(batch, 4) * ROUND_UP(ROUND_UP(mResource->mInputChannel, 4), blockDim) * width_height})); + mConvGemmInpTensor.reset(Tensor::createDevice({ROUND_UP(batch, 4) * inputChannelAlign * width_height})); mConvGemmOutTensor.reset(Tensor::createDevice({ROUND_UP(batch, 4) * ROUND_UP(mResource->mOutputChannel, 4) * width_height})); mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); @@ -504,43 +661,37 @@ unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, T // reshape n*c/4*4*hw -> n/4*hw*c*4 { auto &unit = mUnits[0]; - mGlobalWorkSize = {static_cast(UP_DIV(mResource->mInputChannel, 4)), static_cast(UP_DIV(batch, 4)), static_cast(width_height)}; - unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", "reshape_nchw4_nhwc4", buildOption); + mGlobalWorkSize = {static_cast(UP_DIV(inputChannelAlign, 4)), static_cast(UP_DIV(global_y, 4))}; + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_conv1x1_buf", "reshape_nchw4_nhwc4", buildOption); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mConvGemmInpTensor.get())); - ret |= unit.kernel->get().setArg(idx++, static_cast(width_height)); - ret |= unit.kernel->get().setArg(idx++, static_cast(batch)); + ret |= unit.kernel->get().setArg(idx++, static_cast(global_y)); ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannels)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannelBlocks)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannelAlign)); MNN_CHECK_CL_SUCCESS(ret, "setArg reshape_nc4_cn4"); - std::pair, unsigned int> retTune = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nchw4_nhwc4", unit.kernel); + std::pair, unsigned int> retTune = localWS2DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nchw4_nhwc4", unit.kernel); total_time += retTune.second; mLocalWorkSize = retTune.first; - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + if(false == onlyGetTime){ + mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + } + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; } // gemm { auto &unit = mUnits[1]; - int knl_idx = 0; - actual_kernel = 3; - if(mResource->mUseImage){ - knl_idx = 3; - actual_kernel = total_kernel; - } - for (; knl_idx < actual_kernel; knl_idx++) { - kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", kernelName[knl_idx], buildOption); + for (int knl_idx = 0; knl_idx < actual_kernel; knl_idx++) { + kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_conv1x1_buf", kernelName[knl_idx], buildOption); 
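// The selection loop here follows the common "auto-tune by timing candidates" pattern:
// build each kernel variant, time it once, keep the cheapest. A minimal sketch of that
// pattern, with a caller-supplied timing callback standing in for localWS2DDefault
// (all names below are illustrative, not MNN API):
#include <functional>
#include <limits>
#include <string>
#include <vector>

struct TuneResult {
    size_t bestIndex = 0;
    unsigned int bestCostUs = std::numeric_limits<unsigned int>::max();
};

// Time every candidate once and keep the cheapest; ties keep the earlier candidate.
static TuneResult pickFastestCandidate(const std::vector<std::string>& candidates,
                                       const std::function<unsigned int(const std::string&)>& timeOnce) {
    TuneResult result;
    for (size_t i = 0; i < candidates.size(); ++i) {
        const unsigned int cost = timeOnce(candidates[i]);
        if (cost < result.bestCostUs) {
            result.bestCostUs = cost;
            result.bestIndex  = i;
        }
    }
    return result;
}
// Usage idea: pickFastestCandidate({"gemm_b4_c1_buf", "gemm_b4_c2_buf", "gemm_b4_c4_buf"},
//     [&](const std::string& name) { /* build kernel, set args, time one launch */ return 0u; });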
uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); - - globalWorkSize[knl_idx] = {static_cast(UP_DIV(outChannel, itemC[knl_idx])), static_cast(global_y)}; + + globalWorkSize[knl_idx] = {static_cast(UP_DIV(outChannel, itemC[knl_idx])), static_cast(UP_DIV(global_y, 4))}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= kernel[knl_idx]->get().setArg(idx++, globalWorkSize[knl_idx][0]); @@ -554,8 +705,9 @@ unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, T ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->dequantScaleOffset.get())); ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mConvGemmOutTensor.get())); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(outputChannelBlocks)); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(inputChannelBlocks)); + ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(UP_DIV(global_y, 4) * 4)); + ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(outputChannelAlign)); + ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(inputChannelAlign)); ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(blockNum)); ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(blockDim)); MNN_CHECK_CL_SUCCESS(ret, "setArg gemv_conv1x1_buf Kernel Select"); @@ -572,8 +724,7 @@ unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, T mGlobalWorkSize = {globalWorkSize[min_index][0], globalWorkSize[min_index][1]}; - unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", kernelName[min_index], buildOption); - //MNN_PRINT("Kernel is %d.\n", min_index); + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_conv1x1_buf", kernelName[min_index], buildOption); uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); @@ -587,38 +738,41 @@ unsigned int ConvBufLowMemoryExecution::tuneGemvBatchLowMemory(Tensor * input, T ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mResource->dequantScaleOffset.get())); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mConvGemmOutTensor.get())); - ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannelBlocks)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannelBlocks)); + ret |= unit.kernel->get().setArg(idx++, static_cast(UP_DIV(global_y, 4) * 4)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannelAlign)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannelAlign)); ret |= unit.kernel->get().setArg(idx++, static_cast(blockNum)); ret |= unit.kernel->get().setArg(idx++, static_cast(blockDim)); - MNN_CHECK_CL_SUCCESS(ret, "setArg gemv_conv1x1_buf"); - mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + MNN_CHECK_CL_SUCCESS(ret, "setArg gemm_conv1x1_buf"); + if(false == onlyGetTime){ + mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + } unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; } // reshape n/4*hw*c*4 -> n*c/4*4*hw { auto &unit = mUnits[2]; - mGlobalWorkSize = {static_cast(UP_DIV(mResource->mOutputChannel, 4)), static_cast(UP_DIV(batch, 4)), static_cast(width_height)}; - unit.kernel = 
mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_quant_batch_buf", "reshape_nhwc4_nchw4", buildOption); + mGlobalWorkSize = {static_cast(UP_DIV(mResource->mOutputChannel, 4)), static_cast(UP_DIV(global_y, 4))}; + unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("gemm_conv1x1_buf", "reshape_nhwc4_nchw4", buildOption); uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(unit.kernel)); uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mConvGemmOutTensor.get())); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, static_cast(width_height)); - ret |= unit.kernel->get().setArg(idx++, static_cast(batch)); - ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannelBlocks)); + ret |= unit.kernel->get().setArg(idx++, static_cast(global_y)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outputChannelAlign)); MNN_CHECK_CL_SUCCESS(ret, "setArg reshape_cn4_nc4"); - std::pair, unsigned int> retTune = localWS3DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nhwc4_nchw4", unit.kernel); + std::pair, unsigned int> retTune = localWS2DDefault(mGlobalWorkSize, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "reshape_nhwc4_nchw4", unit.kernel); mLocalWorkSize = retTune.first; total_time += retTune.second; - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + if(false == onlyGetTime){ + mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + } + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; } return total_time; } @@ -695,10 +849,12 @@ bool ConvBufLowMemoryExecution::onClone(Backend* bn, const Op* op, Execution** d return true; } -ErrorCode ConvBufLowMemoryExecution::onEncode(const std::vector &inputs, const std::vector &outputs) { +ErrorCode ConvBufLowMemoryExecution::onResize(const std::vector &inputs, const std::vector &outputs) { #ifdef LOG_VERBOSE MNN_PRINT("Start ConvBufLowMemoryExecution onResize !\n"); #endif + auto runTime = mOpenCLBackend->getOpenCLRuntime(); + mOpenCLBackend->startRecord(mRecording); mUnits.resize(1); auto input = inputs[0]; auto output = outputs[0]; @@ -707,30 +863,138 @@ ErrorCode ConvBufLowMemoryExecution::onEncode(const std::vector &input mPaddings[1] = padding.first;//padX // onclone default use conv1x1Opt, need reset std::vector outputShape = tensorShapeFormat(output); - const int batch = outputShape.at(0); - auto runTime = mOpenCLBackend->getOpenCLRuntime(); + const int batch = outputShape.at(0) * outputShape.at(1) * outputShape.at(2); + mUseFPWeight = false; if (mResource->mConv1x1Opt) { - if(batch > 1 && false == getPreParamInfo("ConvBufLowMemoryPreArrangeMode", &batchConvMode, runTime)){ - if(tuneGemvBatchLowMemory(input, output) < tuneGemmLowMemory(input, output)){ - batchConvMode = 1; - } else{ - batchConvMode = 2; + if(batch == 1){ + tuneGemvLowMemory(input, output); + } else { + if(batch > 512){ + useFPWeightGemmLowMemory(input, output); + mUseFPWeight = true; + } + else 
if(false == getPreParamInfo("ConvBufLowMemoryPreArrangeMode", &batchConvMode, runTime)){ + if(tuneGemmLowMemory(input, output, "-DFORMAT_CNHW", true) < tuneGemmLowMemory(input, output, "", true)){ + batchConvMode = 1; + } else{ + batchConvMode = 2; + } + setPreParamInfo("ConvBufLowMemoryPreArrangeMode", batchConvMode, runTime); + } else { + std::string option = ""; + if(1 == batchConvMode){ + option = "-DFORMAT_CNHW"; + } + tuneGemmLowMemory(input, output, option); } - setPreParamInfo("ConvBufLowMemoryPreArrangeMode", batchConvMode, runTime); - } - if(batch > 1 && batchConvMode == 1){ - tuneGemvBatchLowMemory(input, output); - }else{ - tuneGemmLowMemory(input, output); } } else { tuneGeneralCaseLowMemory(input, output); } + for (auto &unit : mUnits) { + bool lws_null = true; + for (size_t i = 0; i < unit.globalWorkSize.dimensions(); ++i) { + unit.globalWorkSize.get()[i] = ROUND_UP(unit.globalWorkSize.get()[i], std::max((size_t)1, unit.localWorkSize.get()[i])); + if(unit.localWorkSize.get()[i] != 0) { + lws_null = false; + } + } + if(lws_null){ + unit.localWorkSize = cl::NullRange; + } + } + mOpenCLBackend->endRecord(mRecording); #ifdef LOG_VERBOSE MNN_PRINT("end ConvBufLowMemoryExecution onResize !\n"); #endif return NO_ERROR; } + +ErrorCode ConvBufLowMemoryExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { +#ifdef LOG_VERBOSE + MNN_PRINT("Start ConvBufLowMemoryExecution onExecute !\n"); +#endif + auto runtime = mOpenCLBackend->getOpenCLRuntime(); +#ifdef ENABLE_OPENCL_TIME_PROFILER + int idx = 0; +#else + if(mOpenCLBackend->isUseRecordQueue()){ + mOpenCLBackend->addRecord(mRecording, mOpRecordUpdateInfo); + return NO_ERROR; + } +#endif + auto res = CL_SUCCESS; + if(mUseFPWeight){ + // arrange input and weight + int i = 0; + for (; i < 2; ++i){ + auto unit = mUnits[i]; + #ifdef ENABLE_OPENCL_TIME_PROFILER + cl::Event event; + res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize, + nullptr, + &event); + runtime->pushEvent({EnumNameOpType(mOpType) + std::to_string(idx++), event}); + #else + res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize); + #endif + MNN_CHECK_CL_SUCCESS(res, EnumNameOpType(mOp->type())); + } + // call gemm execute + mStrassenComputor->onExecute(); + + // rearrange output + for (; i < mUnits.size(); ++i){ + auto unit = mUnits[i]; + #ifdef ENABLE_OPENCL_TIME_PROFILER + cl::Event event; + res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize, + nullptr, + &event); + runtime->pushEvent({EnumNameOpType(mOpType) + std::to_string(idx++), event}); + #else + res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize); + #endif + MNN_CHECK_CL_SUCCESS(res, EnumNameOpType(mOp->type())); + } + }else{ + for (auto &unit : mUnits) { + #ifdef ENABLE_OPENCL_TIME_PROFILER + cl::Event event; + res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize, + nullptr, + &event); + runtime->pushEvent({EnumNameOpType(mOpType) + std::to_string(idx++), event}); + #else + res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), + cl::NullRange, + unit.globalWorkSize, + unit.localWorkSize); + #endif + MNN_CHECK_CL_SUCCESS(res, EnumNameOpType(mOp->type())); + } + } +#ifdef LOG_VERBOSE 
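// The resize path above picks one of three 1x1-convolution strategies from the flattened
// GEMM "M" size (batch * height * width). A compact sketch of that decision, with the
// thresholds taken from the code above and illustrative names (not MNN API):
enum class Conv1x1Strategy {
    Gemv,          // M == 1: matrix-vector product directly on the quantized weights
    FpWeightGemm,  // large M: dequantize weights once, then Strassen/tiled FP GEMM
    QuantGemm      // otherwise: quantized GEMM; input layout (CNHW vs default) chosen by tuning
};

static Conv1x1Strategy pickConv1x1Strategy(int batch, int height, int width) {
    const int m = batch * height * width;
    if (m == 1)  return Conv1x1Strategy::Gemv;
    if (m > 512) return Conv1x1Strategy::FpWeightGemm;
    return Conv1x1Strategy::QuantGemm;
}
// Design note: the GEMV path skips the reshape kernels entirely, while the FP-weight path
// spends extra memory on a dequantized weight buffer so that one large GEMM can run at
// full floating-point tile sizes; the middle range keeps quantized weights and only tunes
// the input arrangement, caching the choice across sessions.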
+ MNN_PRINT("end ConvBufLowMemoryExecution onExecute !\n"); +#endif + return NO_ERROR; +} + } // namespace OpenCL } // namespace MNN #endif /* MNN_OPENCL_BUFFER_CLOSED */ diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp index 8488f461b..5e04ac1aa 100644 --- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp +++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.hpp @@ -21,25 +21,30 @@ class ConvBufLowMemoryExecution : public ConvBufCommonExecution, public CommonEx ConvBufLowMemoryExecution(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend); ConvBufLowMemoryExecution(std::shared_ptr resource, const MNN::Op* op, Backend* backend); virtual ~ConvBufLowMemoryExecution(); - virtual ErrorCode onEncode(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: void getInfoFromOpLowMemory(std::shared_ptr & quanCommon); void set1x1WeightLowMemory(int packCout, int packCin, void * filterDataPtr, std::shared_ptr & quanCommon); void setGeneralWeightLowMemory(void * filterDataPtr, std::shared_ptr & quanCommon); void tuneGeneralCaseLowMemory(Tensor * input, Tensor * output); - unsigned int tuneGemmLowMemory(Tensor * input, Tensor * output); - unsigned int tuneGemvBatchLowMemory(Tensor * input, Tensor * output); + void useFPWeightGemmLowMemory(Tensor * input, Tensor * output); + void tuneGemvLowMemory(Tensor * input, Tensor * output); + unsigned int tuneGemmLowMemory(Tensor * input, Tensor * output, std::string option, bool onlyGetTime = false); bool convertToQuantWeight1x1Buffer(cl::Buffer input, int pack); std::vector mPaddings{0, 0}; std::vector mGlobalWorkSize{1, 1, 1}; std::vector mLocalWorkSize{1, 1, 1, 1}; void *mFilterDataPtr = nullptr; bool mLowMemoryFlag = false; + bool mUseFPWeight = false; std::shared_ptr mConvGemmInpTensor; std::shared_ptr mConvGemmOutTensor; + std::shared_ptr mConvGemmWeightTensor; std::shared_ptr mBufferToConv1x1Kernel = nullptr; uint32_t batchConvMode = 0; // batch > 1 convolution input arrage mode. 
0 is need tune; 1 arrage to n/4chw4; 2 arrage to c/4hwn4 + std::shared_ptr mStrassenComputor; }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp b/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp index c7b7fc644..bcabf60f6 100644 --- a/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufWinograd.cpp @@ -35,11 +35,11 @@ bool ConvBufWinograd::valid(const Convolution2DCommon* common, const Tensor* inp return valid; } -void ConvBufWinograd::convertWeightFormat(cl::Buffer& buffer, const int tileK, const int tileN) { +void ConvBufWinograd::convertWeightFormat(cl::Buffer& buffer, const int alignK, const int alignN) { auto runtime = mOpenCLBackend->getOpenCLRuntime(); - auto icPad = ROUND_UP(mCi, tileK); - auto ocPad = ROUND_UP(mCo, tileN); + auto icPad = ROUND_UP(mCi, alignK); + auto ocPad = ROUND_UP(mCo, alignN); auto kernel = runtime->buildKernel("winogradTransform_buf", "winoTransWeightBuf2_3_1", {}); uint32_t gws[2] = {static_cast(icPad), static_cast(ocPad)}; @@ -205,15 +205,22 @@ ConvBufWinograd::ConvBufWinograd(const MNN::Op* op, Backend* backend) : CommonEx int kernelSize = kx; int alpha = unit + kernelSize - 1; - int tileK = 4; - int tileN = 32; + mResource->mAlignK = 4; + mResource->mAlignN = 16; + if(mCo > 1024) { + mResource->mAlignN = 128; + } else if(mCo > 256) { + mResource->mAlignN = 64; + } else if(mCo > 64) { + mResource->mAlignN = 32; + } std::shared_ptr tmpFilterTensor; tmpFilterTensor.reset(Tensor::createDevice({mCo * mCi * ky * kx})); mOpenCLBackend->onAcquireBuffer(tmpFilterTensor.get(), Backend::DYNAMIC); mOpenCLBackend->onReleaseBuffer(tmpFilterTensor.get(), Backend::DYNAMIC); - mResource->mWeight.reset(Tensor::createDevice({alpha * alpha * ROUND_UP(mCo, tileN) * ROUND_UP(mCi, tileK)}));//NHWC + mResource->mWeight.reset(Tensor::createDevice({alpha * alpha * ROUND_UP(mCo, mResource->mAlignN) * ROUND_UP(mCi, mResource->mAlignK)}));//NHWC mOpenCLBackend->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); buffer_size = mCo * mCi * ky * kx * sizeof(float); @@ -228,7 +235,7 @@ ConvBufWinograd::ConvBufWinograd(const MNN::Op* op, Backend* backend) : CommonEx } mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(weightBufferCL, ptrCL); - convertWeightFormat(weightBufferCL, tileK, tileN); + convertWeightFormat(weightBufferCL, mResource->mAlignK, mResource->mAlignN); } } @@ -277,7 +284,8 @@ ErrorCode ConvBufWinograd::SubgroupOnResize(const std::vector &inputs, auto icC4 = UP_DIV(input->channel(), 4); auto icC16 = UP_DIV(input->channel(), 16); auto ocC4 = UP_DIV(output->channel(), 4); - auto ocC16 = UP_DIV(output->channel(), 16); + auto ocC16 = UP_DIV(output->channel(), 16); + auto batch = output->batch(); auto inputpad = TensorUtils::getDescribe(input)->mPads; auto outputpad = TensorUtils::getDescribe(output)->mPads; int in_c_pack = TensorUtils::getTensorChannelPack(input); @@ -316,7 +324,7 @@ ErrorCode ConvBufWinograd::SubgroupOnResize(const std::vector &inputs, } } - for (int b = 0; b < input->batch(); ++b) { + for (int b = 0; b < batch; ++b) { int hCount = hUnit; int wCount = wUnit; int width_pack = ROUND_UP(hCount * wCount, 8); @@ -340,6 +348,7 @@ ErrorCode ConvBufWinograd::SubgroupOnResize(const std::vector &inputs, ret |= mUnits[b * 3].kernel->get().setArg(index++, icC16); ret |= mUnits[b * 3].kernel->get().setArg(index++, width_pack); ret |= mUnits[b * 3].kernel->get().setArg(index++, b); + ret |= mUnits[b * 
3].kernel->get().setArg(index++, batch); ret |= mUnits[b * 3].kernel->get().setArg(index++, static_cast(inputpad.left)); ret |= mUnits[b * 3].kernel->get().setArg(index++, static_cast(inputpad.right)); MNN_CHECK_CL_SUCCESS(ret, "setArg ConvWinogradBuf Source Trans"); @@ -400,6 +409,7 @@ ErrorCode ConvBufWinograd::SubgroupOnResize(const std::vector &inputs, ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, ocC16); ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, width_pack); ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, b); + ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, batch); ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, static_cast(outputpad.left)); ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, static_cast(outputpad.right)); MNN_CHECK_CL_SUCCESS(ret, "setArg ConvWinogradBuf Dest Trans"); @@ -458,13 +468,21 @@ ErrorCode ConvBufWinograd::onEncode(const std::vector& inputs, const st } else #endif /* MNN_SUPPORT_INTEL_SUBGROUP */ { - int tileM = 16; - int tileN = 32; - int tileK = 4; + mAlignM = 16; + float ratio = 1.0 * alpha * alpha * wUnit * hUnit / 1024.0 * input->channel() / 1024.0 * output->channel() / 1024.0; + if (wUnit * hUnit > 512 && ratio > 1.0) { + mAlignM = 128; + } else if (wUnit * hUnit > 256 && ratio > 0.1) { + mAlignM = 64; + } else if (wUnit * hUnit > 64) { + mAlignM = 32; + } + int mAlignK = mResource->mAlignK; + int mAlignN = mResource->mAlignN; mSource.reset(Tensor::createDevice( - std::vector{alpha * alpha * ROUND_UP(input->channel(), tileK) * ROUND_UP(wUnit * hUnit, tileM)})); + std::vector{alpha * alpha * ROUND_UP(input->channel(), mAlignK) * ROUND_UP(wUnit * hUnit, mAlignM)})); mDest.reset(Tensor::createDevice( - std::vector{alpha * alpha * ROUND_UP(wUnit * hUnit, tileM) * ROUND_UP(output->channel(), tileN)})); + std::vector{alpha * alpha * ROUND_UP(wUnit * hUnit, mAlignM) * ROUND_UP(output->channel(), mAlignN)})); mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC); mOpenCLBackend->onAcquireBuffer(mDest.get(), Backend::DYNAMIC); @@ -498,9 +516,9 @@ ErrorCode ConvBufWinograd::onEncode(const std::vector& inputs, const st int hCount = hUnit; int wCount = wUnit; - int M_pack = ROUND_UP(wCount * hCount, tileM); - int K_pack = ROUND_UP(input->channel(), tileK); - int N_pack = ROUND_UP(output->channel(), tileN); + int M_pack = ROUND_UP(wCount * hCount, mAlignM); + int K_pack = ROUND_UP(input->channel(), mAlignK); + int N_pack = ROUND_UP(output->channel(), mAlignN); for (int b = 0; b < input->batch(); ++b) { // Source Transform @@ -521,6 +539,7 @@ ErrorCode ConvBufWinograd::onEncode(const std::vector& inputs, const st ret |= mUnits[b * 3].kernel->get().setArg(index++, icC4); ret |= mUnits[b * 3].kernel->get().setArg(index++, M_pack); ret |= mUnits[b * 3].kernel->get().setArg(index++, K_pack); + ret |= mUnits[b * 3].kernel->get().setArg(index++, input->batch()); ret |= mUnits[b * 3].kernel->get().setArg(index++, b); MNN_CHECK_CL_SUCCESS(ret, "setArg ConvWinogradBuf Source Trans"); @@ -535,9 +554,9 @@ ErrorCode ConvBufWinograd::onEncode(const std::vector& inputs, const st { int loop = alpha * alpha; - int e_pack = ROUND_UP(wCount * hCount, tileM); - int l_pack = ROUND_UP(input->channel(), tileK); - int h_pack = ROUND_UP(output->channel(), tileN); + int e_pack = ROUND_UP(wCount * hCount, mAlignM); + int l_pack = ROUND_UP(input->channel(), mAlignK); + int h_pack = ROUND_UP(output->channel(), mAlignN); std::set buildOptions; uint32_t layout = 4; @@ -586,6 +605,10 @@ ErrorCode ConvBufWinograd::onEncode(const 
std::vector& inputs, const st int batch_offset_b = h_pack * l_pack; int batch_offset_c = e_pack * h_pack; + int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; + int stride[4] = {e_pack, h_pack, h_pack, h_pack}; + int group[4] = {1, 1, 1, loop}; + int idx = 0; cl_int ret = CL_SUCCESS; ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, static_cast(e_pack)); @@ -594,11 +617,11 @@ ErrorCode ConvBufWinograd::onEncode(const std::vector& inputs, const st ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, alpha); ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, beta); ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, openCLBuffer(mSource.get())); - ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, batch_offset_a); ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, openCLBuffer(mResource->mWeight.get())); - ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, batch_offset_b); ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, openCLBuffer(mDest.get())); - ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, batch_offset_c); + ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, batch_offset); + ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, stride); + ret |= mUnits[b * 3 + 1].kernel->get().setArg(idx++, group); MNN_CHECK_CL_SUCCESS(ret, "setArg Winograd batchmatmul Kernel"); mOpenCLBackend->recordKernel3d(mUnits[b * 3 + 1].kernel, mGWS_M[b], mLWS_M[b]); @@ -624,6 +647,7 @@ ErrorCode ConvBufWinograd::onEncode(const std::vector& inputs, const st ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, ocC4); ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, M_pack); ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, N_pack); + ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, input->batch()); ret |= mUnits[b * 3 + 2].kernel->get().setArg(index++, b); MNN_CHECK_CL_SUCCESS(ret, "setArg ConvWinogradBuf Dest Trans"); diff --git a/source/backend/opencl/execution/buffer/ConvBufWinograd.hpp b/source/backend/opencl/execution/buffer/ConvBufWinograd.hpp index e200fc2ef..cec80e347 100644 --- a/source/backend/opencl/execution/buffer/ConvBufWinograd.hpp +++ b/source/backend/opencl/execution/buffer/ConvBufWinograd.hpp @@ -22,6 +22,8 @@ struct ConvBufWinoResource { bool mUseSubgroup{false}; std::shared_ptr mWeight; std::shared_ptr mBias; + int mAlignN; + int mAlignK; }; class ConvBufWinograd : public CommonExecution { @@ -41,7 +43,7 @@ class ConvBufWinograd : public CommonExecution { #endif /* MNN_SUPPORT_INTEL_SUBGROUP */ private: - void convertWeightFormat(cl::Buffer& buffer, const int tileK, const int tileN); + void convertWeightFormat(cl::Buffer& buffer, const int alignK, const int alignN); private: OpenCLBackend* mOpenCLBackend; std::shared_ptr mResource; @@ -66,6 +68,8 @@ class ConvBufWinograd : public CommonExecution { std::vector > mLWS_S; std::vector > mLWS_D; std::vector > mLWS_M; +private: + int mAlignM; }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp index 70685e7bb..dd4201f12 100644 --- a/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvSubgroupBufExecution.cpp @@ -258,6 +258,7 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s std::vector outputShape = tensorShapeFormat(output); int in_c_pack = TensorUtils::getTensorChannelPack(input); int out_c_pack = TensorUtils::getTensorChannelPack(output); + const int batch = 
outputShape.at(0); const int height = outputShape.at(1); const int width = outputShape.at(2); const int outChannel = outputShape.at(3); @@ -266,8 +267,6 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s const int inputWidth = inputShape.at(2); const int inputChannels = inputShape.at(3); - int input_width_pad = mResource->mStrides[1] * (8 - 1) + (mResource->mKernelWidth - 1) * mResource->mDilations[1] + 1 + width * mResource->mStrides[1] + mPaddings[1]; - int input_height_pad = inputHeight + 2 * mPaddings[0]; uint32_t MaxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->MaxWorkGroupSize()); uint32_t MaxThreadsPerDevice = static_cast(mOpenCLBackend->getOpenCLRuntime()->MaxThreadsPerDevice()); bool isSupportedFP16 = mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16(); @@ -280,6 +279,8 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s int strideShape[2] = {mResource->mStrides[0], mResource->mStrides[1]}; int paddingShape[2] = {mPaddings[0], mPaddings[1]}; int dilationShape[2] = {mResource->mDilations[0], mResource->mDilations[1]}; + int trans_pad_x = inputpad.left; + int trans_pad_y = inputpad.right; auto tune_param = GetTuningParams(inputs, outputs, MaxWorkGroupSize, isSupportedFP16, MaxThreadsPerDevice); uint32_t blockWidth = tune_param.first; uint32_t sub_group_size = 16; @@ -318,14 +319,17 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s unit.kernel->get().setArg(idx++, static_cast(inputWidth)); unit.kernel->get().setArg(idx++, static_cast(inputHeight)); unit.kernel->get().setArg(idx++, static_cast(inputChannels)); + unit.kernel->get().setArg(idx++, static_cast(batch)); unit.kernel->get().setArg(idx++, UP_DIV(inputShape.at(3), 4)); - unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); - unit.kernel->get().setArg(idx++, static_cast(inputpad.right)); + unit.kernel->get().setArg(idx++, static_cast(trans_pad_x)); + unit.kernel->get().setArg(idx++, static_cast(trans_pad_y)); mTranseLocalWorkSize = localWS3DDefault(mTranseGlobalWorkSize, mMaxWGS_S, mOpenCLBackend->getOpenCLRuntime(), "conv_transe_c4_c1", unit.kernel).first; mOpenCLBackend->recordKernel3d(unit.kernel, mTranseGlobalWorkSize, mTranseLocalWorkSize); } else { - mSource.reset(Tensor::createDevice(std::vector{inputShape.at(0), UP_DIV(input->channel(), 16),inputHeight * inputWidth, 16}, Tensor::CAFFE_C4)); + trans_pad_x = std::max(inputpad.left, mPaddings[1]); + trans_pad_y = std::max(inputpad.right, mPaddings[1]); + mSource.reset(Tensor::createDevice(std::vector{inputShape.at(0), UP_DIV(input->channel(), 16),inputHeight * (inputWidth + trans_pad_x + trans_pad_y), 16}, Tensor::CAFFE_C4)); mOpenCLBackend->onAcquireBuffer(mSource.get(), Backend::DYNAMIC); mOpenCLBackend->onReleaseBuffer(mSource.get(), Backend::DYNAMIC); unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("input_transe_buf", "conv_transe_c4_c16", {}); @@ -344,9 +348,10 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s unit.kernel->get().setArg(idx++, static_cast(inputWidth)); unit.kernel->get().setArg(idx++, static_cast(inputHeight)); unit.kernel->get().setArg(idx++, static_cast(inputChannels)); + unit.kernel->get().setArg(idx++, static_cast(batch)); unit.kernel->get().setArg(idx++, UP_DIV(inputShape.at(3), 4)); - unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); - unit.kernel->get().setArg(idx++, static_cast(inputpad.right)); + unit.kernel->get().setArg(idx++, static_cast(trans_pad_x)); + unit.kernel->get().setArg(idx++, 
static_cast(trans_pad_y)); mTranseLocalWorkSize = localWS3DDefault(mTranseGlobalWorkSize, mMaxWGS_S, mOpenCLBackend->getOpenCLRuntime(), "conv_transe_c4_c16", unit.kernel).first; mOpenCLBackend->recordKernel3d(unit.kernel, mTranseGlobalWorkSize, mTranseLocalWorkSize); @@ -402,9 +407,10 @@ ErrorCode ConvSubgroupBuf::onEncode(const std::vector &inputs, const s unit.kernel->get().setArg(idx++, static_cast(width)); unit.kernel->get().setArg(idx++, static_cast(height)); unit.kernel->get().setArg(idx++, static_cast(outChannel)); + unit.kernel->get().setArg(idx++, static_cast(batch)); unit.kernel->get().setArg(idx++, static_cast(x_blocks)); - unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); - unit.kernel->get().setArg(idx++, static_cast(inputpad.right)); + unit.kernel->get().setArg(idx++, static_cast(trans_pad_x)); + unit.kernel->get().setArg(idx++, static_cast(trans_pad_y)); unit.kernel->get().setArg(idx++, static_cast(outputpad.left)); unit.kernel->get().setArg(idx++, static_cast(outputpad.right)); #ifdef LOG_VERBOSE diff --git a/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp b/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp index 096594ebc..4fd445067 100644 --- a/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/DeconvBufExecution.cpp @@ -153,6 +153,7 @@ ErrorCode DeconvBufExecution::onEncode(const std::vector &inputs, cons unit.kernel->get().setArg(idx++, openCLBuffer(mResource->mFilter.get())); unit.kernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); unit.kernel->get().setArg(idx++, openCLBuffer(output)); + unit.kernel->get().setArg(idx++, static_cast(outputBatch)); unit.kernel->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); unit.kernel->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); unit.kernel->get().setArg(idx++, sizeof(strideShape), strideShape); diff --git a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp index 5bc18f9ff..44af28f35 100644 --- a/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/DepthwiseConvBufExecution.cpp @@ -108,7 +108,8 @@ ErrorCode DepthwiseConvBufExecution::onEncode(const std::vector &input const int outputHeight = outputShape.at(1); const int outputWidth = outputShape.at(2); const int outputChannel = outputShape.at(3); - + + const int batch = inputShape.at(0); const int inputHeight = inputShape.at(1); const int inputWidth = inputShape.at(2); const int inputChannels = inputShape.at(3); @@ -173,7 +174,7 @@ ErrorCode DepthwiseConvBufExecution::onEncode(const std::vector &input ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(output)); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(inputChannels)); + ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(batch)); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(paddingShape), paddingShape); @@ -206,7 +207,7 @@ ErrorCode DepthwiseConvBufExecution::onEncode(const std::vector &input ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); ret |= 
unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); - ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannels)); + ret |= unit.kernel->get().setArg(idx++, static_cast(batch)); ret |= unit.kernel->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= unit.kernel->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= unit.kernel->get().setArg(idx++, sizeof(paddingShape), paddingShape); @@ -249,7 +250,7 @@ ErrorCode DepthwiseConvBufExecution::onEncode(const std::vector &input ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); ret |= kernel[knl_idx]->get().setArg(idx++, openCLBuffer(output)); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); - ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(inputChannels)); + ret |= kernel[knl_idx]->get().setArg(idx++, static_cast(batch)); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= kernel[knl_idx]->get().setArg(idx++, sizeof(paddingShape), paddingShape); @@ -283,7 +284,7 @@ ErrorCode DepthwiseConvBufExecution::onEncode(const std::vector &input ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mResource->mBias.get())); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, sizeof(inputImageShape), inputImageShape); - ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannels)); + ret |= unit.kernel->get().setArg(idx++, static_cast(batch)); ret |= unit.kernel->get().setArg(idx++, sizeof(outputImageShape), outputImageShape); ret |= unit.kernel->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= unit.kernel->get().setArg(idx++, sizeof(paddingShape), paddingShape); diff --git a/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp b/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp index 90bcb5c36..c7db48719 100644 --- a/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/DepthwiseConvSubgroupBufExecution.cpp @@ -178,7 +178,8 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vectormConv2dCommonParams); mPaddings[0] = padding.second;//padY mPaddings[1] = padding.first;//padX - + + const int batch = outputShape.at(0); const int outputHeight = outputShape.at(1); const int outputWidth = outputShape.at(2); const int outputChannel = outputShape.at(3); @@ -201,6 +202,8 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vectormPads; int input_c_pack = TensorUtils::getTensorChannelPack(input); int output_c_pack = TensorUtils::getTensorChannelPack(output); + int trans_pad_x = inputpad.left; + int trans_pad_y = inputpad.right; std::set buildOptions = mResource->mBuildOptions; buildOptions.emplace("-DFILTER_HEIGHT=" + std::to_string(kernelShape[0])); @@ -210,9 +213,11 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vectorrecordKernel3d(unit.kernel, mTranseGlobalWorkSize, mTranseLocalWorkSize); @@ -265,8 +271,9 @@ ErrorCode DepthwiseConvSubgroupBufExecution::onEncode(const std::vectorget().setArg(idx++, static_cast(inputHeight)); unit.kernel->get().setArg(idx++, static_cast(inputWidth)); unit.kernel->get().setArg(idx++, static_cast(inputChannels)); - unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); - 
unit.kernel->get().setArg(idx++, static_cast(inputpad.right)); + unit.kernel->get().setArg(idx++, static_cast(batch)); + unit.kernel->get().setArg(idx++, static_cast(trans_pad_x)); + unit.kernel->get().setArg(idx++, static_cast(trans_pad_y)); unit.kernel->get().setArg(idx++, static_cast(outputHeight)); unit.kernel->get().setArg(idx++, static_cast(outputWidth)); unit.kernel->get().setArg(idx++, static_cast(outputpad.left)); diff --git a/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp b/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp index e4091b0ea..50b7fd25e 100644 --- a/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp @@ -76,7 +76,7 @@ ErrorCode GridSampleBufExecution::onEncode(const std::vector &inputs, ret |= unit.kernel->get().setArg(idx++, static_cast(inW)); ret |= unit.kernel->get().setArg(idx++, static_cast(outH)); ret |= unit.kernel->get().setArg(idx++, static_cast(outW)); - ret |= unit.kernel->get().setArg(idx++, static_cast(channelC4)); + ret |= unit.kernel->get().setArg(idx++, static_cast(batches)); ret |= unit.kernel->get().setArg(idx++, mPaddingMode); ret |= unit.kernel->get().setArg(idx++, mAlignCorners); MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleBufExecution"); diff --git a/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp b/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp index 92485742b..1865696ea 100644 --- a/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/GroupNormBufExecution.cpp @@ -117,47 +117,12 @@ ErrorCode GroupNormBufExecution::onEncode(const std::vector& inputs, co inner_size /= mGroup; mUnits.clear(); - mUnits.resize(3); + mUnits.resize(1); std::vector inputShape = tensorShapeFormat(inputs[0]); int inputWH[] = {inputShape[2], inputShape[1]}; int region[] = {inputShape[0], UP_DIV(inputShape[3], 4), inputShape[1], inputShape[2]}; - mInputPlain = std::make_shared(Tensor::createDevice(std::vector{inputShape[0] * inputShape[3] * ROUND_UP(inputShape[1] * inputShape[2], 4)})); - mOpenCLBackend->onAcquireBuffer(mInputPlain.get(), Backend::DYNAMIC); - mOutputPlain = std::make_shared(Tensor::createDevice(std::vector{inputShape[0] * inputShape[3] * ROUND_UP(inputShape[1] * inputShape[2], 4)})); - mOpenCLBackend->onAcquireBuffer(mOutputPlain.get(), Backend::DYNAMIC); - - mOpenCLBackend->onReleaseBuffer(mInputPlain.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mOutputPlain.get(), Backend::DYNAMIC); std::set buildOptions; - // convert nc4hw4 to nchw - { - auto &unit = mUnits[0]; - unit.kernel = runtime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nchw_buffer", {}, inputs[0], outputs[0]); - - mGWS = {(uint32_t)(UP_DIV(region[3] * region[1], 16) * 16), - (uint32_t)(UP_DIV(region[2] * region[0], 16) * 16)}; - mLWS = {16, 16}; - unit.globalWorkSize = {mGWS[0], mGWS[1]}; - unit.localWorkSize = {mLWS[0], mLWS[1]}; - - int global_dim0 = region[3] * region[1]; - int global_dim1 = region[2] * region[0]; - - //MNN_CHECK_CL_SUCCESS - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, global_dim0); - ret |= unit.kernel->get().setArg(idx++, global_dim1); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mInputPlain.get())); - ret |= unit.kernel->get().setArg(idx++, inputWH[1]); - ret |= unit.kernel->get().setArg(idx++, inputWH[0]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); - ret |= 
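// Note: several of the depthwise-conv / grid-sample argument lists above swap the channel
// (or channel-block) count for the batch size. A minimal sketch of the bookkeeping this
// implies on the kernel side, assuming the third NDRange dimension still packs
// batch * channelBlocks and the kernel now derives the split from `batch`; the packing
// order shown (z = cb * batch + b) is an assumption, not taken from the kernel sources.
struct BatchChannelIdx {
    int batch;         // index within the batch dimension
    int channelBlock;  // index of the 4-channel block
};

inline BatchChannelIdx splitPackedZ(int globalZ, int batchSize) {
    BatchChannelIdx idx;
    idx.batch        = globalZ % batchSize;
    idx.channelBlock = globalZ / batchSize;
    return idx;
}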
unit.kernel->get().setArg(idx++, openCLBuffer(input)); - MNN_CHECK_CL_SUCCESS(ret, "setArg GroupNormBufExecution with group, convert nc4hw4 to nchw"); - - mOpenCLBackend->recordKernel2d(unit.kernel, mGWS, mLWS); - } // do groupnorm { int area = inputWH[1] * inputWH[0]; @@ -175,7 +140,7 @@ ErrorCode GroupNormBufExecution::onEncode(const std::vector& inputs, co } auto MaxLocalSize = std::min(runtime->getMaxWorkItemSizes()[0], (uint32_t)256); - auto &unit = mUnits[1]; + auto &unit = mUnits[0]; std::string kernelName = "groupnorm_plain_buf"; int local_size = getLocalSize(UP_DIV(inner_size, 4), MaxLocalSize); buildOptions.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); @@ -195,11 +160,11 @@ ErrorCode GroupNormBufExecution::onEncode(const std::vector& inputs, co ret |= unit.kernel->get().setArg(idx++, mGWS[0]); ret |= unit.kernel->get().setArg(idx++, mGWS[1]); ret |= unit.kernel->get().setArg(idx++, mGWS[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mInputPlain.get())); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); if(inputs.size() > 1) { ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputs[1])); } - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mOutputPlain.get())); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, static_cast(area)); ret |= unit.kernel->get().setArg(idx++, static_cast(mGroup)); ret |= unit.kernel->get().setArg(idx++, static_cast(inner_size)); @@ -212,33 +177,6 @@ ErrorCode GroupNormBufExecution::onEncode(const std::vector& inputs, co MNN_CHECK_CL_SUCCESS(ret, "setArg GroupNormBufExecution with group, do group layernorm"); mOpenCLBackend->recordKernel3d(unit.kernel, mGWS, mLWS); } - // convert nchw to nc4hw4 - { - auto &unit = mUnits[2]; - - unit.kernel = runtime->buildKernel("buffer_convert_buf", "nchw_buffer_to_nc4hw4_buffer", {}, inputs[0], outputs[0]); - mLWS = {16, 16}; - mGWS = {(uint32_t)UP_DIV(region[3] * region[1], 16) * 16, - (uint32_t)UP_DIV(region[2] * region[0], 16) * 16}; - - unit.globalWorkSize = {mGWS[0], mGWS[1]}; - unit.localWorkSize = {mLWS[0], mLWS[1]}; - - int global_dim0 = region[3] * region[1]; - int global_dim1 = region[2] * region[0]; - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, global_dim0); - ret |= unit.kernel->get().setArg(idx++, global_dim1); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mOutputPlain.get())); - ret |= unit.kernel->get().setArg(idx++, inputWH[1]); - ret |= unit.kernel->get().setArg(idx++, inputWH[0]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - MNN_CHECK_CL_SUCCESS(ret, "setArg GroupNormBufExecution with group, convert nchw to nc4hw4"); - mOpenCLBackend->recordKernel2d(unit.kernel, mGWS, mLWS); - } mOpenCLBackend->endRecord(mRecording); return NO_ERROR; diff --git a/source/backend/opencl/execution/buffer/GroupNormBufExecution.hpp b/source/backend/opencl/execution/buffer/GroupNormBufExecution.hpp index bf569f983..2076c2780 100644 --- a/source/backend/opencl/execution/buffer/GroupNormBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/GroupNormBufExecution.hpp @@ -31,7 +31,6 @@ class GroupNormBufExecution : public CommonExecution { int32_t mBatch; std::unique_ptr mGammaTensor; std::unique_ptr mBetaTensor; - std::shared_ptr mInputPlain, mOutputPlain; bool mHasGammaBeta = false; std::vector mLWS{0, 0, 0, 0}; std::vector mGWS{0, 0, 0, 0}; diff --git 
a/source/backend/opencl/execution/buffer/Interp3DBufExecution.cpp b/source/backend/opencl/execution/buffer/Interp3DBufExecution.cpp index 191c2fabe..fff8ef02f 100644 --- a/source/backend/opencl/execution/buffer/Interp3DBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/Interp3DBufExecution.cpp @@ -86,7 +86,7 @@ ErrorCode Interp3DBufExecution::onEncode(const std::vector &inputs, co ret |= unit.kernel->get().setArg(idx++, static_cast(outputDepth)); ret |= unit.kernel->get().setArg(idx++, static_cast(outputHeight)); ret |= unit.kernel->get().setArg(idx++, static_cast(outputWidth)); - ret |= unit.kernel->get().setArg(idx++, static_cast(channelBlocks)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inputBatch)); MNN_CHECK_CL_SUCCESS(ret, "setArg Interp3DBufExecution"); mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; diff --git a/source/backend/opencl/execution/buffer/InterpBufExecution.cpp b/source/backend/opencl/execution/buffer/InterpBufExecution.cpp index 00ab7de08..061dacbd0 100644 --- a/source/backend/opencl/execution/buffer/InterpBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/InterpBufExecution.cpp @@ -80,7 +80,7 @@ ErrorCode InterpBufExecution::onEncode(const std::vector &inputs, cons ret |= unit.kernel->get().setArg(idx++, static_cast(inputWidth)); ret |= unit.kernel->get().setArg(idx++, static_cast(outputHeight)); ret |= unit.kernel->get().setArg(idx++, static_cast(outputWidth)); - ret |= unit.kernel->get().setArg(idx++, static_cast(channelBlocks)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inputBatch)); MNN_CHECK_CL_SUCCESS(ret, "setArg InterpBufExecution"); mLWS = localWS3DDefault(mGWS, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; diff --git a/source/backend/opencl/execution/buffer/LayerNormBufExecution.cpp b/source/backend/opencl/execution/buffer/LayerNormBufExecution.cpp index 0f6b3f629..100fe2db2 100644 --- a/source/backend/opencl/execution/buffer/LayerNormBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/LayerNormBufExecution.cpp @@ -24,7 +24,7 @@ LayerNormBufExecution::LayerNormBufExecution(const std::vector &inputs group_ = layer_norm_param->group(); RMSNorm = layer_norm_param->useRMSNorm(); auto bufferUnitSize = runtime->isSupportedFP16() ? 
sizeof(half_float::half) : sizeof(float); - auto kernel = runtime->buildKernel("layernorm_buf", "layernorm_w_buf", {"-DLOCAL_SIZE=512"}); + auto kernel = runtime->buildKernel("layernorm_buf", "layernorm_buf", {"-DLOCAL_SIZE=512"}); mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(kernel)); if(layer_norm_param->gamma() && layer_norm_param->beta()){ @@ -99,11 +99,6 @@ ErrorCode LayerNormBufExecution::onEncode(const std::vector &inputs, c std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); - const int inputBatch = inputShape[0]; - const int inputHeight = inputShape[1]; - const int inputWidth = inputShape[2]; - const int inputChannels = inputShape[3]; - int local_size; int rank = inputs.at(0)->dimensions(); int outter_size = 1; int inner_size = 1; @@ -122,169 +117,40 @@ ErrorCode LayerNormBufExecution::onEncode(const std::vector &inputs, c } inner_size /= group_; } -// printf("out:%d in:%d, %d %d %d %d, %d\n", outter_size, inner_size, inputBatch, inputHeight, inputWidth, inputChannels, group_); + + int local_size = getLocalSize(inner_size / 4, MaxLocalSize); std::set buildOptions; + buildOptions.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); if(RMSNorm){ buildOptions.emplace("-DRMSNORM"); } if(has_gamma_beta_){ buildOptions.emplace("-DGAMMA_BETA"); } - std::string kernelName; - if (inner_size == inputWidth && outter_size == inputBatch * inputHeight * inputChannels) { - kernelName = "layernorm_w_buf"; - local_size = getLocalSize(inputWidth, MaxLocalSize); - buildOptions.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - unit.kernel = runtime->buildKernel("layernorm_buf", kernelName, buildOptions); - - mGWS = {static_cast(local_size), - static_cast(inputHeight * UP_DIV(inputChannels, 4)), - static_cast(inputBatch)}; - }else if(inner_size == inputWidth * inputHeight && outter_size == inputBatch * inputChannels){ - kernelName = "layernorm_hw_buf"; - local_size = getLocalSize(inputWidth * inputHeight, MaxLocalSize); - buildOptions.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - unit.kernel = runtime->buildKernel("layernorm_buf", kernelName, buildOptions); - - mGWS = {static_cast(local_size), - static_cast(UP_DIV(inputChannels, 4)), - static_cast(inputBatch)}; - }else if(inner_size == inputWidth * inputHeight * inputChannels && outter_size == inputBatch){ - kernelName = "layernorm_chw_buf"; - local_size = getLocalSize(inputWidth * inputHeight, MaxLocalSize); - buildOptions.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - unit.kernel = runtime->buildKernel("layernorm_buf", kernelName, buildOptions); - - mGWS = {static_cast(local_size), - static_cast(1), - static_cast(inputBatch)}; - } else if(inner_size == inputWidth * inputHeight * inputChannels / group_ && outter_size == inputBatch * group_){ - mUnits.clear(); - mUnits.resize(3); - std::vector inputShape = tensorShapeFormat(inputs[0]); - int inputWH[] = {inputShape[2], inputShape[1]}; - int region[] = {inputShape[0], UP_DIV(inputShape[3], 4), inputShape[1], inputShape[2]}; - - mInputPlain = std::make_shared(Tensor::createDevice(std::vector{inputShape[0], inputShape[3], ROUND_UP(inputShape[1] * inputShape[2], 4), 1}, Tensor::CAFFE)); - mOpenCLBackend->onAcquireBuffer(mInputPlain.get(), Backend::DYNAMIC); - mOutputPlain = std::make_shared(Tensor::createDevice(std::vector{inputShape[0], inputShape[3], ROUND_UP(inputShape[1] * inputShape[2], 4), 1}, Tensor::CAFFE)); - mOpenCLBackend->onAcquireBuffer(mOutputPlain.get(), Backend::DYNAMIC); - - // convert nc4hw4 
to nchw - { - auto &unit = mUnits[0]; - unit.kernel = runtime->buildKernel("buffer_convert_buf", "nc4hw4_buffer_to_nchw_buffer", {}, inputs[0], outputs[0]); - - mGWS = {(uint32_t)(UP_DIV(region[3] * region[1], 16) * 16), - (uint32_t)(UP_DIV(region[2] * region[0], 16) * 16)}; - mLWS = {16, 16}; - unit.globalWorkSize = {mGWS[0], mGWS[1]}; - unit.localWorkSize = {mLWS[0], mLWS[1]}; - - int global_dim0 = region[3] * region[1]; - int global_dim1 = region[2] * region[0]; - - //MNN_CHECK_CL_SUCCESS - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, global_dim0); - ret |= unit.kernel->get().setArg(idx++, global_dim1); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mInputPlain.get())); - ret |= unit.kernel->get().setArg(idx++, inputWH[1]); - ret |= unit.kernel->get().setArg(idx++, inputWH[0]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); - MNN_CHECK_CL_SUCCESS(ret, "setArg LayerNormBufExecution with group, convert nc4hw4 to nchw"); - - mOpenCLBackend->recordKernel2d(unit.kernel, mGWS, mLWS); - } - // do group layernorm - { - auto &unit = mUnits[1]; - kernelName = "layernorm_plain_buf"; - local_size = getLocalSize(UP_DIV(inner_size, 4), MaxLocalSize); - buildOptions.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - unit.kernel = runtime->buildKernel("layernorm_buf", kernelName, buildOptions); - - mGWS = {static_cast(local_size), - static_cast(1), - static_cast(outter_size)}; - - mLWS = {static_cast(local_size), 1, 1}; - - unit.globalWorkSize = {mGWS[0], mGWS[1], mGWS[2]}; - unit.localWorkSize = {mLWS[0], mLWS[1], mLWS[2]}; - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, mGWS[0]); - ret |= unit.kernel->get().setArg(idx++, mGWS[1]); - ret |= unit.kernel->get().setArg(idx++, mGWS[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mInputPlain.get())); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mOutputPlain.get())); - ret |= unit.kernel->get().setArg(idx++, static_cast(inner_size)); - ret |= unit.kernel->get().setArg(idx++, static_cast(outter_size)); - if(has_gamma_beta_){ - ret |= unit.kernel->get().setArg(idx++, *mGammaBuffer.get()); - ret |= unit.kernel->get().setArg(idx++, *mBetaBuffer.get()); - } - ret |= unit.kernel->get().setArg(idx++, epsilon_); - MNN_CHECK_CL_SUCCESS(ret, "setArg LayerNormBufExecution with group, do group layernorm"); - mOpenCLBackend->recordKernel3d(unit.kernel, mGWS, mLWS); - } - // convert nchw to nc4hw4 - { - auto &unit = mUnits[2]; - - unit.kernel = runtime->buildKernel("buffer_convert_buf", "nchw_buffer_to_nc4hw4_buffer", {}, inputs[0], outputs[0]); - mLWS = {16, 16}; - mGWS = {(uint32_t)UP_DIV(region[3] * region[1], 16) * 16, - (uint32_t)UP_DIV(region[2] * region[0], 16) * 16}; - - unit.globalWorkSize = {mGWS[0], mGWS[1]}; - unit.localWorkSize = {mLWS[0], mLWS[1]}; - - int global_dim0 = region[3] * region[1]; - int global_dim1 = region[2] * region[0]; - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, global_dim0); - ret |= unit.kernel->get().setArg(idx++, global_dim1); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mOutputPlain.get())); - ret |= unit.kernel->get().setArg(idx++, inputWH[1]); - ret |= unit.kernel->get().setArg(idx++, inputWH[0]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - MNN_CHECK_CL_SUCCESS(ret, "setArg LayerNormBufExecution with group, 
convert nchw to nc4hw4"); - mOpenCLBackend->recordKernel2d(unit.kernel, mGWS, mLWS); - } - - mOpenCLBackend->onReleaseBuffer(mInputPlain.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mOutputPlain.get(), Backend::DYNAMIC); - return NO_ERROR; + if(inner_size % 4 != 0){ + buildOptions.emplace("-DPACK_LEAVE"); } - mLWS = {static_cast(local_size), 1, 1}; + + unit.kernel = runtime->buildKernel("layernorm_buf", "layernorm_buf", buildOptions); + mGWS = {static_cast(local_size), static_cast(outter_size)}; + mLWS = {static_cast(local_size), 1}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGWS[0]); ret |= unit.kernel->get().setArg(idx++, mGWS[1]); - ret |= unit.kernel->get().setArg(idx++, mGWS[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inputWidth)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inputHeight)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inputChannels)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inner_size)); if(has_gamma_beta_){ ret |= unit.kernel->get().setArg(idx++, *mGammaBuffer.get()); ret |= unit.kernel->get().setArg(idx++, *mBetaBuffer.get()); } ret |= unit.kernel->get().setArg(idx++, epsilon_); MNN_CHECK_CL_SUCCESS(ret, "setArg LayerNormBufExecution"); - mOpenCLBackend->recordKernel3d(unit.kernel, mGWS, mLWS); - unit.globalWorkSize = {mGWS[0], mGWS[1], mGWS[2]}; - unit.localWorkSize = {mLWS[0], mLWS[1], mLWS[2]}; + mOpenCLBackend->recordKernel2d(unit.kernel, mGWS, mLWS); + unit.globalWorkSize = {mGWS[0], mGWS[1]}; + unit.localWorkSize = {mLWS[0], mLWS[1]}; return NO_ERROR; diff --git a/source/backend/opencl/execution/buffer/LoopBufExecution.cpp b/source/backend/opencl/execution/buffer/LoopBufExecution.cpp index bf8dfc463..56476b59e 100644 --- a/source/backend/opencl/execution/buffer/LoopBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/LoopBufExecution.cpp @@ -12,137 +12,6 @@ namespace MNN { namespace OpenCL { - -static void _TileOrPackTensor(Tensor *input, Tensor *output, std::shared_ptr& kernelW, cl::NDRange &globalWorkSize, - cl::NDRange &localWorkSize, const int Width, const int Height, const int Channel, - const int Batch, OpenCLBackend *bn, const std::string& KernelName, std::set buildOptions, - const int WidthPad, const int HeightPad, const int ChannelPad, OpenCLRuntime* runtime) { - bool fastTileTranspose = false; - if (TensorUtils::getDescribe(output)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC || TensorUtils::getDescribe(input)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC){ - buildOptions.emplace("-DMNN_NHWC"); - } else { - if (KernelName == "tile_buf" && buildOptions.find("-DTRANSPOSE") != buildOptions.end() && (buildOptions.find("-DDIMENSION_3") != buildOptions.end() || buildOptions.find("-DDIMENSION_4") != buildOptions.end())) { - fastTileTranspose = true; - } - } - - std::string runKernelName = KernelName; - unsigned int tileW = 32; - unsigned int tileC = 32; - unsigned int tileH = 32; - - unsigned int localW = 8; - unsigned int localC = 8; - unsigned int localH = 8; - if(fastTileTranspose) { - // local memory limit - uint32_t local_mem_size = 4; - if(runtime->isSupportedFP16()) { - local_mem_size = 2; - } - - if(buildOptions.find("-DDIMENSION_4") != buildOptions.end()) { - local_mem_size *= (64 * 64 * 4); - if(local_mem_size <= runtime->getMaxLocalMem()) { - if((WidthPad & 63) == 0) { - tileW = 64; - } - if((HeightPad & 63) == 
0) { - tileH = 64; - } - } - - runKernelName = "tile_trans_4d_buf"; - // match with tileW tileH tileW/localW tileH/localH - buildOptions.emplace("-DWGSW=" + std::to_string(tileW)); - buildOptions.emplace("-DWGSH=" + std::to_string(tileH)); - buildOptions.emplace("-DTSW=" + std::to_string(tileW/localW)); - buildOptions.emplace("-DTSH=" + std::to_string(tileH/localH)); - } else { - local_mem_size *= (64 * 64); - if(local_mem_size <= runtime->getMaxLocalMem()) { - if((ChannelPad & 63) == 0) { - tileC = 64; - } - if((HeightPad & 63) == 0) { - tileH = 64; - } - } - runKernelName = "tile_trans_3d_buf"; - // match with tileW tileH tileW/localW tileH/localH - buildOptions.emplace("-DWGSC=" + std::to_string(tileC)); - buildOptions.emplace("-DWGSH=" + std::to_string(tileH)); - buildOptions.emplace("-DTSC=" + std::to_string(tileC/localC)); - buildOptions.emplace("-DTSH=" + std::to_string(tileH/localH)); - } - - } - if(input->getType().code == halide_type_int){ - kernelW = bn->getOpenCLRuntime()->buildKernel("loop_buf", runKernelName, buildOptions, input, input); - }else if (output->getType().code == halide_type_int){ - kernelW = bn->getOpenCLRuntime()->buildKernel("loop_buf", runKernelName, buildOptions, output, output); - }else { - kernelW = bn->getOpenCLRuntime()->buildKernel("loop_buf", runKernelName, buildOptions, input, output); - } - auto kernel = kernelW->get(); - - uint32_t mMaxWorkGroupSize = static_cast(bn->getOpenCLRuntime()->getMaxWorkGroupSize(kernelW)); - - if(fastTileTranspose) { - int w_per_thread = tileW / localW; - int h_per_thread = tileH / localH; - std::vector mGlobalWorkSize = {(uint32_t)WidthPad/w_per_thread, (uint32_t)HeightPad/h_per_thread, (uint32_t)(UP_DIV(ChannelPad, 4)*Batch)}; - std::vector mLocalWorkSize = {localW, localH, 1}; - - if(buildOptions.find("-DDIMENSION_3") != buildOptions.end()) { - int c_per_thread = tileC / localC; - int h_per_thread = tileH / localH; - mGlobalWorkSize = {(uint32_t)ChannelPad/c_per_thread, (uint32_t)HeightPad/h_per_thread, (uint32_t)Batch}; - mLocalWorkSize = {localC, localH, 1}; - } - - uint32_t index = 0; - cl_int ret = CL_SUCCESS; - ret |= kernel.setArg(index++, openCLBuffer(input)); - ret |= kernel.setArg(index++, openCLBuffer(output)); - ret |= kernel.setArg(index++, WidthPad); - ret |= kernel.setArg(index++, HeightPad); - ret |= kernel.setArg(index++, ChannelPad); - ret |= kernel.setArg(index++, Batch); - ret |= kernel.setArg(index++, Width); - ret |= kernel.setArg(index++, Height); - ret |= kernel.setArg(index++, Channel); - MNN_CHECK_CL_SUCCESS(ret, "setArg LoopBuf _TileOrPackTensor tile_transpose_fast_buf"); - - globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; - bn->recordKernel3d(kernelW, mGlobalWorkSize, mLocalWorkSize); - } else { - std::vector mGlobalWorkSize = {(uint32_t)WidthPad, (uint32_t)HeightPad, (uint32_t)(UP_DIV(ChannelPad, 4)*Batch)}; - - uint32_t index = 0; - cl_int ret = CL_SUCCESS; - ret |= kernel.setArg(index++, mGlobalWorkSize[0]); - ret |= kernel.setArg(index++, mGlobalWorkSize[1]); - ret |= kernel.setArg(index++, mGlobalWorkSize[2]); - ret |= kernel.setArg(index++, openCLBuffer(input)); - ret |= kernel.setArg(index++, openCLBuffer(output)); - ret |= kernel.setArg(index++, WidthPad); - ret |= kernel.setArg(index++, HeightPad); - ret |= kernel.setArg(index++, ChannelPad); - ret |= kernel.setArg(index++, Batch); - ret |= kernel.setArg(index++, Width); - ret |= kernel.setArg(index++, Height); - ret |= 
kernel.setArg(index++, Channel); - MNN_CHECK_CL_SUCCESS(ret, "setArg LoopBuf _TileOrPackTensor"); - - std::vector mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, bn->getOpenCLRuntime(), KernelName, kernelW).first; - - globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; - bn->recordKernel3d(kernelW, mGlobalWorkSize, mLocalWorkSize); - } -} static void _setTensorStack(std::vector &result, const std::vector &inputs, const std::vector &outputs, const LoopParam *loop) { @@ -190,23 +59,10 @@ ErrorCode LoopGatherBufExecution::onEncode(const std::vector &inputs, // gather { + Unit unit; auto input = mTensors[cmd->indexes()->data()[1]]; auto output = mTensors[cmd->indexes()->data()[0]]; - std::vector inputShape = tensorShapeFormat(input); - std::vector outputShape = tensorShapeFormat(output); - int inputShapeVec[4] = {inputShape[2], inputShape[1], inputShape[3], inputShape[0]}; - int outputShapeVec[4] = {outputShape[2], outputShape[1], outputShape[3], outputShape[0]}; - int offset_index = 0; - - Unit unit; std::set buildOptions; - if (TensorUtils::getDescribe(output)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC){ - buildOptions.emplace("-DGATHER_OUTPUT_NHWC"); - } - if (TensorUtils::getDescribe(input)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC){ - buildOptions.emplace("-DGATHER_INPUT_NHWC"); - } - if (mIter[0] >= 0) { buildOptions.emplace("-DOFFSET_DST"); } @@ -239,8 +95,6 @@ ErrorCode LoopGatherBufExecution::onEncode(const std::vector &inputs, ret |= unit.kernel->get().setArg(index++, sizeof(mStride_dst), mStride_dst); ret |= unit.kernel->get().setArg(index++, sizeof(mStep), mStep); ret |= unit.kernel->get().setArg(index++, sizeof(mIter), mIter); - ret |= unit.kernel->get().setArg(index++, sizeof(outputShapeVec), outputShapeVec); - ret |= unit.kernel->get().setArg(index++, sizeof(inputShapeVec), inputShapeVec); ret |= unit.kernel->get().setArg(index++, inputSize); MNN_CHECK_CL_SUCCESS(ret, "setArg LoopGatherBufExecution"); @@ -261,142 +115,6 @@ LoopBatchMatMulBufExecution::LoopBatchMatMulBufExecution(const LoopParam *loop, mTensors.resize(mLoop->tensorNumber()); } -static std::tuple getTileDimensionSize(std::tuple shape, std::tuple tile, MNN_DATA_FORMAT format, int dimension, bool transpose, int index) { - if(index > 2 || index < 0) { - MNN_ERROR("Error getTileDimensionSize index, only support 1 for input_1, 2 for input_2, 0 for output!\n"); - return shape; - } - // tile: {e, l, h} - int tile_e = std::get<0>(tile); - int tile_l = std::get<1>(tile); - int tile_h = std::get<2>(tile); - // shape: {w, h, c} - int pad_w = std::get<0>(shape); - int pad_h = std::get<1>(shape); - int pad_c = std::get<2>(shape); - - // output - if(index == 0) { - if (format == MNN::MNN_DATA_FORMAT_NHWC) { - if(dimension == 3) { - // [N, H, W] -> (n, e, h) - pad_h = ROUND_UP(pad_h, tile_e); - pad_w = ROUND_UP(pad_w, tile_h); - } else { - // [N*H, W, C] -> [n, e, h] - pad_w = ROUND_UP(pad_w, tile_e); - pad_c = ROUND_UP(pad_c, tile_h); - } - } else { - if(dimension == 3) { - // [N, C, H] -> (n, e, h) - pad_c = ROUND_UP(pad_c, tile_e); - pad_h = ROUND_UP(pad_h, tile_h); - } else { - // [N*C, H, W] -> [n, e, h] - pad_h = ROUND_UP(pad_h, tile_e); - pad_w = ROUND_UP(pad_w, tile_h); - } - } - return std::make_tuple(pad_w, pad_h, pad_c); - } - - if (format == MNN::MNN_DATA_FORMAT_NHWC) { - if(dimension == 3) { - if(transpose) { - if(index == 1) { - // [N, H, W] -> (n, l, e) - pad_h = ROUND_UP(pad_h, 
tile_l); - pad_w = ROUND_UP(pad_w, tile_e); - } else { - // [N, H, W] -> (n, h, l) - pad_h = ROUND_UP(pad_h, tile_h); - pad_w = ROUND_UP(pad_w, tile_l); - } - } else { - if(index == 1) { - // [N, H, W] -> (n, e, l) - pad_h = ROUND_UP(pad_h, tile_e); - pad_w = ROUND_UP(pad_w, tile_l); - } else { - // [N, H, W] -> (n, l, h) - pad_h = ROUND_UP(pad_h, tile_l); - pad_w = ROUND_UP(pad_w, tile_h); - } - } - } else { - if(transpose) { - if(index == 1) { - // [N*H, W, C] -> (n, l, e) - pad_w = ROUND_UP(pad_w, tile_l); - pad_c = ROUND_UP(pad_c, tile_e); - } else { - // [N*H, W, C] -> (n, h, l) - pad_w = ROUND_UP(pad_w, tile_h); - pad_c = ROUND_UP(pad_c, tile_l); - } - } else { - if(index == 1) { - // [N*H, W, C] -> [n, e, l] - pad_w = ROUND_UP(pad_w, tile_e); - pad_c = ROUND_UP(pad_c, tile_l); - } else { - // [N*H, W, C] -> [n, l, h] - pad_w = ROUND_UP(pad_w, tile_l); - pad_c = ROUND_UP(pad_c, tile_h); - } - } - } - } else { - if(dimension == 3) { - if(transpose) { - if(index == 1) { - // [N, C, H] -> (n, l, e) - pad_c = ROUND_UP(pad_c, tile_l); - pad_h = ROUND_UP(pad_h, tile_e); - } else { - // [N, C, H] -> (n, h, l) - pad_c = ROUND_UP(pad_c, tile_h); - pad_h = ROUND_UP(pad_h, tile_l); - } - } else { - if(index == 1) { - // [N, C, H] -> (n, e, l) - pad_c = ROUND_UP(pad_c, tile_e); - pad_h = ROUND_UP(pad_h, tile_l); - } else { - // [N, C, H] -> (n, l, h) - pad_c = ROUND_UP(pad_c, tile_l); - pad_h = ROUND_UP(pad_h, tile_h); - } - } - } else { - if(transpose) { - if(index == 1) { - // [N*C, H, W] -> (n, l, e) - pad_h = ROUND_UP(pad_h, tile_l); - pad_w = ROUND_UP(pad_w, tile_e); - } else { - // [N*C, H, W] -> (n, h, l) - pad_h = ROUND_UP(pad_h, tile_h); - pad_w = ROUND_UP(pad_w, tile_l); - } - } else { - if(index == 1) { - // [N*C, H, W] -> [n, e, l] - pad_h = ROUND_UP(pad_h, tile_e); - pad_w = ROUND_UP(pad_w, tile_l); - } else { - // [N*C, H, W] -> [n, l, h] - pad_h = ROUND_UP(pad_h, tile_l); - pad_w = ROUND_UP(pad_w, tile_h); - } - } - } - } - return std::make_tuple(pad_w, pad_h, pad_c); -} - ErrorCode LoopBatchMatMulBufExecution::onEncode(const std::vector &inputs, const std::vector &outputs) { auto cmd = mLoop->commands()->GetAs(0); mHasBias = cmd->indexes()->size() > 3; @@ -410,10 +128,7 @@ ErrorCode LoopBatchMatMulBufExecution::onEncode(const std::vector &inp mOffset[1] = cmd->view()->GetAs(1)->offset(); mOffset[2] = cmd->view()->GetAs(2)->offset(); mUnits.clear(); - mOffsetTensors.clear(); - mTmpTensors.resize(3); if (mHasBias) { - mTmpTensors.resize(4); mOffset[3] = cmd->view()->GetAs(3)->offset(); } @@ -424,190 +139,8 @@ ErrorCode LoopBatchMatMulBufExecution::onEncode(const std::vector &inp int h = cmd->size()->data()[2]; int n = mLoop->loopNumber(); - int tileM = 32; - int tileN = 32; - int tileK = 4; - bool isTotalLarge = (e * 1.0 / 512 * l / 512 * h / 512 > 0.5); - bool isDimLarge = (e > 256 && l > 256 && h > 256); - int max_eh = std::max(e, h); - int min_eh = std::min(e, h); - isDimLarge = isDimLarge || (l >= 512 && (max_eh > 1024 || min_eh > 32)); - - mBatchGemmOpt = isTotalLarge && isDimLarge; - for(int i = 0; i < cmd->iterIndexes()->size(); ++i){ - if (mIter[i] >= 0) { - mBatchGemmOpt = false; - break; - } - } - - if(mHasBias) { - mBatchGemmOpt = false; - } - - bool needRearrangeA = false; - if(mBatchGemmOpt && !mTransposeA) { - // rearrange to [n, l, e] - needRearrangeA = true; - } - bool needRearrangeB = false; - if(mBatchGemmOpt && mTransposeB) { - // rearrange to [n, l, h] - needRearrangeB = true; - } - - // tile input - for (int i = 1; i < cmd->indexes()->size(); ++i) { - auto 
input = mTensors[cmd->indexes()->data()[i]]; - std::vector Shape = tensorShapeFormat(input); - const int Channel = Shape.at(3); - const int Width = Shape.at(2); - const int Height = Shape.at(1); - const int Batch = Shape.at(0); - bool needTranspose = false; - if(i == 1) { - needTranspose = needRearrangeA; - } else if(i == 2) { - needTranspose = needRearrangeB; - } - - Unit unit; - std::set buildOptions = mBuildOptions; - if(needTranspose) { - buildOptions.emplace("-DTRANSPOSE"); - } - if(input->buffer().dimensions == 3) { - buildOptions.emplace("-DDIMENSION_3"); - } - if(input->buffer().dimensions == 4) { - buildOptions.emplace("-DDIMENSION_4"); - } - - int WidthPad = Width; - int HeightPad = Height; - int ChannelPad = Channel; - - if(mBatchGemmOpt) { - auto shape = getTileDimensionSize(std::make_tuple(Width, Height, Channel), std::make_tuple(tileM, tileK, tileN), TensorUtils::getDescribe(input)->dimensionFormat, input->buffer().dimensions, needTranspose, i); - WidthPad = std::get<0>(shape); - HeightPad = std::get<1>(shape); - ChannelPad = std::get<2>(shape); - } - - mTmpTensors[i] = std::make_shared(Tensor::createDevice(std::vector{Batch, ChannelPad, HeightPad, WidthPad}, Tensor::CAFFE)); - // MNN_PRINT("input%d, %d %d %d %d\n", i, Batch, ChannelPad, HeightPad, WidthPad); - - mOpenCLBackend->onAcquireBuffer(mTmpTensors[i].get(), Backend::DYNAMIC); - _TileOrPackTensor(input, mTmpTensors[i].get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, mOpenCLBackend, "tile_buf", buildOptions, WidthPad, HeightPad, ChannelPad, runTime); - mUnits.emplace_back(unit); - } - - for(int i = 0; i < cmd->iterIndexes()->size(); ++i){ - if (mIter[i] >= 0) { - auto input = mTensors[cmd->iterIndexes()->data()[i]]; - std::vector Shape = tensorShapeFormat(input); - const int Channel = Shape.at(3); - const int Width = Shape.at(2); - const int Height = Shape.at(1); - const int Batch = Shape.at(0); - mOffsetTensors.emplace_back(std::make_shared(Tensor::createDevice(std::vector{Batch, Channel, Height, Width}, Tensor::CAFFE))); - mOpenCLBackend->onAcquireBuffer(mOffsetTensors.back().get(), Backend::DYNAMIC); - // MNN_PRINT("input%d offset, %d %d %d %d\n", i, Batch, Channel, Height, Width); - - Unit unit; - _TileOrPackTensor(input, mOffsetTensors.back().get(), unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, mOpenCLBackend, "tile_buf", mBuildOptions, Width, Height, Channel, runTime); - mUnits.emplace_back(unit); - } - } - - mBatch = n; - mM = e; - mN = h; - mK = l; - if(mBatchGemmOpt) { - // matmul - int e_pack = ROUND_UP(e, tileM); - int l_pack = ROUND_UP(l, tileK); - int h_pack = ROUND_UP(h, tileN); - mTmpTensors[0] = std::make_shared(Tensor::createDevice(std::vector{n * e_pack * h_pack}, Tensor::CAFFE)); - mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC); - - - std::set buildOptions; - - uint32_t layout = 0; - auto param = getGemmParams({(uint32_t)e_pack, (uint32_t)h_pack, (uint32_t)l_pack, layout, (uint32_t)n, (uint32_t)0}, {openCLBuffer(mTmpTensors[1].get()), openCLBuffer(mTmpTensors[2].get()), openCLBuffer(mTmpTensors[0].get())}, mOpenCLBackend->getOpenCLRuntime()); - - int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; - buildOptions.emplace("-DKWG=" + std::to_string(KWG)); - buildOptions.emplace("-DKWI=" + std::to_string(KWI)); - 
buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); - buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); - buildOptions.emplace("-DMWG=" + std::to_string(MWG)); - buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); - buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); - buildOptions.emplace("-DNWG=" + std::to_string(NWG)); - buildOptions.emplace("-DSA=" + std::to_string(SA)); - buildOptions.emplace("-DSB=" + std::to_string(SB)); - buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); - buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); - buildOptions.emplace("-DVWM=" + std::to_string(VWM)); - buildOptions.emplace("-DVWN=" + std::to_string(VWN)); - if(layout >= 4) { - buildOptions.emplace("-DOUTPUTMN"); - } - - tileM = MWG; - tileN = NWG; - int localM = MDIMC; - int localN = NDIMC; - - if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { - buildOptions.emplace("-DUSE_CL_MAD=1"); - buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); - } - - Unit unit; - unit.kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); - - int out_per_thread_m = tileM / localM; - int out_per_thread_n = tileN / localN; - - std::vector globalWorkSize = {static_cast(e_pack/out_per_thread_m), static_cast(h_pack/out_per_thread_n), static_cast(n)}; - std::vector localWorkSize = {static_cast(localM), static_cast(localN), 1}; - - float alpha = 1.0; - float beta = 0.0f; - int batch_offset_a = e_pack * l_pack; - int batch_offset_b = h_pack * l_pack; - int batch_offset_c = e_pack * h_pack; - int idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, static_cast(e_pack)); - ret |= unit.kernel->get().setArg(idx++, static_cast(h_pack)); - ret |= unit.kernel->get().setArg(idx++, static_cast(l_pack)); - ret |= unit.kernel->get().setArg(idx++, alpha); - ret |= unit.kernel->get().setArg(idx++, beta); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTmpTensors[1].get())); - ret |= unit.kernel->get().setArg(idx++, batch_offset_a); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTmpTensors[2].get())); - ret |= unit.kernel->get().setArg(idx++, batch_offset_b); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTmpTensors[0].get())); - ret |= unit.kernel->get().setArg(idx++, batch_offset_c); - MNN_CHECK_CL_SUCCESS(ret, "setArg LoopBuf GemmTile Kernel"); - - unit.globalWorkSize = {globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]}; - unit.localWorkSize = {localWorkSize[0], localWorkSize[1], localWorkSize[2]}; - mUnits.emplace_back(unit); - mOpenCLBackend->recordKernel3d(unit.kernel, globalWorkSize, localWorkSize); - - } else { + { // matmul - mTmpTensors[0] = std::make_shared(Tensor::createDevice(std::vector{1, n, e, h}, Tensor::CAFFE)); - mOpenCLBackend->onAcquireBuffer(mTmpTensors[0].get(), Backend::DYNAMIC); - int offset_index = 0; - - // MNN_PRINT("batchgemm:%d, %d %d %d, transAB %d %d, bias:%d, inputsize:%d\n", n, e, h, l, mTransposeA, mTransposeB, mHasBias, cmd->indexes()->size()); Unit unit; std::string KernelName = "batch_matmul"; std::set buildOptions = mBuildOptions; @@ -630,15 +163,15 @@ ErrorCode LoopBatchMatMulBufExecution::onEncode(const std::vector &inp ret |= unit.kernel->get().setArg(index++, mGlobalWorkSize[0]); ret |= unit.kernel->get().setArg(index++, mGlobalWorkSize[1]); ret |= unit.kernel->get().setArg(index++, mGlobalWorkSize[2]); - ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTmpTensors[0].get())); - ret |= unit.kernel->get().setArg(index++, 
openCLBuffer(mTmpTensors[1].get())); - ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTmpTensors[2].get())); + ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[0]])); + ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]])); + ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[2]])); if (mHasBias) { - ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTmpTensors[3].get())); + ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[3]])); } for (int i = 0; i < cmd->iterIndexes()->size(); ++i) { if (mIter[i] >= 0) { - ret |= unit.kernel->get().setArg(index++, openCLBuffer(mOffsetTensors[offset_index++].get())); + ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTensors[cmd->iterIndexes()->data()[i]])); } else { ret |= unit.kernel->get().setArg(index++, openCLBuffer(mTensors[cmd->indexes()->data()[1]])); } @@ -659,116 +192,9 @@ ErrorCode LoopBatchMatMulBufExecution::onEncode(const std::vector &inp mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); } - //pack output - { - auto output = mTensors[cmd->indexes()->data()[0]]; - std::vector Shape = tensorShapeFormat(output); - const int Channel = Shape.at(3); - const int Width = Shape.at(2); - const int Height = Shape.at(1); - const int Batch = Shape.at(0); - // MNN_PRINT("output, %d %d %d %d\n", Batch, Channel, Height, Width); - - Unit unit; - std::set buildOptions = mBuildOptions; - if(mBatchGemmOpt) { - buildOptions.emplace("-DTRANSPOSE"); - if (mHasBias) { - buildOptions.emplace("-DBIAS"); - } - if(output->buffer().dimensions == 3) { - buildOptions.emplace("-DDIMENSION_3"); - } - if(output->buffer().dimensions == 4) { - buildOptions.emplace("-DDIMENSION_4"); - } - } - - int WidthPad = Width; - int HeightPad = Height; - int ChannelPad = Channel; - if(mBatchGemmOpt) { - auto shape = getTileDimensionSize(std::make_tuple(Width, Height, Channel), std::make_tuple(tileM, tileK, tileN), TensorUtils::getDescribe(output)->dimensionFormat, output->buffer().dimensions, false, 0); - WidthPad = std::get<0>(shape); - HeightPad = std::get<1>(shape); - ChannelPad = std::get<2>(shape); - } - _TileOrPackTensor(mTmpTensors[0].get(), output, unit.kernel, unit.globalWorkSize, unit.localWorkSize, Width, Height, Channel, Batch, mOpenCLBackend, "pack_buf", buildOptions, WidthPad, HeightPad, ChannelPad, runTime); - mUnits.emplace_back(unit); - } - - for (int i = 0; i < cmd->indexes()->size(); ++i) { - mOpenCLBackend->onReleaseBuffer(mTmpTensors[i].get(), Backend::DYNAMIC); - } - for (int i = 0; i < mOffsetTensors.size(); ++i) { - mOpenCLBackend->onReleaseBuffer(mOffsetTensors[i].get(), Backend::DYNAMIC); - } - return NO_ERROR; } -ErrorCode LoopBatchMatMulBufExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto openCLBackend = static_cast(backend()); - auto runtime = openCLBackend->getOpenCLRuntime(); -#ifdef ENABLE_OPENCL_TIME_PROFILER - int idx = 0; -#else - if(openCLBackend->isUseRecordQueue()){ - openCLBackend->addRecord(mRecording, mOpRecordUpdateInfo); - return NO_ERROR; - } -#endif - auto res = CL_SUCCESS; - for (auto &unit : mUnits) { - #ifdef ENABLE_OPENCL_TIME_PROFILER - cl::Event event; - res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), - cl::NullRange, - unit.globalWorkSize, - unit.localWorkSize, - nullptr, - &event); - std::string name = "While-gemm"; - - if(mBatchGemmOpt) { - if(idx == 2) { - name += "-batchgemm"; - } 
else if(idx == 0) { - name += "-rearrangeA"; - } else if(idx == 1) { - name += "-rearrangeB"; - } else { - name += "-rearrangeC"; - } - } else { - if(idx == mUnits.size()-2) { - name += "-batchgemm"; - } else if(idx == 0) { - name += "-rearrangeA"; - } else if(idx == 1) { - name += "-rearrangeB"; - } else { - name += "-rearrangeC"; - } - } - std::string b = std::to_string(mBatch); - std::string m = std::to_string(mM); - std::string n = std::to_string(mN); - std::string k = std::to_string(mK); - std::string total = std::to_string(1.0 / 1000000 * mBatch * mM * mN * mK); - name += "-b" + b + "m" + m + "n" + n + "k" + k + "-total:" + total + "*10^6"; - runtime->pushEvent({name.c_str(), event}); - idx++; - #else - res = runtime->commandQueue().enqueueNDRangeKernel(unit.kernel->get(), - cl::NullRange, - unit.globalWorkSize, - unit.localWorkSize); - #endif - MNN_CHECK_CL_SUCCESS(res, "While-gemm execute"); - } - return NO_ERROR; -} LoopBinaryBufExecution::LoopBinaryBufExecution(const LoopParam *loop, const std::string &compute, const MNN::Op *op, Backend *bn) : CommonExecution(bn, op) { mLoop = loop; @@ -784,115 +210,28 @@ ErrorCode LoopBinaryBufExecution::onEncode(const std::vector &inputs, mUnits.clear(); Unit unit; + int z = cmd->size()->data()[0]; + int y = cmd->size()->data()[1]; + int x = cmd->size()->data()[2]; + int n = mLoop->loopNumber(); + int inputSize = mTensors[cmd->indexes()->data()[1]]->elementSize(); + + auto src0Stride = cmd->view()->GetAs(1)->stride()->data(); + auto src1Stride = cmd->view()->GetAs(2)->stride()->data(); + auto dstStride = cmd->view()->GetAs(0)->stride()->data(); + for (int i = 0; i < 3; ++i) { + mStride_src0[i] = src0Stride[i]; + mStride_src1[i] = src1Stride[i]; + mStride_dst[i] = dstStride[i]; + } + auto input0 = mTensors[cmd->indexes()->data()[1]]; - std::vector input0C4Shape = tensorShapeFormat(input0); - int input0C4Size[4] = {input0C4Shape.at(0), input0C4Shape.at(3),input0C4Shape.at(1),input0C4Shape.at(2)}; - auto input1 = mTensors[cmd->indexes()->data()[2]]; - std::vector input1C4Shape = tensorShapeFormat(input1); - int input1C4Size[4] = {input1C4Shape.at(0), input1C4Shape.at(3),input1C4Shape.at(1),input1C4Shape.at(2)}; - auto output = mTensors[cmd->indexes()->data()[0]]; - std::vector outputC4Shape = tensorShapeFormat(output); - - int input0Shape[8] = {1, 1, 1, 1, 1, 1, 1, 1}; - int input1Shape[8] = {1, 1, 1, 1, 1, 1, 1, 1}; - int outputShape[8] = {1, 1, 1, 1, 1, 1, 1, 1}; - - int offset0 = output->dimensions() - input0->dimensions(); - int offset1 = output->dimensions() - input1->dimensions(); - for (int i = 0; i < input0->dimensions(); ++i) { - input0Shape[i + offset0] = input0->length(i); - } - for (int i = 0; i < input1->dimensions(); ++i) { - input1Shape[i + offset1] = input1->length(i); - } - for(int i =0;idimensions();++i){ - outputShape[i] = output->length(i); - } - if (TensorUtils::getDescribe(input0)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC) - { - int iN = input0Shape[0]; - int iH = input0Shape[1]; - int iW = input0Shape[2]; - int iC = input0Shape[3]; - - if(input0->dimensions() > 4) - { - for(int i = 4; i < input0->dimensions(); i++) - { - iC *= input0Shape[i]; - } - } - input0Shape[0] = iN; - input0Shape[1] = iC; - input0Shape[2] = iH; - input0Shape[3] = iW; - input0Shape[4] = 1; - } - if (TensorUtils::getDescribe(input1)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC) - { - int iN = input1Shape[0]; - int iH = input1Shape[1]; - int iW = input1Shape[2]; - int iC = input1Shape[3]; - - if(input1->dimensions() > 4) - { - for(int i = 4; i < 
input1->dimensions(); i++) - { - iC *= input1Shape[i]; - } - } - input1Shape[0] = iN; - input1Shape[1] = iC; - input1Shape[2] = iH; - input1Shape[3] = iW; - input1Shape[4] = 1; - } - if (TensorUtils::getDescribe(output)->dimensionFormat == MNN::MNN_DATA_FORMAT_NHWC) - { - int iN = outputShape[0]; - int iH = outputShape[1]; - int iW = outputShape[2]; - int iC = outputShape[3]; - - if(input1->dimensions() > 4) - { - for(int i = 4; i < output->dimensions(); i++) - { - iC *= outputShape[i]; - } - } - outputShape[0] = iN; - outputShape[1] = iC; - outputShape[2] = iH; - outputShape[3] = iW; - outputShape[4] = 1; - } - auto BuildOptions = mBuildOptions; - for(int i = 0; i < 4; ++i){ - if(input1C4Shape[i] != outputC4Shape[i]){ - BuildOptions.emplace("-DBROADCAST_INPUT1"); - break; - } - } - - const int Channel = outputC4Shape.at(3); - const int Width = outputC4Shape.at(2); - const int Height = outputC4Shape.at(1); - const int Batch = outputC4Shape.at(0); - const int ChannelBlock = UP_DIV(Channel, 4); - std::string KernelName = "broadcast_binary_buf"; - if(input0Shape[1] == input1Shape[1] && input0C4Size[1] == input1C4Size[1]){ - KernelName = "broadcast_binary_channel_equall_buf"; - } else if((input0->dimensions() == 1 && input0Shape[1] == 1) || (input1->dimensions() == 1 && input1Shape[1] == 1)){ - KernelName = "broadcast_binary_dimmision1_channel1_buf"; - } - unit.kernel = runTime->buildKernel("loop_buf", KernelName, BuildOptions, input0, output); + unit.kernel = runTime->buildKernel("loop_buf", "loop_binary_buf", mBuildOptions, input0, output); uint32_t mMaxWorkGroupSize = static_cast(runTime->getMaxWorkGroupSize(unit.kernel)); - - std::vector mGlobalWorkSize = {(uint32_t)(Width), (uint32_t)(Height), (uint32_t)(Batch * ChannelBlock)}; + + std::vector mGlobalWorkSize = {(uint32_t)(x), (uint32_t)(y), (uint32_t)(z)}; uint32_t index = 0; cl_int ret = CL_SUCCESS; @@ -902,18 +241,18 @@ ErrorCode LoopBinaryBufExecution::onEncode(const std::vector &inputs, ret |= unit.kernel->get().setArg(index++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(index++, openCLBuffer(input0)); ret |= unit.kernel->get().setArg(index++, openCLBuffer(input1)); - ret |= unit.kernel->get().setArg(index++, sizeof(input0Shape), input0Shape); - ret |= unit.kernel->get().setArg(index++, sizeof(input0C4Size), input0C4Size); - ret |= unit.kernel->get().setArg(index++, sizeof(input1Shape), input1Shape); - ret |= unit.kernel->get().setArg(index++, sizeof(input1C4Size), input1C4Size); - ret |= unit.kernel->get().setArg(index++, sizeof(outputShape), outputShape); - ret |= unit.kernel->get().setArg(index++, Width); - ret |= unit.kernel->get().setArg(index++, Height); - ret |= unit.kernel->get().setArg(index++, Channel); - ret |= unit.kernel->get().setArg(index++, ChannelBlock); + ret |= unit.kernel->get().setArg(index++, mStride_src0[0]); + ret |= unit.kernel->get().setArg(index++, mStride_src0[1]); + ret |= unit.kernel->get().setArg(index++, mStride_src0[2]); + ret |= unit.kernel->get().setArg(index++, mStride_src1[0]); + ret |= unit.kernel->get().setArg(index++, mStride_src1[1]); + ret |= unit.kernel->get().setArg(index++, mStride_src1[2]); + ret |= unit.kernel->get().setArg(index++, mStride_dst[0]); + ret |= unit.kernel->get().setArg(index++, mStride_dst[1]); + ret |= unit.kernel->get().setArg(index++, mStride_dst[2]); MNN_CHECK_CL_SUCCESS(ret, "setArg LoopBinaryBufExecution"); - std::vector mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, KernelName, unit.kernel).first; + std::vector 
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runTime, "loop_binary_buf", unit.kernel).first; unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; @@ -958,35 +297,35 @@ class LoopBufCreator : public OpenCLBackend::Creator { case BinaryOpOperation_SUB: return new LoopBinaryBufExecution(loop, "in0-in1", op, backend); case BinaryOpOperation_REALDIV: - return new LoopBinaryBufExecution(loop, "sign(in1)*in0/(fabs(in1)>(float4)((float)0.0000001)?fabs(in1):(float4)((float)0.0000001))", op, backend); + return new LoopBinaryBufExecution(loop, "sign(in1)*in0/(fabs(in1)>(float)((float)0.0000001)?fabs(in1):(float)((float)0.0000001))", op, backend); case BinaryOpOperation_MINIMUM: return new LoopBinaryBufExecution(loop, "in0>in1?in1:in0", op, backend); case BinaryOpOperation_MAXIMUM: return new LoopBinaryBufExecution(loop, "in0>in1?in0:in1", op, backend); case BinaryOpOperation_GREATER: - return new LoopBinaryBufExecution(loop, "convert_float4(-isgreater(in0,in1))", op, backend); + return new LoopBinaryBufExecution(loop, "(float)(isgreater(in0,in1))", op, backend); case BinaryOpOperation_LESS: - return new LoopBinaryBufExecution(loop, "convert_float4(-isless(in0,in1))", op, backend); + return new LoopBinaryBufExecution(loop, "(float)(isless(in0,in1))", op, backend); case BinaryOpOperation_LESS_EQUAL: - return new LoopBinaryBufExecution(loop, "convert_float4(-islessequal(in0,in1))", op, backend); + return new LoopBinaryBufExecution(loop, "(float)(islessequal(in0,in1))", op, backend); case BinaryOpOperation_GREATER_EQUAL: - return new LoopBinaryBufExecution(loop, "convert_float4(-isgreaterequal(in0,in1))", op, backend); + return new LoopBinaryBufExecution(loop, "(float)(isgreaterequal(in0,in1))", op, backend); case BinaryOpOperation_EQUAL: - return new LoopBinaryBufExecution(loop, "convert_float4(-isequal(in0,in1))", op, backend); + return new LoopBinaryBufExecution(loop, "(float)(isequal(in0,in1))", op, backend); case BinaryOpOperation_FLOORDIV: - return new LoopBinaryBufExecution(loop, "floor(sign(in1)*in0/(fabs(in1)>(float4)((float)0.0000001)?fabs(in1):(float4)((float)0.0000001)))", op, backend); + return new LoopBinaryBufExecution(loop, "floor(sign(in1)*in0/(fabs(in1)>(float)((float)0.0000001)?fabs(in1):(float)((float)0.0000001)))", op, backend); case BinaryOpOperation_FLOORMOD: - return new LoopBinaryBufExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(float4)((float)0.0000001)?fabs(in1):(float4)((float)0.0000001)))*in1", op, backend); + return new LoopBinaryBufExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(float)((float)0.0000001)?fabs(in1):(float)((float)0.0000001)))*in1", op, backend); case BinaryOpOperation_POW: return new LoopBinaryBufExecution(loop, "pow(in0,in1)", op, backend); case BinaryOpOperation_SquaredDifference: return new LoopBinaryBufExecution(loop, "(in0-in1)*(in0-in1)", op, backend); case BinaryOpOperation_ATAN2: - return new LoopBinaryBufExecution(loop, "(in1==(float4)0?(sign(in0)*(float4)(PI/2)):(atan(in0/in1)+(in1>(float4)0?(float4)0:sign(in0)*(float4)PI)))", op, backend); + return new LoopBinaryBufExecution(loop, "(in1==(float)0?(sign(in0)*(float)(PI/2)):(atan(in0/in1)+(in1>(float)0?(float)0:sign(in0)*(float)PI)))", op, backend); case BinaryOpOperation_NOTEQUAL: - return new LoopBinaryBufExecution(loop, "convert_float4(-isnotequal(in0,in1))", op, backend); + return new LoopBinaryBufExecution(loop, "(float)(isnotequal(in0,in1))", op, 
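// Note: LoopBinaryBufExecution above now hands the kernel raw per-view strides and the loop
// sizes (x, y, z) instead of reconstructed NC4HW4 shapes, and the compute strings become
// scalar (float) expressions. A sketch of the per-element addressing this implies, assuming
// the usual MNN view convention dst[z*s0 + y*s1 + x*s2]; names here are illustrative only.
inline void loopBinaryElement(float* dst, const float* src0, const float* src1,
                              const int strideDst[3], const int stride0[3], const int stride1[3],
                              int x, int y, int z) {
    const int od = z * strideDst[0] + y * strideDst[1] + x * strideDst[2];
    const int o0 = z * stride0[0]   + y * stride0[1]   + x * stride0[2];
    const int o1 = z * stride1[0]   + y * stride1[1]   + x * stride1[2];
    dst[od] = src0[o0] + src1[o1];  // stands in for the configured "in0+in1"-style compute string
}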
backend); case BinaryOpOperation_MOD: - return new LoopBinaryBufExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(float4)((float)0.0000001)?fabs(in1):(float4)((float)0.0000001)))*in1", op, backend); + return new LoopBinaryBufExecution(loop, "in0-floor(sign(in1)*in0/(fabs(in1)>(float)((float)0.0000001)?fabs(in1):(float)((float)0.0000001)))*in1", op, backend); default: break; } diff --git a/source/backend/opencl/execution/buffer/LoopBufExecution.hpp b/source/backend/opencl/execution/buffer/LoopBufExecution.hpp index aba7848ff..6bad208af 100644 --- a/source/backend/opencl/execution/buffer/LoopBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/LoopBufExecution.hpp @@ -39,14 +39,11 @@ class LoopBatchMatMulBufExecution : public CommonExecution { LoopBatchMatMulBufExecution(const LoopParam *loop, const MNN::Op *op, Backend *bn); virtual ~LoopBatchMatMulBufExecution() = default; virtual ErrorCode onEncode(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; private: const LoopParam *mLoop; std::vector mTensors; - std::vector> mTmpTensors; - std::vector> mOffsetTensors; int mOffset[4]; int mStep[4]; int mIter[4]; @@ -54,8 +51,6 @@ class LoopBatchMatMulBufExecution : public CommonExecution { bool mTransposeA = false; bool mTransposeB = false; std::set mBuildOptions; - bool mBatchGemmOpt = false; - int mBatch, mM, mN, mK; }; @@ -69,6 +64,9 @@ class LoopBinaryBufExecution : public CommonExecution { const LoopParam *mLoop; std::vector mTensors; std::set mBuildOptions; + int mStride_src0[3]; + int mStride_src1[3]; + int mStride_dst[3]; }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp b/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp index ea055eb37..4062220fb 100644 --- a/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/MatmulBufExecution.cpp @@ -122,11 +122,21 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector &inputs, cons unit.kernel = runtime->buildKernel("matmul_local_buf", "matmul_local_buf", buildOptions); } else { if(mTransposeA) { - mKernelName = mTransposeB ? "matmul_transA_transB_buf":"matmul_transA_buf"; - } else { - mKernelName = mTransposeB ? 
"matmul_transB_buf":"matmul_buf"; + buildOptions.emplace(" -DTRANSPOSE_A"); + } + if(mTransposeB) { + buildOptions.emplace(" -DTRANSPOSE_B"); } - unit.kernel = runtime->buildKernel("matmul_buf", mKernelName, buildOptions); + if(M % 4 != 0) { + buildOptions.emplace(" -DM_LEAVE"); + } + if(N % 4 != 0) { + buildOptions.emplace(" -DN_LEAVE"); + } + if(K % 4 != 0) { + buildOptions.emplace(" -DK_LEAVE"); + } + unit.kernel = runtime->buildKernel("matmul_buf", "matmul_buf", buildOptions); } mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); @@ -183,46 +193,22 @@ ErrorCode MatMulBufExecution::onEncode(const std::vector &inputs, cons MNN_CHECK_CL_SUCCESS(ret, "setArg MatMulBufExecution use tile opt"); } else { - if(mTransposeA) { - mGlobalWorkSize = {static_cast(N_4), static_cast(M_4)}; - int idx = 0; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input0)); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input1)); - if(inputs.size() > 2) { - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputs[2])); - } - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, static_cast(K)); - ret |= unit.kernel->get().setArg(idx++, static_cast(K_4)); - ret |= unit.kernel->get().setArg(idx++, static_cast(M)); - ret |= unit.kernel->get().setArg(idx++, static_cast(M_4)); - ret |= unit.kernel->get().setArg(idx++, static_cast(N_4)); - ret |= unit.kernel->get().setArg(idx++, static_cast(N)); - MNN_CHECK_CL_SUCCESS(ret, "setArg MatMulBufExecution mTransposeA"); - - mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, unit.kernel).first; + mGlobalWorkSize = {static_cast(N_4), static_cast(M_4)}; + int idx = 0; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input0)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input1)); + if(inputs.size() > 2) { + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputs[2])); } - else { + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); + ret |= unit.kernel->get().setArg(idx++, static_cast(M)); + ret |= unit.kernel->get().setArg(idx++, static_cast(N)); + ret |= unit.kernel->get().setArg(idx++, static_cast(K)); + MNN_CHECK_CL_SUCCESS(ret, "setArg MatMulBufExecution mTransposeA"); - mGlobalWorkSize = {static_cast(N_4), static_cast(M)}; - int idx = 0; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input0)); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input1)); - if(inputs.size() > 2) { - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputs[2])); - } - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, static_cast(K)); - ret |= unit.kernel->get().setArg(idx++, static_cast(K_4)); - ret |= unit.kernel->get().setArg(idx++, static_cast(N_4)); - ret |= unit.kernel->get().setArg(idx++, static_cast(N)); - MNN_CHECK_CL_SUCCESS(ret, "setArg MatMulBufExecution"); - mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, unit.kernel).first; - } + mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, 
mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, unit.kernel).first; } mOpenCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; diff --git a/source/backend/opencl/execution/buffer/PoolBufExecution.cpp b/source/backend/opencl/execution/buffer/PoolBufExecution.cpp index 8b61b4d77..66e29d1b7 100644 --- a/source/backend/opencl/execution/buffer/PoolBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/PoolBufExecution.cpp @@ -159,7 +159,7 @@ ErrorCode PoolBufExecution::onEncode(const std::vector &inputs, const ret |= unit.kernel->get().setArg(idx++, sizeof(kernelShape), kernelShape); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(redice)); - ret |= unit.kernel->get().setArg(idx++, channelBlocks); + ret |= unit.kernel->get().setArg(idx++, batch); MNN_CHECK_CL_SUCCESS(ret, "setArg PoolBufExecution"); std::string kernelNameTune = "pooling_buf"; @@ -296,6 +296,7 @@ ErrorCode PoolBufExecution::SubgrouponResize(const std::vector &inputs ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(redice)); ret |= unit.kernel->get().setArg(idx++, channels); + ret |= unit.kernel->get().setArg(idx++, batch); ret |= unit.kernel->get().setArg(idx++, in_channel_block); ret |= unit.kernel->get().setArg(idx++, out_channel_block); ret |= unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); diff --git a/source/backend/opencl/execution/buffer/RangeBufExecution.cpp b/source/backend/opencl/execution/buffer/RangeBufExecution.cpp index 913e841e8..cc42c3624 100644 --- a/source/backend/opencl/execution/buffer/RangeBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/RangeBufExecution.cpp @@ -20,43 +20,35 @@ ErrorCode RangeBufExecution::onEncode(const std::vector& inputs, const mUnits.resize(1); auto &unit = mUnits[0]; auto openCLBackend = static_cast(backend()); - auto runtime = openCLBackend->getOpenCLRuntime(); - unit.kernel = runtime->buildKernel("range_buf", "range_buf", mBuildOptions, inputs[0], outputs[0]); - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); - + auto runtime = openCLBackend->getOpenCLRuntime(); std::vector outputShape = tensorShapeFormat(outputs[0]); - - int batch = outputShape.at(0); - int outputHeight = outputShape.at(1); - int outputWidth = outputShape.at(2); - int channels = outputShape.at(3); - int channelBlocks = (channels + 3) / 4; - + int totalSize = outputShape[0] * outputShape[1] * outputShape[2] * outputShape[3]; mGlobalWorkSize = { - static_cast(outputWidth), - static_cast(outputHeight), - static_cast(batch * channelBlocks) + static_cast(UP_DIV(totalSize, 4)), + static_cast(1) }; + std::set buildOption = mBuildOptions; + if((totalSize % 4) != 0){ + buildOption.emplace("-DPACK_LEAVE"); + } + unit.kernel = runtime->buildKernel("range_buf", "range_buf", buildOption, inputs[0], outputs[0]); + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputs[0])); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputs[2])); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputs[0])); - ret |= 
unit.kernel->get().setArg(idx++, outputWidth); - ret |= unit.kernel->get().setArg(idx++, outputHeight); - ret |= unit.kernel->get().setArg(idx++, channels); - ret |= unit.kernel->get().setArg(idx++, channelBlocks); + ret |= unit.kernel->get().setArg(idx++, totalSize); MNN_CHECK_CL_SUCCESS(ret, "setArg RangeBufExecution"); std::string kernelName = "range_buf"; - mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; - openCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; + mLocalSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; + openCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalSize[0], mLocalSize[1]}; return NO_ERROR; } diff --git a/source/backend/opencl/execution/buffer/RasterBufExecution.cpp b/source/backend/opencl/execution/buffer/RasterBufExecution.cpp index d663a6c9f..8db39af02 100644 --- a/source/backend/opencl/execution/buffer/RasterBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/RasterBufExecution.cpp @@ -36,24 +36,42 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, } auto des = TensorUtils::getDescribe(output); auto outputDes = TensorUtils::getDescribe(output); - mNeedZero = !TensorUtils::regionIsFull(output); auto regionNum = des->regions.size(); auto mOpenCLBackend = static_cast(backend()); auto runtime = mOpenCLBackend->getOpenCLRuntime(); - - bool cancombine = CanCombine(outputs); + int kernel_idx = 0; + auto outputShape = tensorShapeFormat(output); + mFast = false; + if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + mFast = true; + for (int i=0; i< des->regions.size(); ++i) { + auto& slice = des->regions[i]; + if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mFast = false; + break; + } + if (!OpCommonUtils::canBlitFast(slice, output, 4, true)) { + mFast = false; + break; + } + } + } + mNeedZero = !TensorUtils::regionIsFull(output); + mNeedZero = mNeedZero || ((outputShape[3] % 4) != 0 && MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat && !mFast); + bool cancombine = CanCombine(outputs) && (!mFast); if(cancombine){ regionNum = 1; } - int kernel_idx = 0; mUnits.resize(regionNum); - auto outputShape = tensorShapeFormat(output); - if(mNeedZero || (outputShape[3] % 4) != 0) + if(mNeedZero) { mUnits.resize(regionNum + 1); - int region[] = {outputShape[0], ROUND_UP(outputShape[3], 4), outputShape[1], outputShape[2]};//nhwc + int region[] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//nchw + if(MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat){ + region[1] = ROUND_UP(outputShape[3], 4); + } Unit &unit = mUnits[kernel_idx++]; - unit.kernel = runtime->buildKernel("raster", "buffer_set_zero", {}, output, output); + unit.kernel = runtime->buildKernel("raster_buf", "buffer_set_zero", {}, output, output); unit.localWorkSize = {8, 8}; unit.globalWorkSize = {(uint32_t)UP_DIV((region[2] * region[3]), 8)*8, (uint32_t)UP_DIV((region[0] * region[1]), 8)*8}; @@ -73,6 +91,64 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, mOpenCLBackend->recordKernel2d(unit.kernel, {(uint32_t)UP_DIV((region[2] * region[3]), 
8)*8, (uint32_t)UP_DIV((region[0] * region[1]), 8)*8}, {8, 8}); } + if(mFast) + { + // nc4hw4 buffer raster + for (auto& slice : des->regions) + { + auto origin = slice.origin; + auto inputShape = tensorShapeFormat(origin); + Tensor::InsideDescribe::Region C4Region; + OpCommonUtils::turnToPackRegion(slice, C4Region, output, 4, true); + Unit &unit = mUnits[kernel_idx++]; + unit.kernel = runtime->buildKernel("raster_buf", "raster_nc4hw4_buffer", {}, origin, output); + + const std::vector gws = {(uint32_t)C4Region.size[2], + (uint32_t)C4Region.size[1], + (uint32_t)C4Region.size[0]}; + uint32_t mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + + auto outputShape = tensorShapeFormat(output); + auto sliceShape = tensorShapeFormat(slice.origin); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, gws[0]); + ret |= unit.kernel->get().setArg(idx++, gws[1]); + ret |= unit.kernel->get().setArg(idx++, gws[2]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(slice.origin)); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.offset); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[0]); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[1]); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[2]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[1]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[2]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[3]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.offset); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[0]); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[1]); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[2]); + ret |= unit.kernel->get().setArg(idx++, outputShape[1]); + ret |= unit.kernel->get().setArg(idx++, outputShape[2]); + ret |= unit.kernel->get().setArg(idx++, outputShape[3]); + if(ret != CL_SUCCESS) + { + MNN_PRINT("setArg err %d\n", (int)ret); + } + std::string name = "raster_nc4hw4_buffer"; + const std::vector lws = localWS3DDefault(gws, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, unit.kernel).first; + + unit.localWorkSize = {lws[0], lws[1], lws[2]}; + + unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), + ROUND_UP(gws[1], std::max((uint32_t)1, lws[1])), + ROUND_UP(gws[2], std::max((uint32_t)1, lws[2]))}; + mOpenCLBackend->recordKernel3d(unit.kernel, gws, lws); + } + return NO_ERROR; + } + if(cancombine){ auto regions = des->regions; auto slice = regions[0]; @@ -82,17 +158,11 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, std::set buildOptions; auto origin = slice.origin; auto inputShape = tensorShapeFormat(origin); - if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) - { - buildOptions.emplace(" -DINPUT_DATA_FORMAT_NHWC"); - } - if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC)//nhwc buffer to Image - { - buildOptions.emplace(" -DOUTPUT_DATA_FORMAT_NHWC"); - } + buildOptions.emplace("-DINPUT_FORMAT=" + std::to_string(TensorUtils::getDescribe(origin)->dimensionFormat)); + buildOptions.emplace("-DOUTPUT_FORMAT=" + std::to_string(outputDes->dimensionFormat)); Unit &unit = mUnits[kernel_idx++]; - unit.kernel = runtime->buildKernel("raster_buf", "raster_direct_buffer", buildOptions, output, output); + unit.kernel = runtime->buildKernel("raster_buf", "raster_direct_buffer", buildOptions, origin, output); const 
std::vector gws = {(uint32_t)slice.size[2] * nums, (uint32_t)slice.size[1], @@ -114,6 +184,7 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, ret |= unit.kernel->get().setArg(idx++, inputShape[2]); ret |= unit.kernel->get().setArg(idx++, inputShape[1]); ret |= unit.kernel->get().setArg(idx++, inputShape[3]); + ret |= unit.kernel->get().setArg(idx++, inputShape[0]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, slice.dst.offset); ret |= unit.kernel->get().setArg(idx++, dst_offset); @@ -123,6 +194,7 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, ret |= unit.kernel->get().setArg(idx++, outputShape[2]); ret |= unit.kernel->get().setArg(idx++, outputShape[1]); ret |= unit.kernel->get().setArg(idx++, outputShape[3]); + ret |= unit.kernel->get().setArg(idx++, outputShape[0]); if(ret != CL_SUCCESS) { MNN_PRINT("setArg err %d\n", (int)ret); @@ -141,18 +213,11 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, auto inputShape = tensorShapeFormat(origin); int src_offset = 0; int dst_offset = 0; - if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) - { - buildOptions.emplace(" -DINPUT_DATA_FORMAT_NHWC"); - } - if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC)//nhwc buffer to Image - { - buildOptions.emplace(" -DOUTPUT_DATA_FORMAT_NHWC"); - } + buildOptions.emplace("-DINPUT_FORMAT=" + std::to_string(TensorUtils::getDescribe(origin)->dimensionFormat)); + buildOptions.emplace("-DOUTPUT_FORMAT=" + std::to_string(outputDes->dimensionFormat)); Unit &unit = mUnits[kernel_idx++]; - unit.kernel = runtime->buildKernel("raster_buf", "raster_direct_buffer", buildOptions, output, output); - + unit.kernel = runtime->buildKernel("raster_buf", "raster_direct_buffer", buildOptions, origin, output); const std::vector gws = {(uint32_t)slice.size[2], (uint32_t)slice.size[1], (uint32_t)slice.size[0]}; @@ -173,6 +238,7 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, ret |= unit.kernel->get().setArg(idx++, inputShape[2]); ret |= unit.kernel->get().setArg(idx++, inputShape[1]); ret |= unit.kernel->get().setArg(idx++, inputShape[3]); + ret |= unit.kernel->get().setArg(idx++, inputShape[0]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); ret |= unit.kernel->get().setArg(idx++, slice.dst.offset); ret |= unit.kernel->get().setArg(idx++, dst_offset); @@ -182,6 +248,7 @@ ErrorCode RasterBufExecution::onEncode(const std::vector &____inputs, ret |= unit.kernel->get().setArg(idx++, outputShape[2]); ret |= unit.kernel->get().setArg(idx++, outputShape[1]); ret |= unit.kernel->get().setArg(idx++, outputShape[3]); + ret |= unit.kernel->get().setArg(idx++, outputShape[0]); if(ret != CL_SUCCESS) { MNN_PRINT("setArg err %d\n", (int)ret); diff --git a/source/backend/opencl/execution/buffer/ReductionBufExecution.cpp b/source/backend/opencl/execution/buffer/ReductionBufExecution.cpp index 83fc56474..bc1760d34 100644 --- a/source/backend/opencl/execution/buffer/ReductionBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ReductionBufExecution.cpp @@ -23,25 +23,31 @@ ReductionBufExecution::ReductionBufExecution(const std::vector &inputs mAxis = op->main_as_ReductionParam()->dim()->data()[0]; switch (op->main_as_ReductionParam()->operation()) { case ReductionType_MEAN: - mReductType = 0; + mBuildOptions.emplace("-DOPERATE(a,b)=(a+b)"); + mBuildOptions.emplace("-DGET_AVG"); + mBuildOptions.emplace("-DVALUE=0"); break; case 
ReductionType_MAXIMUM: - mReductType = 1; + mBuildOptions.emplace("-DOPERATE(a,b)=max(a,b)"); + mBuildOptions.emplace("-DVALUE=-FLT_MAX"); break; case ReductionType_MINIMUM: - mReductType = 2; + mBuildOptions.emplace("-DOPERATE(a,b)=min(a,b)"); + mBuildOptions.emplace("-DVALUE=FLT_MAX"); break; case ReductionType_PROD: - mReductType = 3; + mBuildOptions.emplace("-DOPERATE(a,b)=(a*b)"); + mBuildOptions.emplace("-DVALUE=1"); break; case ReductionType_SUM: - mReductType = 4; + mBuildOptions.emplace("-DOPERATE(a,b)=(a+b)"); + mBuildOptions.emplace("-DVALUE=0"); break; default: MNN_ASSERT(false); break; } - auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("reduction_buf", "reduct_width_buf", {"-DOPERATE(a,b)=(a+b)","-DVALUE=0","-DLOCAL_SIZE=512"}, inputs[0], outputs[0]); + auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("reduction_buf", "reduct_buf", {"-DOPERATE(a,b)=(a+b)","-DVALUE=0","-DLOCAL_SIZE=512"}, inputs[0], outputs[0]); mMaxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel)); #ifdef LOG_VERBOSE MNN_PRINT("end ReductionBufExecution init !\n"); @@ -76,102 +82,24 @@ ErrorCode ReductionBufExecution::onEncode(const std::vector &inputs, c inside *= input->length(i); } int dim = input->length(mAxis); - int local_size = 0; - if(dim >= 16){ - mUseLocal = true; - } - - std::vector inputShape = tensorShapeFormat(input); - std::vector outputShape = tensorShapeFormat(output); - - int batch = inputShape.at(0); - int inputHeight = inputShape.at(1); - int inputWidth = inputShape.at(2); - int inputChannels = inputShape.at(3); - int inputChannelBlocks = (inputChannels + 3) / 4; - int outputBatch = outputShape.at(0); - int outputHeight = outputShape.at(1); - int outputWidth = outputShape.at(2); - int outputChannels = outputShape.at(3); - int outputChannelBlocks = (outputChannels + 3) / 4; - - std::set buildOption; - switch (mReductType) { - case 0: - buildOption.emplace("-DOPERATE(a,b)=(a+b)"); - buildOption.emplace("-DGET_AVG"); - buildOption.emplace("-DVALUE=0"); - break; - case 1: - buildOption.emplace("-DOPERATE(a,b)=max(a,b)"); - buildOption.emplace("-DVALUE=-FLT_MAX"); - break; - case 2: - buildOption.emplace("-DOPERATE(a,b)=min(a,b)"); - buildOption.emplace("-DVALUE=FLT_MAX"); - break; - case 3: - buildOption.emplace("-DOPERATE(a,b)=(a*b)"); - buildOption.emplace("-DVALUE=1"); - break; - case 4: - buildOption.emplace("-DOPERATE(a,b)=(a+b)"); - buildOption.emplace("-DVALUE=0"); - break; - default: - MNN_ASSERT(false); - break; + int localSize = getLocalSize(dim, MaxLocalSize); + if(localSize < 4){ + localSize = 1; } - mGlobalWorkSize = { - static_cast(outputWidth), - static_cast(outputHeight), - static_cast(outputBatch * outputChannelBlocks) - }; - - if(mUseLocal){ - if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){ - local_size = getLocalSize(inputWidth, MaxLocalSize); - buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption, input, output); - }else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){ - local_size = getLocalSize(inputHeight, MaxLocalSize); - buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption, input, output); - }else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){ - local_size = getLocalSize(inputChannelBlocks - 
1, MaxLocalSize); - buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - if(output->buffer().dimensions == 1){ - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption, input, output); - }else{ - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption, input, output); - } - mGlobalWorkSize[2] = static_cast(outputBatch * outputChannels); - }else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){ - local_size = getLocalSize(batch, MaxLocalSize); - buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size)); - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption, input, output); - } - mGlobalWorkSize[0] *= local_size; - }else{ - buildOption.emplace("-DLOCAL_SIZE=0"); - if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){ - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption, input, output); - }else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){ - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption, input, output); - }else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){ - if(output->buffer().dimensions == 1){ - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption, input, output); - }else{ - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption, input, output); - } - mGlobalWorkSize[2] = static_cast(outputBatch * outputChannels); - }else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){ - unit.kernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption, input, output); - } + std::set buildOptions = mBuildOptions; + buildOptions.emplace("-DREDUCT_LOCAL_SIZE=" + std::to_string(localSize)); + std::string kernelName; + if(inside % 4 == 0){ + unit.kernel = runtime->buildKernel("reduction_buf", "reduct_v4_buf", buildOptions, input, output); + mGlobalWorkSize = {static_cast(localSize), static_cast(UP_DIV(inside, 4)), static_cast(outside)}; + }else { + unit.kernel = runtime->buildKernel("reduction_buf", "reduct_buf", buildOptions, input, output); + mGlobalWorkSize = {static_cast(localSize), static_cast(inside), static_cast(outside)}; } - //printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal); + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mLocalWorkSize = {(uint32_t)(localSize), 1, 1}; mUnits.resize(1); uint32_t idx = 0; @@ -181,20 +109,12 @@ ErrorCode ReductionBufExecution::onEncode(const std::vector &inputs, c ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, inputWidth); - ret |= unit.kernel->get().setArg(idx++, inputHeight); - ret |= unit.kernel->get().setArg(idx++, inputChannels); - ret |= unit.kernel->get().setArg(idx++, batch); - ret |= unit.kernel->get().setArg(idx++, inputChannelBlocks); - ret |= unit.kernel->get().setArg(idx++, outputWidth); - ret |= unit.kernel->get().setArg(idx++, outputHeight); - ret |= unit.kernel->get().setArg(idx++, outputChannels); - ret |= unit.kernel->get().setArg(idx++, outputChannelBlocks); + ret |= 
unit.kernel->get().setArg(idx++, inside); + ret |= unit.kernel->get().setArg(idx++, outside); + ret |= unit.kernel->get().setArg(idx++, dim); MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionBufExecution"); - if(mUseLocal){ - mLocalWorkSize = {static_cast(local_size), 1, 1}; - }else{ + if(localSize == 1){ mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); std::string kernelName = "reduct_buf"; mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; diff --git a/source/backend/opencl/execution/buffer/ReductionBufExecution.hpp b/source/backend/opencl/execution/buffer/ReductionBufExecution.hpp index fb1d78172..091617b82 100644 --- a/source/backend/opencl/execution/buffer/ReductionBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/ReductionBufExecution.hpp @@ -26,12 +26,11 @@ class ReductionBufExecution : public CommonExecution { int getLocalSize(int size, int maxGroupSize); OpenCLBackend *mOpenCLBackend; MNN::DataType mdataType; - int mReductType; int mAxis; std::vector mGlobalWorkSize = {1, 1, 1}; std::vector mLocalWorkSize{1, 1, 1}; - bool mUseLocal = false; uint32_t mMaxWorkGroupSize; + std::set mBuildOptions; }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/ReluBufExecution.cpp b/source/backend/opencl/execution/buffer/ReluBufExecution.cpp index 6d1b9ee3d..b268f8dc4 100644 --- a/source/backend/opencl/execution/buffer/ReluBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ReluBufExecution.cpp @@ -61,7 +61,7 @@ ErrorCode ReluBufExecution::onEncode(const std::vector &inputs, const int nhwcArray[4] = {nhwc[0], nhwc[1], nhwc[2], UP_DIV(nhwc[3], 4)}; auto imageWidth = nhwc[0] * UP_DIV(nhwc[3], 4); auto imageHeight = nhwc[1] * nhwc[2]; - + std::vector localSize = {1, 1}; std::vector globalSize = {(uint32_t)imageWidth, (uint32_t)imageHeight}; @@ -71,7 +71,10 @@ ErrorCode ReluBufExecution::onEncode(const std::vector &inputs, const return SubgrouponResize(inputs, outputs); } #endif /* MNN_SUPPORT_INTEL_SUBGROUP */ - mUnits[0].kernel = runTime->buildKernel("binary_buf", "prelu_buf", {"-DOPERATOR=select(in0*in1,in0,in0>=(float4)0)"}, inputs[0], outputs[0]); + + std::set buildOption; + buildOption.emplace("-DOPERATOR=select(in0*in1,in0,in0>=(float4)0)"); + mUnits[0].kernel = runTime->buildKernel("binary_buf", "prelu_buf", buildOption, inputs[0], outputs[0]); mMaxWorkGroupSize = static_cast(runTime->getMaxWorkGroupSize(mUnits[0].kernel)); int fullCount[2] = {1, 1}; diff --git a/source/backend/opencl/execution/buffer/ScaleBufExecution.cpp b/source/backend/opencl/execution/buffer/ScaleBufExecution.cpp index 43ce99a58..764ea8c95 100644 --- a/source/backend/opencl/execution/buffer/ScaleBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ScaleBufExecution.cpp @@ -93,14 +93,10 @@ ScaleBufExecution::ScaleBufExecution(const std::vector &inputs, const } openclBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(biasBuffer, biasPtrCL); - buildOptions.emplace("-DBIAS"); + mBuildOptions.emplace("-DBIAS"); mHasBias = true; } - auto runtime = mOpenCLBackend->getOpenCLRuntime(); - unit.kernel = runtime->buildKernel("scale_buf", "scale_buf", buildOptions); - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); - #ifdef LOG_VERBOSE MNN_PRINT("end ScaleBufExecution init !\n"); #endif @@ -122,13 +118,15 @@ ErrorCode ScaleBufExecution::onEncode(const std::vector &inputs, const const int height = inputShape.at(1); const int width = 
inputShape.at(2); const int channels = inputShape.at(3); - + const int inside = width * height; const int channelBlocks = UP_DIV(channels, 4); - mGlobalWorkSize = {static_cast(width * channelBlocks), - static_cast(height * batch)}; + std::set buildOptions = mBuildOptions; + unit.kernel = runtime->buildKernel("scale_buf", "scale_buf", buildOptions); + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); - int shape[4] = {batch, height, width, channelBlocks}; + mGlobalWorkSize = {static_cast(inside), + static_cast(channelBlocks * batch)}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); @@ -139,7 +137,9 @@ ErrorCode ScaleBufExecution::onEncode(const std::vector &inputs, const ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mBias.get())); } ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputs[0])); - ret |= unit.kernel->get().setArg(idx++, shape); + ret |= unit.kernel->get().setArg(idx++, channelBlocks); + ret |= unit.kernel->get().setArg(idx++, batch); + ret |= unit.kernel->get().setArg(idx++, inside); MNN_CHECK_CL_SUCCESS(ret, "setArg ScaleBufExecution"); std::string name = "scale_buf"; diff --git a/source/backend/opencl/execution/buffer/ScaleBufExecution.hpp b/source/backend/opencl/execution/buffer/ScaleBufExecution.hpp index b01897bf3..f288a6a73 100644 --- a/source/backend/opencl/execution/buffer/ScaleBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/ScaleBufExecution.hpp @@ -31,6 +31,7 @@ class ScaleBufExecution : public CommonExecution { std::vector mLocalWorkSize{1, 1, 1}; OpenCLBackend *mOpenCLBackend; bool mHasBias = false; + std::set mBuildOptions; }; } // namespace OpenCL diff --git a/source/backend/opencl/execution/buffer/SelectBufExecution.cpp b/source/backend/opencl/execution/buffer/SelectBufExecution.cpp index 385c853a4..94a2f934a 100644 --- a/source/backend/opencl/execution/buffer/SelectBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/SelectBufExecution.cpp @@ -34,13 +34,12 @@ ErrorCode SelectBufExecution::onEncode(const std::vector& inputs, const mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); std::vector outputShape = tensorShapeFormat(outputs[0]); - - int batch = outputShape.at(0); - int outputHeight = outputShape.at(1); - int outputWidth = outputShape.at(2); - int channels = outputShape.at(3); - int channelBlocks = (channels + 3) / 4; - int outSize = batch * channelBlocks * outputWidth * outputHeight * 4; + int outSize = 0; + if(MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(outputs[0])->dimensionFormat){ + outSize = outputShape[0] * outputShape[1] * outputShape[2] * ROUND_UP(outputShape[3], 4); + }else{ + outSize = outputShape[0] * outputShape[1] * outputShape[2] * outputShape[3]; + } mGlobalWorkSize = { static_cast(outSize), diff --git a/source/backend/opencl/execution/buffer/SelfAttentionBufExecution.cpp b/source/backend/opencl/execution/buffer/SelfAttentionBufExecution.cpp index bc7aba4ef..6f4a3a97f 100644 --- a/source/backend/opencl/execution/buffer/SelfAttentionBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/SelfAttentionBufExecution.cpp @@ -150,6 +150,7 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorget().setArg(index++, seq_len); ret |= mKernel_split[seq_idx]->get().setArg(index++, mNumHead); ret |= mKernel_split[seq_idx]->get().setArg(index++, mHeadDim); + ret |= mKernel_split[seq_idx]->get().setArg(index++, batch); ret |= mKernel_split[seq_idx]->get().setArg(index++, 
seq_idx); MNN_CHECK_CL_SUCCESS(ret, "setArg split_transpose_qkv"); mLocalWorkSizeSplit[seq_idx] = localWS3DDefault(mGlobalWorkSizeSplit[seq_idx], maxWorkGroupSize, runtime, "split_transpose_qkv", mKernel_split[seq_idx]).first; @@ -216,6 +217,10 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorget().setArg(idx++, static_cast(e_pack)); @@ -224,11 +229,11 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorget().setArg(idx++, alpha); ret |= mKernel_qk[seq_idx]->get().setArg(idx++, beta); ret |= mKernel_qk[seq_idx]->get().setArg(idx++, openCLBuffer(mTempQ.get())); - ret |= mKernel_qk[seq_idx]->get().setArg(idx++, batch_offset_a); ret |= mKernel_qk[seq_idx]->get().setArg(idx++, openCLBuffer(mTempK.get())); - ret |= mKernel_qk[seq_idx]->get().setArg(idx++, batch_offset_b); ret |= mKernel_qk[seq_idx]->get().setArg(idx++, openCLBuffer(mTempQK.get())); - ret |= mKernel_qk[seq_idx]->get().setArg(idx++, batch_offset_c); + ret |= mKernel_qk[seq_idx]->get().setArg(idx++, batch_offset); + ret |= mKernel_qk[seq_idx]->get().setArg(idx++, stride); + ret |= mKernel_qk[seq_idx]->get().setArg(idx++, group); MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qk Kernel"); mOpenCLBackend->recordKernel3d(mKernel_qk[seq_idx], mGlobalWorkSizeQk[seq_idx], mLocalWorkSizeQk[seq_idx]); @@ -283,6 +288,9 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorget().setArg(index++, mGlobalWorkSizeTrans[seq_idx][0]); + ret |= mKernel_trans[seq_idx]->get().setArg(index++, mGlobalWorkSizeTrans[seq_idx][1]); + ret |= mKernel_trans[seq_idx]->get().setArg(index++, mGlobalWorkSizeTrans[seq_idx][2]); ret |= mKernel_trans[seq_idx]->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); ret |= mKernel_trans[seq_idx]->get().setArg(index++, openCLBuffer(mTempTrans.get())); ret |= mKernel_trans[seq_idx]->get().setArg(index++, loop); @@ -291,6 +299,10 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorgetOpenCLRuntime(), "trans_3d_buf", mKernel_trans[seq_idx]).first; + mGlobalWorkSizeTrans[seq_idx][0] = ROUND_UP(mGlobalWorkSizeTrans[seq_idx][0], std::max((uint32_t)1, mLocalWorkSizeTrans[seq_idx][0])); + mGlobalWorkSizeTrans[seq_idx][1] = ROUND_UP(mGlobalWorkSizeTrans[seq_idx][1], std::max((uint32_t)1, mLocalWorkSizeTrans[seq_idx][1])); + mGlobalWorkSizeTrans[seq_idx][2] = ROUND_UP(mGlobalWorkSizeTrans[seq_idx][2], std::max((uint32_t)1, mLocalWorkSizeTrans[seq_idx][2])); + mOpenCLBackend->recordKernel3d(mKernel_trans[seq_idx], mGlobalWorkSizeTrans[seq_idx], mLocalWorkSizeTrans[seq_idx]); } @@ -361,6 +373,10 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorget().setArg(idx++, static_cast(e_pack)); @@ -369,11 +385,11 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorget().setArg(idx++, alpha); ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, beta); ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, openCLBuffer(mTempTrans.get())); - ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, batch_offset_a); ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, openCLBuffer(mTempV.get())); - ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, batch_offset_b); ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, openCLBuffer(mTempQKV.get())); - ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, batch_offset_c); + ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, batch_offset); + ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, stride); + ret |= mKernel_qkv[seq_idx]->get().setArg(idx++, 
group); MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qkv Kernel"); mOpenCLBackend->recordKernel3d(mKernel_qkv[seq_idx], mGlobalWorkSizeQkv[seq_idx], mLocalWorkSizeQkv[seq_idx]); } @@ -403,6 +419,7 @@ ErrorCode SelfAttentionBufImpl::onResize(Backend *backend, const std::vectorget().setArg(index++, seq_len_piece); ret |= mKernel_clip[seq_idx]->get().setArg(index++, mNumHead); ret |= mKernel_clip[seq_idx]->get().setArg(index++, mHeadDim); + ret |= mKernel_clip[seq_idx]->get().setArg(index++, batch); ret |= mKernel_clip[seq_idx]->get().setArg(index++, seq_idx); mLocalWorkSizeClip[seq_idx] = localWS3DDefault(mGlobalWorkSizeClip[seq_idx], maxWorkGroupSize, runtime, "clip_transpose_qkv", mKernel_clip[seq_idx]).first; diff --git a/source/backend/opencl/execution/buffer/SoftmaxBufExecution.cpp b/source/backend/opencl/execution/buffer/SoftmaxBufExecution.cpp index fa2dc9216..a01c91832 100644 --- a/source/backend/opencl/execution/buffer/SoftmaxBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/SoftmaxBufExecution.cpp @@ -17,27 +17,10 @@ SoftmaxBufExecution::SoftmaxBufExecution(const std::vector &inputs, in : CommonExecution(backend, Op) { mAxis = axis; mOpenCLBackend = static_cast(backend); - auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("softmax_buf", "softmax_channel", {"-DSOFTMAX_LOCAL_SIZE=512"}); + auto kernel = mOpenCLBackend->getOpenCLRuntime()->buildKernel("softmax_buf", "softmax_buf", {"-DSOFTMAX_LOCAL_SIZE=512"}); mMaxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel)); } -bool SoftmaxBufExecution::buildSoftmaxKernel(int localSize) { - auto runtime = mOpenCLBackend->getOpenCLRuntime(); - std::set buildOptions; - buildOptions.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize)); - std::string kernelName; - if (mAxis == 1) { - mUnits[0].kernel = runtime->buildKernel("softmax_buf", "softmax_channel", buildOptions); - } else if (mAxis == 2) { - mUnits[0].kernel = runtime->buildKernel("softmax_buf", "softmax_height", buildOptions); - } else { - MNN_ASSERT(mAxis == 3); - mUnits[0].kernel = runtime->buildKernel("softmax_buf", "softmax_width", buildOptions); - } - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mUnits[0].kernel)); - return true; -} - int SoftmaxBufExecution::getLocalSize(int size, int maxGroupSize){ int local_size = 1; while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){ @@ -47,8 +30,7 @@ int SoftmaxBufExecution::getLocalSize(int size, int maxGroupSize){ } ErrorCode SoftmaxBufExecution::onEncode(const std::vector &inputs, const std::vector &outputs) { - mUnits.resize(1); - auto &unit = mUnits[0]; + mUnits.clear(); Tensor *input = inputs[0]; Tensor *output = outputs[0]; @@ -57,6 +39,18 @@ ErrorCode SoftmaxBufExecution::onEncode(const std::vector &inputs, con auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast(256)); + const auto layout = TensorUtils::getDescribe(input)->dimensionFormat; + mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4; + if (mNeedUnpackC4) { + int totalSize = 1; + for (int i = 1; i < dims; ++i) { + totalSize *= input->length(i); + } + mTempTensor.reset(Tensor::createDevice({totalSize})); + mOpenCLBackend->onAcquireBuffer(mTempTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempTensor.get(), Backend::DYNAMIC); + } + int inside = 1; int outside = 1; int channel = 1; @@ -67,62 +61,123 @@ ErrorCode SoftmaxBufExecution::onEncode(const std::vector &inputs, con for (int i = mAxis + 1; 
i < dims; ++i) { inside *= input->length(i); } - - std::vector inputShape = tensorShapeFormat(input); - std::vector outputShape = tensorShapeFormat(output); - - const int inputBatch = inputShape.at(0); - const int inputHeight = inputShape.at(1); - const int inputWidth = inputShape.at(2); - const int inputChannels = inputShape.at(3); - const int outputBatch = outputShape.at(0); - const int outputHeight = outputShape.at(1); - const int outputWidth = outputShape.at(2); - const int outputChannels = outputShape.at(3); - - const int channelBlocks = UP_DIV(outputChannels, 4); - const int remainChannels = channelBlocks * 4 - outputChannels; - int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth}; - int localSize = getLocalSize(channel, MaxLocalSize); - if(localSize < 4){ - localSize = 1; - } - if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){ - mAxis = 1; - mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)outputWidth, (uint32_t)outputHeight * outputBatch}; - localSize = getLocalSize(channelBlocks, MaxLocalSize); - }else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){ - mAxis = 2; - mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks*outputWidth, (uint32_t)outputBatch}; - }else if(inputBatch * inputChannels * inputHeight == outside && channel == inputWidth && inside == 1){ - mAxis = 3; - mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks, (uint32_t)outputBatch*outputHeight}; + // NC4HW4 -> NCHW + if(mNeedUnpackC4){ + Unit unit; + std::vector outputShape = tensorShapeFormat(input); + int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W + std::set buildOptions; + buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4"); + buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NCHW"); + unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, input, output); + mGlobalWorkSize = {static_cast(shape[2] * shape[3]), static_cast(shape[1]), static_cast(shape[0])}; + cl_int ret = CL_SUCCESS; + uint32_t idx = 0; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); + ret |= unit.kernel->get().setArg(idx++, sizeof(shape), shape); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); + MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer"); + + const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mLocalWorkSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1}; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + mUnits.emplace_back(unit); } -// printf("softmax: %d %d %d %d, %d\n", inputBatch, inputChannels, inputHeight, inputWidth, mAxis); - buildSoftmaxKernel(localSize); - - cl_int ret = CL_SUCCESS; - mLocalWorkSize = {(uint32_t)(localSize), 1, 1}; + // softmax + { + Unit unit; + int localSize = getLocalSize(channel, MaxLocalSize); + if(localSize < 4){ + localSize = 1; + } + std::set buildOptions = mBuildOptions; + buildOptions.emplace("-DARGMAX_LOCAL_SIZE=" + std::to_string(localSize)); + std::string kernelName; + if(inside == 1){ + 
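For reference, the inside/outside/channel decomposition used by the reworked softmax path above (and by the reduction and argmax kernels earlier in this patch) is the usual axis split: everything before the reduced axis is folded into outside, the axis itself becomes channel, and everything after it becomes inside. A minimal host-side sketch of that split, assuming a plain int shape vector instead of MNN's Tensor API (the helper name splitAroundAxis is illustrative, not part of the patch):

    #include <cstdio>
    #include <vector>

    // Split a shape around `axis` the way the OpenCL softmax/reduction kernels expect:
    // outside = prod(dims[0..axis)), channel = dims[axis], inside = prod(dims(axis..end]).
    // The kernel then addresses element (o, c, i) at offset o * channel * inside + c * inside + i.
    static void splitAroundAxis(const std::vector<int>& dims, int axis,
                                int& outside, int& channel, int& inside) {
        outside = 1;
        inside  = 1;
        for (int i = 0; i < axis; ++i) {
            outside *= dims[i];
        }
        channel = dims[axis];
        for (int i = axis + 1; i < (int)dims.size(); ++i) {
            inside *= dims[i];
        }
    }

    int main() {
        int outside = 0, channel = 0, inside = 0;
        splitAroundAxis({2, 8, 7, 7}, /*axis=*/1, outside, channel, inside); // softmax over axis 1
        std::printf("outside=%d channel=%d inside=%d\n", outside, channel, inside); // 2 8 49
        return 0;
    }

This also explains the kernel selection in the hunk: when inside == 1 the reduced axis is contiguous and the softmax_inside path applies, and when inside is a multiple of 4 a vectorized variant can load float4 along the inside dimension.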
buildOptions.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize)); + unit.kernel = runtime->buildKernel("self_attention_buf", "softmax_inside", buildOptions, inputs[0], outputs[0]); + mGlobalWorkSize = {static_cast(localSize), static_cast(outside), static_cast(1)}; + } + else if(inside % 4 == 0){ + unit.kernel = runtime->buildKernel("softmax_buf", "softmax_v4_buf", buildOptions); + mGlobalWorkSize = {static_cast(localSize), static_cast(UP_DIV(inside, 4)), static_cast(outside)}; + }else { + unit.kernel = runtime->buildKernel("softmax_buf", "softmax_buf", buildOptions); + mGlobalWorkSize = {static_cast(localSize), static_cast(inside), static_cast(outside)}; + } + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mLocalWorkSize = {(uint32_t)(localSize), 1, 1}; + + cl_int ret = CL_SUCCESS; + + uint32_t idx = 0; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + if(mNeedUnpackC4){ + ret |= unit.kernel->get().setArg(idx++, openCLImage(output)); + ret |= unit.kernel->get().setArg(idx++, openCLImage(mTempTensor.get())); + }else{ + ret |= unit.kernel->get().setArg(idx++, openCLImage(input)); + ret |= unit.kernel->get().setArg(idx++, openCLImage(output)); + } + if(inside == 1){ + ret |= unit.kernel->get().setArg(idx++, channel); + int shape[4] = {1, outside, channel, 1}; + ret |= unit.kernel->get().setArg(idx++, shape); + } else { + ret |= unit.kernel->get().setArg(idx++, inside); + ret |= unit.kernel->get().setArg(idx++, outside); + ret |= unit.kernel->get().setArg(idx++, channel); + } + MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution"); + if(localSize == 1){ + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "softmax_buf", unit.kernel).first; + } + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + mUnits.emplace_back(unit); + } - uint32_t idx = 0; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); - - ret |= unit.kernel->get().setArg(idx++, openCLImage(input)); - ret |= unit.kernel->get().setArg(idx++, openCLImage(output)); - ret |= unit.kernel->get().setArg(idx++, remainChannels); - ret |= unit.kernel->get().setArg(idx++, shape); - MNN_CHECK_CL_SUCCESS(ret, "setArg SoftmaxBufExecution"); - if(localSize == 1){ - mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "softmax_buf", unit.kernel).first; + // NCHW -> NC4HW4 + if(mNeedUnpackC4){ + Unit unit; + std::vector outputShape = tensorShapeFormat(output); + int shape[4] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//N C H W + std::set buildOptions; + buildOptions.emplace("-DINPUT_FORMAT=MNN_DATA_FORMAT_NCHW"); + buildOptions.emplace("-DOUTPUT_FORMAT=MNN_DATA_FORMAT_NC4HW4"); + unit.kernel = runtime->buildKernel("buffer_convert_buf", "buffer_convert_to_buffer", buildOptions, input, output); + mGlobalWorkSize = {static_cast(shape[2] * shape[3]), static_cast(shape[1]), static_cast(shape[0])}; + cl_int ret = CL_SUCCESS; + uint32_t idx = 0; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= 
unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(mTempTensor.get())); + ret |= unit.kernel->get().setArg(idx++, sizeof(shape), shape); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); + MNN_CHECK_CL_SUCCESS(ret, "setArg buffer_convert_to_buffer"); + + const uint32_t maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mLocalWorkSize = {16, std::max((uint32_t)1, maxWorkGroupSize / 16), 1}; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + mUnits.emplace_back(unit); } - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; return NO_ERROR; } diff --git a/source/backend/opencl/execution/buffer/SoftmaxBufExecution.hpp b/source/backend/opencl/execution/buffer/SoftmaxBufExecution.hpp index 4385bae7d..e6d154ffa 100644 --- a/source/backend/opencl/execution/buffer/SoftmaxBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/SoftmaxBufExecution.hpp @@ -22,8 +22,6 @@ class SoftmaxBufExecution : public CommonExecution { virtual ~SoftmaxBufExecution() = default; virtual ErrorCode onEncode(const std::vector &inputs, const std::vector &outputs) override; - - bool buildSoftmaxKernel(int localSize); private: int getLocalSize(int size, int maxGroupSize); uint32_t mMaxWorkGroupSize; @@ -31,6 +29,9 @@ class SoftmaxBufExecution : public CommonExecution { std::vector mGlobalWorkSize{1, 1, 1}; std::vector mLocalWorkSize{1, 1, 1, 1}; int mAxis; + std::set mBuildOptions; + std::shared_ptr mTempTensor; + bool mNeedUnpackC4; }; } // namespace OpenCL } // namespace MNN diff --git a/source/backend/opencl/execution/buffer/SplitGeluBufExecution.cpp b/source/backend/opencl/execution/buffer/SplitGeluBufExecution.cpp index 0baee6428..78d72de88 100644 --- a/source/backend/opencl/execution/buffer/SplitGeluBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/SplitGeluBufExecution.cpp @@ -39,7 +39,10 @@ ErrorCode SplitGeluBufExecution::onEncode(const std::vector& inputs, co buildOptions.emplace("-DDOUBLE_INPUTS"); } int pack_wh = 1; - if(shape[2] % 4 == 0) { + if(shape[2] % 16 == 0) { + pack_wh = 16; + buildOptions.emplace("-DWH_16"); + } else if(shape[2] % 4 == 0) { pack_wh = 4; buildOptions.emplace("-DWH_4"); } @@ -49,15 +52,13 @@ ErrorCode SplitGeluBufExecution::onEncode(const std::vector& inputs, co auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); - mGWS = {static_cast(shape[0]), - static_cast(UP_DIV(shape[1], 4)), - static_cast(UP_DIV(shape[2],pack_wh))}; + mGWS = {static_cast(UP_DIV(shape[2], pack_wh)), + static_cast(shape[0] * shape[1])}; uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGWS[0]); ret |= unit.kernel->get().setArg(idx++, mGWS[1]); - ret |= unit.kernel->get().setArg(idx++, mGWS[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); if(inputs.size() > 1) { ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputs[1])); @@ -67,12 +68,12 @@ ErrorCode SplitGeluBufExecution::onEncode(const std::vector& inputs, co MNN_CHECK_CL_SUCCESS(ret, "setArg SplitGeluBufExecution"); 
- mLWS = localWS3DDefault(mGWS, maxWorkGroupSize, runtime, "splitgelu_buf", unit.kernel).first; + mLWS = localWS2DDefault(mGWS, maxWorkGroupSize, runtime, "splitgelu_buf", unit.kernel).first; - unit.globalWorkSize = {mGWS[0], mGWS[1], mGWS[2]}; - unit.localWorkSize = {mLWS[0], mLWS[1], mLWS[2]}; + unit.globalWorkSize = {mGWS[0], mGWS[1]}; + unit.localWorkSize = {mLWS[0], mLWS[1]}; - mOpenCLBackend->recordKernel3d(unit.kernel, mGWS, mLWS); + mOpenCLBackend->recordKernel2d(unit.kernel, mGWS, mLWS); mOpenCLBackend->endRecord(mRecording); return NO_ERROR; diff --git a/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp index ff1bddda1..501e6f1ae 100644 --- a/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp +++ b/source/backend/opencl/execution/buffer/StrassenMatmulOpenCLComputor.cpp @@ -210,6 +210,19 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons return NO_ERROR; } + +static int getMaxMultiple(int number) { + if(number % 128 == 0) { + return 128; + } else if(number % 64 == 0) { + return 64; + } else if(number % 32 == 0) { + return 32; + } else if(number % 16 == 0) { + return 16; + } + return 1; +} ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const MatrixInfo& AT, const MatrixInfo& BT, const MatrixInfo& CT, const MatrixInfo& COT, int currentDepth, int postType) { @@ -244,6 +257,14 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat return res; } + // sub_matrix cannot own sufficient tile + if(getMaxMultiple(e) != getMaxMultiple(eSub) || getMaxMultiple(h) != getMaxMultiple(eSub) || (lSub % 4 != 0)) { + Unit unit; + auto res = _generateBasicMatMul(e, l, h, AT, BT, CT, COT, postType, unit); + mUnits.emplace_back(unit); + return res; + } + // Strassen Construct currentDepth += 1; diff --git a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp index 75897d2fe..56a1c9027 100644 --- a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp @@ -22,54 +22,47 @@ ErrorCode UnaryBufExecution::onEncode(const std::vector& inputs, const Tensor* output = outputs[0]; auto openCLBackend = static_cast(backend()); auto runtime = openCLBackend->getOpenCLRuntime(); - - auto dataType = inputs[0]->getType(); std::set buildOptions = mBuildOptions; - if (dataType.code == halide_type_int){ - buildOptions.emplace("-DOPENCL_INPUT_INT"); - } #ifdef MNN_SUPPORT_INTEL_SUBGROUP - if (runtime->isSupportedIntelSubgroup()) { + if (runtime->isSupportedIntelSubgroup() && MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(output)->dimensionFormat) { return SubgrouponResize(inputs, outputs); } #endif /* MNN_SUPPORT_INTEL_SUBGROUP */ - unit.kernel = runtime->buildKernel("unary_buf", "unary_buf", buildOptions, input, output); - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); - std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); - - int batch = outputShape.at(0); - int outputHeight = outputShape.at(1); - int outputWidth = outputShape.at(2); - int channels = outputShape.at(3); - - int channelBlocks = (channels + 3) / 4; + int totalSize = 0; + if(MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(output)->dimensionFormat){ + totalSize = outputShape[0] * outputShape[1] * outputShape[2] * 
ROUND_UP(outputShape[3], 4); + }else{ + totalSize = outputShape[0] * outputShape[1] * outputShape[2] * outputShape[3]; + } + if(totalSize % 4 != 0) { + buildOptions.emplace("-DPACK_LEAVE"); + } + unit.kernel = runtime->buildKernel("unary_buf", "unary_buf", buildOptions, input, output); + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); mGlobalWorkSize = { - static_cast(channelBlocks), - static_cast(outputWidth), - static_cast(batch * outputHeight), + static_cast(UP_DIV(totalSize, 4)), + static_cast(1) }; uint32_t idx = 0; cl_int ret = CL_SUCCESS; ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(input)); ret |= unit.kernel->get().setArg(idx++, openCLBuffer(output)); - ret |= unit.kernel->get().setArg(idx++, outputHeight); + ret |= unit.kernel->get().setArg(idx++, totalSize); MNN_CHECK_CL_SUCCESS(ret, "setArg UnaryBufExecution"); std::string kernelName = "unary_buf"; - mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; - openCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalSize[0], mLocalSize[1], mLocalSize[2]}; + mLocalSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; + openCLBackend->recordKernel2d(unit.kernel, mGlobalWorkSize, mLocalSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; + unit.localWorkSize = {mLocalSize[0], mLocalSize[1]}; return NO_ERROR; } - #ifdef MNN_SUPPORT_INTEL_SUBGROUP ErrorCode UnaryBufExecution::SubgrouponResize(const std::vector& inputs, const std::vector& outputs) { auto &unit = mUnits[0]; @@ -162,6 +155,7 @@ ErrorCode UnaryBufExecution::SubgrouponResize(const std::vector& inputs ret |= unit.kernel->get().setArg(idx++, outputWidth); ret |= unit.kernel->get().setArg(idx++, outputHeight); ret |= unit.kernel->get().setArg(idx++, channels); + ret |= unit.kernel->get().setArg(idx++, batch); ret |= unit.kernel->get().setArg(idx++, static_cast(inputpad.left)); ret |= unit.kernel->get().setArg(idx++, static_cast(inputpad.right)); ret |= unit.kernel->get().setArg(idx++, static_cast(outputpad.left)); @@ -187,7 +181,8 @@ class UnaryBufCreator : public OpenCLBackend::Creator { const MNN::Op* op, Backend* backend) const override { for (int i = 0; i < inputs.size(); ++i) { int channel = inputs[i]->channel(); - if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup()) { + if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup() + && MNN::MNN_DATA_FORMAT_NC4HW4 == TensorUtils::getDescribe(inputs[i])->dimensionFormat) { TensorUtils::setTensorChannelPack(inputs[i], 16); } } diff --git a/source/backend/opencl/execution/cl/argmax_buf.cl b/source/backend/opencl/execution/cl/argmax_buf.cl index 5240eea4e..f6a675070 100644 --- a/source/backend/opencl/execution/cl/argmax_buf.cl +++ b/source/backend/opencl/execution/cl/argmax_buf.cl @@ -22,219 +22,37 @@ __private const int global_size_dim0, __private const int global_size_dim1, __pr if(A.z > B.z){ A.z = B.z; C.z = D; } \ if(A.w > B.w){ A.w = B.w; C.w = D; } -__kernel void argmax_width_buf(GLOBAL_SIZE_3_DIMS - __global const FLOAT* input, - __global 
int* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { - const int x = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_channel_idx = get_global_id(2); - DEAL_NON_UNIFORM_DIM3(x, height_idx, batch_channel_idx); - - const int batch_idx = batch_channel_idx / outputChannelBlock; - const int channel_idx = batch_channel_idx % outputChannelBlock; - - const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4; - const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4; - int4 index = 0; -#ifdef ARGMAX - FLOAT4 maxValue = (FLOAT4)-FLT_MAX; -#else - FLOAT4 maxValue = (FLOAT4)FLT_MAX; -#endif -#if ARGMAX_LOCAL_SIZE >= 4 - int lid = get_local_id(0); - FLOAT4 local reduce[ARGMAX_LOCAL_SIZE]; - int4 local index_reduce[ARGMAX_LOCAL_SIZE]; - - for (int i=lid; i < inputWidth; i+=ARGMAX_LOCAL_SIZE) { - FLOAT4 value = vload4(i, input + offset); -#ifdef ARGMAX - ARGMAX_SELECT(maxValue, value, index, i); -#else - ARGMIN_SELECT(maxValue, value, index, i); -#endif - } - reduce[lid] = maxValue; - index_reduce[lid] = index; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = ARGMAX_LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i){ -#ifdef ARGMAX - if(reduce[lid].x < reduce[lid + i].x){reduce[lid].x = reduce[lid + i].x; index_reduce[lid].x = index_reduce[lid + i].x;} - if(reduce[lid].y < reduce[lid + i].y){reduce[lid].y = reduce[lid + i].y; index_reduce[lid].y = index_reduce[lid + i].y;} - if(reduce[lid].z < reduce[lid + i].z){reduce[lid].z = reduce[lid + i].z; index_reduce[lid].z = index_reduce[lid + i].z;} - if(reduce[lid].w < reduce[lid + i].w){reduce[lid].w = reduce[lid + i].w; index_reduce[lid].w = index_reduce[lid + i].w;} -#else - if(reduce[lid].x > reduce[lid + i].x){reduce[lid].x = reduce[lid + i].x; index_reduce[lid].x = index_reduce[lid + i].x;} - if(reduce[lid].y > reduce[lid + i].y){reduce[lid].y = reduce[lid + i].y; index_reduce[lid].y = index_reduce[lid + i].y;} - if(reduce[lid].z > reduce[lid + i].z){reduce[lid].z = reduce[lid + i].z; index_reduce[lid].z = index_reduce[lid + i].z;} - if(reduce[lid].w > reduce[lid + i].w){reduce[lid].w = reduce[lid + i].w; index_reduce[lid].w = index_reduce[lid + i].w;} -#endif - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0){ - vstore4(index_reduce[0], 0, output + outputOffset); - } -#else - for(int i = 0; i < inputWidth; ++i){ - FLOAT4 value = vload4(i, input + offset); -#ifdef ARGMAX - ARGMAX_SELECT(maxValue, value, index, i); -#else - ARGMIN_SELECT(maxValue, value, index, i); -#endif - } - vstore4(index, 0, output + outputOffset); -#endif -} - - -__kernel void argmax_height_buf(GLOBAL_SIZE_3_DIMS - __global const FLOAT* input, - __global int* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { +__kernel void argmax_buf(GLOBAL_SIZE_3_DIMS + __global const FLOAT* input, + __global int* output, + __private const int inside, + __private const int 
outside, + __private const int dim){ const int x = get_global_id(0); - const int width_idx = get_global_id(1); - const int batch_channel_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(x, width_idx, batch_channel_idx); - - const int batch_idx = batch_channel_idx / outputChannelBlock; - const int channel_idx = batch_channel_idx % outputChannelBlock; - - const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4; - const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4; - int4 index = 0; -#ifdef ARGMAX - FLOAT4 maxValue = (FLOAT4)-FLT_MAX; -#else - FLOAT4 maxValue = (FLOAT4)FLT_MAX; -#endif -#if ARGMAX_LOCAL_SIZE >= 4 - int lid = get_local_id(0); - FLOAT4 local reduce[ARGMAX_LOCAL_SIZE]; - int4 local index_reduce[ARGMAX_LOCAL_SIZE]; + const int y = get_global_id(1); // inside + const int z = get_global_id(2); // outside - for (int i=lid; i < inputHeight; i+=ARGMAX_LOCAL_SIZE) { - FLOAT4 value = vload4(i * inputWidth, input + offset); -#ifdef ARGMAX - ARGMAX_SELECT(maxValue, value, index, i); -#else - ARGMIN_SELECT(maxValue, value, index, i); -#endif - } - reduce[lid] = maxValue; - index_reduce[lid] = index; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = ARGMAX_LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i){ -#ifdef ARGMAX - if(reduce[lid].x < reduce[lid + i].x){reduce[lid].x = reduce[lid + i].x; index_reduce[lid].x = index_reduce[lid + i].x;} - if(reduce[lid].y < reduce[lid + i].y){reduce[lid].y = reduce[lid + i].y; index_reduce[lid].y = index_reduce[lid + i].y;} - if(reduce[lid].z < reduce[lid + i].z){reduce[lid].z = reduce[lid + i].z; index_reduce[lid].z = index_reduce[lid + i].z;} - if(reduce[lid].w < reduce[lid + i].w){reduce[lid].w = reduce[lid + i].w; index_reduce[lid].w = index_reduce[lid + i].w;} -#else - if(reduce[lid].x > reduce[lid + i].x){reduce[lid].x = reduce[lid + i].x; index_reduce[lid].x = index_reduce[lid + i].x;} - if(reduce[lid].y > reduce[lid + i].y){reduce[lid].y = reduce[lid + i].y; index_reduce[lid].y = index_reduce[lid + i].y;} - if(reduce[lid].z > reduce[lid + i].z){reduce[lid].z = reduce[lid + i].z; index_reduce[lid].z = index_reduce[lid + i].z;} - if(reduce[lid].w > reduce[lid + i].w){reduce[lid].w = reduce[lid + i].w; index_reduce[lid].w = index_reduce[lid + i].w;} -#endif - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0){ - vstore4(index_reduce[0], 0, output + outputOffset); - } -#else - for(int i = 0; i < inputHeight; ++i){ - FLOAT4 value = vload4(i * inputWidth, input + offset); -#ifdef ARGMAX - ARGMAX_SELECT(maxValue, value, index, i); -#else - ARGMIN_SELECT(maxValue, value, index, i); -#endif - } - vstore4(index, 0, output + outputOffset); -#endif -} - -__kernel void argmax_channel_buf(GLOBAL_SIZE_3_DIMS - __global const FLOAT* input, - __global int* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { - const int x = get_global_id(0); - const int wh = get_global_id(1); - const int batch_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(x, wh, batch_idx); - - const int width_idx = wh % oututWidth; - const int height_idx = wh / oututWidth; - const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + 
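The rewritten argmax_buf kernel above replaces the per-width/height/channel/batch variants with one layout-agnostic reduction over an (outside, dim, inside) view. A scalar reference of the indexing it implements, with one logical work-item per (outside, inside) pair striding through dim by inside (argmaxRef is a hypothetical helper for illustration, assuming row-major float input):

    #include <cfloat>
    #include <vector>

    // Scalar reference for argmax over the middle dimension of an (outside, dim, inside)
    // view, matching the kernel's offset = z * dim * inside + y and its stride of `inside`
    // between successive elements along the reduced axis.
    static std::vector<int> argmaxRef(const std::vector<float>& input,
                                      int outside, int dim, int inside) {
        std::vector<int> output(outside * inside, 0);
        for (int z = 0; z < outside; ++z) {
            for (int y = 0; y < inside; ++y) {
                const int offset = z * dim * inside + y;
                float maxValue = -FLT_MAX;
                int index = 0;
                for (int i = 0; i < dim; ++i) {
                    const float value = input[offset + i * inside];
                    if (value > maxValue) { maxValue = value; index = i; }
                }
                output[z * inside + y] = index;
            }
        }
        return output;
    }

The argmax_v4_buf variant that follows does the same reduction but handles four adjacent inside positions per work-item with float4 loads, which is presumably why it is only dispatched when inside is a multiple of 4.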
width_idx)*4; -#ifdef ARGMAX_CHANNEL_DIM1 - const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx); -#else - const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4; -#endif - int remain = inputChannel - (inputChannelBlock - 1) * 4; + DEAL_NON_UNIFORM_DIM3(x, y, z); + int index = 0; #ifdef ARGMAX FLOAT maxValue = (FLOAT)-FLT_MAX; #else - FLOAT maxValue = (FLOAT)FLT_MAX; +FLOAT maxValue = (FLOAT)FLT_MAX; #endif - int index = 0; - FLOAT4 value; - FLOAT *valuePtr = (FLOAT*)&value; + const int offset = z * dim * inside + y; #if ARGMAX_LOCAL_SIZE >= 4 int lid = get_local_id(0); FLOAT local reduce[ARGMAX_LOCAL_SIZE]; int local index_reduce[ARGMAX_LOCAL_SIZE]; - - for (int i=lid; i < inputChannelBlock - 1; i+=ARGMAX_LOCAL_SIZE) { - value = vload4(i * inputWidth * inputHeight, input + offset); - for(int j = 0; j < 4; ++j){ + + for (int i=lid; i < dim; i+=ARGMAX_LOCAL_SIZE) { + FLOAT value = input[offset + i * inside]; #ifdef ARGMAX - if(maxValue < valuePtr[j]){ - index = i * 4 + j; - maxValue = valuePtr[j]; - } + if(maxValue < value){ maxValue = value; index = i; } #else - if(maxValue > valuePtr[j]){ - index = i * 4 + j; - maxValue = valuePtr[j]; - } + if(maxValue > value){ maxValue = value; index = i; } #endif - } } reduce[lid] = maxValue; index_reduce[lid] = index; @@ -250,96 +68,47 @@ __kernel void argmax_channel_buf(GLOBAL_SIZE_3_DIMS barrier(CLK_LOCAL_MEM_FENCE); } if(lid == 0){ - maxValue = reduce[lid]; - index = index_reduce[lid]; - value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset); - for(int j = 0; j < remain; ++j){ -#ifdef ARGMAX - if(maxValue < valuePtr[j]){ - index = (inputChannelBlock - 1) * 4 + j; - maxValue = valuePtr[j]; - } -#else - if(maxValue > valuePtr[j]){ - index = (inputChannelBlock - 1) * 4 + j; - maxValue = valuePtr[j]; - } -#endif - } - output[outputOffset] = index; + output[z * inside + y] = index_reduce[0]; } #else - for(int i = 0; i < inputChannelBlock - 1; ++i){ - value = vload4(i * inputWidth * inputHeight, input + offset); - for(int j = 0; j < 4; ++j){ -#ifdef ARGMAX - if(maxValue < valuePtr[j]){ - index = i * 4 + j; - maxValue = valuePtr[j]; - } -#else - if(maxValue > valuePtr[j]){ - index = i * 4 + j; - maxValue = valuePtr[j]; - } -#endif - } - } - value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset); - for(int j = 0; j < remain; ++j){ + for(int i = 0; i < dim; ++i){ + FLOAT value = input[ + offset + i * inside]; #ifdef ARGMAX - if(maxValue < valuePtr[j]){ - index = (inputChannelBlock - 1) * 4 + j; - maxValue = valuePtr[j]; - } + if(maxValue < value){ maxValue = value; index = i; } #else - if(maxValue > valuePtr[j]){ - index = (inputChannelBlock - 1) * 4 + j; - maxValue = valuePtr[j]; - } + if(maxValue > value){ maxValue = value; index = i; } #endif } - output[outputOffset] = index; + output[z * inside + y] = index; #endif } -__kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS - __global const FLOAT* input, - __global int* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { - const int x = get_global_id(0); - const int wh = get_global_id(1); - const int channel_idx = get_global_id(2); - DEAL_NON_UNIFORM_DIM3(x, wh, 
channel_idx); +__kernel void argmax_v4_buf(GLOBAL_SIZE_3_DIMS + __global const FLOAT* input, + __global int* output, + __private const int inside, + __private const int outside, + __private const int dim){ + const int x = get_global_id(0); + const int y = get_global_id(1) << 2; // inside + const int z = get_global_id(2); // outside - const int width_idx = wh % oututWidth; - const int height_idx = wh / oututWidth; - const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4; - const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4; + DEAL_NON_UNIFORM_DIM3(x, y, z); int4 index = 0; - int batchOffset = inputChannelBlock * inputHeight * inputWidth; #ifdef ARGMAX FLOAT4 maxValue = (FLOAT4)-FLT_MAX; #else FLOAT4 maxValue = (FLOAT4)FLT_MAX; #endif + const int offset = z * dim * inside + y; #if ARGMAX_LOCAL_SIZE >= 4 int lid = get_local_id(0); FLOAT4 local reduce[ARGMAX_LOCAL_SIZE]; int4 local index_reduce[ARGMAX_LOCAL_SIZE]; - - for (int i=lid; i < inputBatch; i+=ARGMAX_LOCAL_SIZE) { - FLOAT4 value = vload4(i * batchOffset, input + offset); + + for (int i=lid; i < dim; i+=ARGMAX_LOCAL_SIZE) { + FLOAT4 value = vload4(0, input + offset + i * inside); #ifdef ARGMAX ARGMAX_SELECT(maxValue, value, index, i); #else @@ -366,17 +135,17 @@ __kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS barrier(CLK_LOCAL_MEM_FENCE); } if(lid == 0){ - vstore4(index_reduce[0], 0, output + outputOffset); + vstore4(index_reduce[0], 0, output + z * inside + y); } #else - for(int i = 0; i < inputBatch; ++i){ - FLOAT4 value = vload4(i * batchOffset, input + offset); + for(int i = 0; i < dim; ++i){ + FLOAT4 value = vload4(0, input + offset + i * inside); #ifdef ARGMAX ARGMAX_SELECT(maxValue, value, index, i); #else ARGMIN_SELECT(maxValue, value, index, i); #endif } - vstore4(index, 0, output + outputOffset); + vstore4(index, 0, output + z * inside + y); #endif } diff --git a/source/backend/opencl/execution/cl/attention_buf.cl b/source/backend/opencl/execution/cl/attention_buf.cl index c17be5a4f..074956902 100644 --- a/source/backend/opencl/execution/cl/attention_buf.cl +++ b/source/backend/opencl/execution/cl/attention_buf.cl @@ -10,359 +10,784 @@ return; \ } +#define DEAL_OUTER_SEQLEN_NOT_ALIGN(length) \ + if(4 * sl + 3 >= length) {\ + temp_3 = (FLOAT4)0;\ + }\ + if(4 * sl + 2 >= length) {\ + temp_2 = (FLOAT4)0;\ + }\ + if(4 * sl + 1 >= length) {\ + temp_1 = (FLOAT4)0;\ + } + +#define DEAL_INNER_HEADDIM_NOT_ALIGN(length) \ + if(hd * 4 + 3 >= length) {\ + temp_0.w = (FLOAT)0;\ + temp_1.w = (FLOAT)0;\ + temp_2.w = (FLOAT)0;\ + temp_3.w = (FLOAT)0;\ + }\ + if(hd * 4 + 2 >= length) {\ + temp_0.z = (FLOAT)0;\ + temp_1.z = (FLOAT)0;\ + temp_2.z = (FLOAT)0;\ + temp_3.z = (FLOAT)0;\ + }\ + if(hd * 4 + 1 >= length) {\ + temp_0.y = (FLOAT)0;\ + temp_1.y = (FLOAT)0;\ + temp_2.y = (FLOAT)0;\ + temp_3.y = (FLOAT)0;\ + } + + + +__kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS + __global const FLOAT *input_q, //[batch, seqLenQ/4, headNum, headDim, seqLenQ_4] + __global const FLOAT *input_k, // [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] + __global const FLOAT *input_v, // [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] + __global FLOAT *output_q, // [batch*headNum, ROUND_UP(headDim, mTileHDK), ROUND_UP(seqLenQ, mTileQ)] + __global FLOAT *output_k, // [batch*headNum/group, ROUND_UP(headDim, mTileHDK), ROUND_UP(seqLenKV, mTileKV)] + __global FLOAT *output_v, // [batch*headNum/group, 
ROUND_UP(seqLenKV, mTileKV), ROUND_UP(headDim, mTileHDN)] + __global FLOAT *past_k, // [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] + __global FLOAT *past_v, // [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] + __private const int4 tile, // [mTileQ, mTileKV, mTileHDK, mTileHDN] + __private const int4 shape,// [seqLenQ, seqLenKV, headNum, headDim] + __private const int4 param // [group, batch] +) { + const int sl = get_global_id(0); // seqLen/4 : max(seqLenPackQ/4, seqLenPackKV/4) + const int hd = get_global_id(1); // headDim/4 : max(headDimPackQK/4, headDimPackV/4) + const int z = get_global_id(2); // batch * headNum + DEAL_NON_UNIFORM_DIM3(sl, hd, z); + + const int seqLenQ = shape.x; + const int seqLenKV = shape.y; + const int headNum = shape.z; + const int headDim = shape.w; + const int group = param.x; + const int batch = param.y; + + const int b = z % batch; + const int hn = z / batch; + + const int seqLenQ_4 = (seqLenQ + 3) / 4; + //const int in_offset_q = (((b * seqLenQ_4 + sl) * headNum + hn) * headDim + 4 * hd) * 4; + const int in_offset_q = (((b * seqLenQ + sl * 4) * headNum + hn) * headDim + 4 * hd); + + const int seqLenPackQ = ((seqLenQ + tile.x - 1) / tile.x) * tile.x; + const int headDimPackQK = ((headDim + tile.z - 1) / tile.z) * tile.z; + const int out_offset_q = (((b * headNum + hn) * headDimPackQK + hd * 4) * seqLenPackQ + sl * 4); + + if(sl * 4 < seqLenPackQ && hd * 4 < headDimPackQK) { + if(sl * 4 >= seqLenQ || hd * 4 >= headDim) { + vstore4((FLOAT4)0, 0, output_q + out_offset_q); + vstore4((FLOAT4)0, 0, output_q + out_offset_q + seqLenPackQ); + vstore4((FLOAT4)0, 0, output_q + out_offset_q + 2 * seqLenPackQ); + vstore4((FLOAT4)0, 0, output_q + out_offset_q + 3 * seqLenPackQ); + } else { + FLOAT4 temp_0 = vload4(0, input_q + in_offset_q); + FLOAT4 temp_1 = (sl * 4 + 1 >= seqLenQ) ? (FLOAT4)0 : vload4(0, input_q + in_offset_q + headNum*headDim); + FLOAT4 temp_2 = (sl * 4 + 2 >= seqLenQ) ? (FLOAT4)0 : vload4(0, input_q + in_offset_q + 2*headNum*headDim); + FLOAT4 temp_3 = (sl * 4 + 3 >= seqLenQ) ? 
(FLOAT4)0 : vload4(0, input_q + in_offset_q + 3*headNum*headDim); + #ifdef HEADDIM_LEAVE + DEAL_INNER_HEADDIM_NOT_ALIGN(headDim) + #endif + #ifdef SEQLEN_LEAVE + DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenQ) + #endif + vstore4((FLOAT4)(temp_0.s0, temp_1.s0, temp_2.s0, temp_3.s0), 0, output_q + out_offset_q); + vstore4((FLOAT4)(temp_0.s1, temp_1.s1, temp_2.s1, temp_3.s1), 0, output_q + out_offset_q + seqLenPackQ); + vstore4((FLOAT4)(temp_0.s2, temp_1.s2, temp_2.s2, temp_3.s2), 0, output_q + out_offset_q + 2 * seqLenPackQ); + vstore4((FLOAT4)(temp_0.s3, temp_1.s3, temp_2.s3, temp_3.s3), 0, output_q + out_offset_q + 3 * seqLenPackQ); + } + } + + if(hn >= headNum / group) { + return; + } + + + const int seqLenPackKV = ((seqLenKV + tile.y - 1) / tile.y) * tile.y; + const int headDimPackV = ((headDim + tile.w - 1) / tile.w) * tile.w; + const int seqLenKV_4 = (seqLenKV + 3) / 4; + const int in_offset_kv = (((b * seqLenKV + sl*4) * headNum/group + hn) * headDim + 4 * hd); + + if(sl * 4 < seqLenPackKV && hd * 4 < headDimPackQK) { + const int out_offset_k = (((b * headNum/group + hn) * headDimPackQK + hd * 4) * seqLenPackKV + sl * 4); + + if(sl * 4 >= seqLenKV || hd * 4 >= headDim) { + vstore4((FLOAT4)0, 0, output_k + out_offset_k); + vstore4((FLOAT4)0, 0, output_k + out_offset_k + seqLenPackKV); + vstore4((FLOAT4)0, 0, output_k + out_offset_k + 2 * seqLenPackKV); + vstore4((FLOAT4)0, 0, output_k + out_offset_k + 3 * seqLenPackKV); + } else { + FLOAT4 temp_0 = vload4(0, input_k + in_offset_kv); + FLOAT4 temp_1 = (sl * 4 + 1 >= seqLenKV) ? (FLOAT4)0 : vload4(0, input_k + in_offset_kv + headNum*headDim/group); + FLOAT4 temp_2 = (sl * 4 + 2 >= seqLenKV) ? (FLOAT4)0 : vload4(0, input_k + in_offset_kv + 2*headNum*headDim/group); + FLOAT4 temp_3 = (sl * 4 + 3 >= seqLenKV) ? (FLOAT4)0 : vload4(0, input_k + in_offset_kv + 3*headNum*headDim/group); + #ifdef HEADDIM_LEAVE + DEAL_INNER_HEADDIM_NOT_ALIGN(headDim) + #endif + #ifdef SEQLEN_LEAVE + DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenKV) + #endif + vstore4((FLOAT4)(temp_0.s0, temp_1.s0, temp_2.s0, temp_3.s0), 0, output_k + out_offset_k); + vstore4((FLOAT4)(temp_0.s1, temp_1.s1, temp_2.s1, temp_3.s1), 0, output_k + out_offset_k + seqLenPackKV); + vstore4((FLOAT4)(temp_0.s2, temp_1.s2, temp_2.s2, temp_3.s2), 0, output_k + out_offset_k + 2 * seqLenPackKV); + vstore4((FLOAT4)(temp_0.s3, temp_1.s3, temp_2.s3, temp_3.s3), 0, output_k + out_offset_k + 3 * seqLenPackKV); + + // pastK + vstore4(temp_0, 0, past_k + in_offset_kv); + if(sl * 4 + 1 < seqLenKV) { + vstore4(temp_1, 0, past_k + in_offset_kv + headNum*headDim/group); + } + if(sl * 4 + 2 < seqLenKV) { + vstore4(temp_2, 0, past_k + in_offset_kv + 2*headNum*headDim/group); + } + if(sl * 4 + 3 < seqLenKV) { + vstore4(temp_3, 0, past_k + in_offset_kv + 3*headNum*headDim/group); + } + } + + } + + if(sl * 4 < seqLenPackKV && hd * 4 < headDimPackV) { + const int out_offset_v = (((b * headNum/group + hn) * seqLenPackKV + sl * 4) * headDimPackV + hd * 4); + + if(sl * 4 >= seqLenKV || hd * 4 >= headDim) { + vstore4((FLOAT4)0, 0, output_v + out_offset_v); + vstore4((FLOAT4)0, 0, output_v + out_offset_v + headDimPackV); + vstore4((FLOAT4)0, 0, output_v + out_offset_v + 2 * headDimPackV); + vstore4((FLOAT4)0, 0, output_v + out_offset_v + 3 * headDimPackV); + } else { + FLOAT4 temp_0 = vload4(0, input_v + in_offset_kv); + FLOAT4 temp_1 = (sl * 4 + 1 >= seqLenKV) ? (FLOAT4)0 : vload4(0, input_v + in_offset_kv + headNum*headDim/group); + FLOAT4 temp_2 = (sl * 4 + 2 >= seqLenKV) ? 
(FLOAT4)0 : vload4(0, input_v + in_offset_kv + 2*headNum*headDim/group); + FLOAT4 temp_3 = (sl * 4 + 3 >= seqLenKV) ? (FLOAT4)0 : vload4(0, input_v + in_offset_kv + 3*headNum*headDim/group); + #ifdef HEADDIM_LEAVE + DEAL_INNER_HEADDIM_NOT_ALIGN(headDim) + #endif + #ifdef SEQLEN_LEAVE + DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenKV) + #endif + vstore4(temp_0, 0, output_v + out_offset_v); + vstore4(temp_1, 0, output_v + out_offset_v + headDimPackV); + vstore4(temp_2, 0, output_v + out_offset_v + 2 * headDimPackV); + vstore4(temp_3, 0, output_v + out_offset_v + 3 * headDimPackV); + + // pastV + vstore4(temp_0, 0, past_v + in_offset_kv); + if(sl * 4 + 1 < seqLenKV) { + vstore4(temp_1, 0, past_v + in_offset_kv + headNum*headDim/group); + } + if(sl * 4 + 2 < seqLenKV) { + vstore4(temp_2, 0, past_v + in_offset_kv + 2*headNum*headDim/group); + } + if(sl * 4 + 3 < seqLenKV) { + vstore4(temp_3, 0, past_v + in_offset_kv + 3*headNum*headDim/group); + } + } + + } +} + +#ifndef MASK_DTYPE +#define MASK_DTYPE FLOAT +#define MASK_DTYPE4 FLOAT4 +#endif +__kernel void rearrange_mask(GLOBAL_SIZE_3_DIMS + __global const MASK_DTYPE *input_mask, // [batch, 1, seqLenQ, seqLenKV, 4] + __global MASK_DTYPE *output_mask, // [batch, ROUND_UP(seqLenQ, mTileQ), ROUND_UP(seqLenKV, mTileKV)] + const int4 shape // [seqLenQ, seqLenKV, mTileQ, mTileKV] +) { + const int sl = get_global_id(0); // seqLen_4 + const int sl_kv = get_global_id(1); // seqLenKV_4 + const int b = get_global_id(2); // Batch + DEAL_NON_UNIFORM_DIM3(sl, sl_kv, b); + + const int seq_len_pack = ((shape.x + shape.z - 1) / shape.z) * shape.z; + const int seq_len_kv_pack = ((shape.y + shape.w - 1) / shape.w) * shape.w; + + int in_offset = ((b * shape.x + sl * 4) * shape.y + sl_kv * 4); + int out_offset = (b * seq_len_pack + sl * 4) * seq_len_kv_pack + sl_kv * 4; + + if(sl * 4 >= shape.x || sl_kv * 4 >= shape.y) { + vstore4((MASK_DTYPE4)0, 0, output_mask + out_offset); + vstore4((MASK_DTYPE4)0, 0, output_mask + out_offset + seq_len_kv_pack); + vstore4((MASK_DTYPE4)0, 0, output_mask + out_offset + seq_len_kv_pack * 2); + vstore4((MASK_DTYPE4)0, 0, output_mask + out_offset + seq_len_kv_pack * 3); + } else { + int y_down_align4 = (shape.y / 4 * 4); + MASK_DTYPE4 temp_0, temp_1, temp_2, temp_3; + + if(sl_kv * 4 < y_down_align4) { + temp_0 = vload4(0, input_mask + in_offset); + temp_1 = (sl * 4 + 1 >= shape.x) ? (MASK_DTYPE4)0 : vload4(0, input_mask + in_offset + shape.y); + temp_2 = (sl * 4 + 2 >= shape.x) ? (MASK_DTYPE4)0 : vload4(0, input_mask + in_offset + shape.y * 2); + temp_3 = (sl * 4 + 3 >= shape.x) ? (MASK_DTYPE4)0 : vload4(0, input_mask + in_offset + shape.y * 3); + } else if(sl_kv * 4 + 1 == shape.y){ + temp_0 = (MASK_DTYPE4)(input_mask[in_offset], 0, 0, 0); + temp_1 = (sl * 4 + 1 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y], 0, 0, 0);//vload4(0, input_mask + in_offset + shape.y); + temp_2 = (sl * 4 + 2 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y*2], 0, 0, 0);//vload4(0, input_mask + in_offset + shape.y * 2); + temp_3 = (sl * 4 + 3 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y*3], 0, 0, 0);//vload4(0, input_mask + in_offset + shape.y * 3); + } else if(sl_kv * 4 + 2 == shape.y){ + temp_0 = (MASK_DTYPE4)(input_mask[in_offset], input_mask[in_offset+1], 0, 0); + temp_1 = (sl * 4 + 1 >= shape.x) ? 
(MASK_DTYPE4)0 : (FLOAT4)(input_mask[in_offset + shape.y], input_mask[in_offset + shape.y + 1], 0, 0);//vload4(0, input_mask + in_offset + shape.y); + temp_2 = (sl * 4 + 2 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y*2], input_mask[in_offset + shape.y*2 + 1], 0, 0);//vload4(0, input_mask + in_offset + shape.y * 2); + temp_3 = (sl * 4 + 3 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y*3], input_mask[in_offset + shape.y*3 + 1], 0, 0);//vload4(0, input_mask + in_offset + shape.y * 3); + } else if(sl_kv * 4 + 3 == shape.y){ + temp_0 = (MASK_DTYPE4)(input_mask[in_offset], input_mask[in_offset+1], input_mask[in_offset+2], 0); + temp_1 = (sl * 4 + 1 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y], input_mask[in_offset + shape.y + 1], input_mask[in_offset + shape.y + 2], 0);//vload4(0, input_mask + in_offset + shape.y); + temp_2 = (sl * 4 + 2 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y*2], input_mask[in_offset + shape.y*2 + 1], input_mask[in_offset + shape.y*2 + 2], 0);//vload4(0, input_mask + in_offset + shape.y * 2); + temp_3 = (sl * 4 + 3 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset + shape.y*3], input_mask[in_offset + shape.y*3 + 1], input_mask[in_offset + shape.y*3 + 2], 0);//vload4(0, input_mask + in_offset + shape.y * 3); + } + + vstore4(temp_0, 0, output_mask + out_offset); + vstore4(temp_1, 0, output_mask + out_offset + seq_len_kv_pack); + vstore4(temp_2, 0, output_mask + out_offset + 2 * seq_len_kv_pack); + vstore4(temp_3, 0, output_mask + out_offset + 3 * seq_len_kv_pack); + } + +} + +__kernel void qkv_transpose_output(GLOBAL_SIZE_3_DIMS + __global const FLOAT *input, // [Batch * mNumHead, ROUND_UP(mHeadDim, mTileHDN), ROUND_UP(seqLen, mTileQ)] + __global FLOAT *output, // [Batch, seqLen/4, mNumHead, mHeadDim, 4] + __private const int tile_q, + __private const int tile_hdn, + __private const int seq_len, + __private const int head_num, + __private const int head_dim +) { + + const int sl = get_global_id(0); // seqLen_4 + const int hd = get_global_id(1); // mHeadDim_4 + const int z = get_global_id(2); // Batch * mNumHead + DEAL_NON_UNIFORM_DIM3(sl, hd, z); + + const int b = z / head_num; + const int hn = z % head_num; + + const int seq_len_pack = ((seq_len + tile_q - 1) / tile_q) * tile_q; + const int head_dim_pack = ((head_dim + tile_hdn - 1) / tile_hdn) * tile_hdn; + + const int offset_inp = ((b * head_num + hn) * head_dim_pack + 4 * hd) * seq_len_pack + 4 * sl; + + const int offset_out = (((b * seq_len + sl*4) * head_num + hn) * head_dim + 4 * hd); + + // Q + FLOAT4 temp_0 = vload4(0, input + offset_inp); + FLOAT4 temp_1 = vload4(0, input + offset_inp + seq_len_pack); + FLOAT4 temp_2 = vload4(0, input + offset_inp + 2 * seq_len_pack); + FLOAT4 temp_3 = vload4(0, input + offset_inp + 3 * seq_len_pack); + + vstore4((FLOAT4)(temp_0.s0, temp_1.s0, temp_2.s0, temp_3.s0), 0, output + offset_out); + if(4 * sl + 1 >= seq_len) return; + vstore4((FLOAT4)(temp_0.s1, temp_1.s1, temp_2.s1, temp_3.s1), 0, output + offset_out + head_num*head_dim); + if(4 * sl + 2 >= seq_len) return; + vstore4((FLOAT4)(temp_0.s2, temp_1.s2, temp_2.s2, temp_3.s2), 0, output + offset_out + 2*head_num*head_dim); + if(4 * sl + 3 >= seq_len) return; + vstore4((FLOAT4)(temp_0.s3, temp_1.s3, temp_2.s3, temp_3.s3), 0, output + offset_out + 3*head_num*head_dim); + +} + +#ifndef NUMHEAD_GROUP_SIZE +#define NUMHEAD_GROUP_SIZE 1 +#endif __kernel void matmul_qk_div_mask(GLOBAL_SIZE_3_DIMS - 
__global const FLOAT *input0, // query [1 query_seq_len/4 head_num head_dim 4] - __global const FLOAT *input1, // key [1 key_seq_len/4 head_num head_dim 4] - __global FLOAT *output, // prefill [1 head_num query_seq_len/4 key_seq_len 4] decode[1 head_num key_seq_len/4 4] - __global FLOAT *past_key, // [1 head_num max_length/4 head_dim 4] -#ifdef ADD_MASK + __global const FLOAT *input0, // query [1 query_seq_len head_num head_dim] + __global const FLOAT *input1, // key [1 key_seq_len head_num head_dim] + __global FLOAT *output, // prefill [1 head_num query_seq_len key_seq_len] decode[1 head_num key_seq_len/4 4] + __global FLOAT *past_key, // [1 max_length head_num head_dim] + #ifdef ADD_MASK __global const FLOAT* mask, -#else - __global const int* mask, // [1 1 query_seq_len key_seq_len 4] -#endif + #else + __global const int* mask, // [1 1 query_seq_len key_seq_len] + #endif __private const float scale, __private const int query_seq_len, __private const int key_seq_len, __private const int head_num, __private const int kv_head_num, __private const int head_dim) { - - const int x = get_global_id(0); // query_seq_len / 4 for prefill 1 for decode - const int y = get_global_id(1); // head_num - const int z = get_global_id(2); // key_seq_len / 4 + + const int x = get_global_id(0); // key_seq_len + const int y = get_global_id(1); // query_seq_len for prefill 1 for decode + const int z = get_global_id(2); // head_num DEAL_NON_UNIFORM_DIM3(x, y, z); - int yin = y / NUMHEAD_GROUP_SIZE; - const int offset = head_num * head_dim * 4; - const int offset_head = y * head_dim * 4; - __global const FLOAT *A_offset = input0 + x * offset + offset_head; - __global FLOAT *Pastkey_offset = past_key + (z * kv_head_num + yin) * head_dim * 4; - const int z4 = z << 2; - float4 Vscale = (float4)scale; + int x4 = x << 2; + int y4 = y << 2; + int zin = z / NUMHEAD_GROUP_SIZE; + __global const FLOAT *A_offset = input0 + (y4 * head_num + z) * head_dim; + __global FLOAT *Pastkey_offset = past_key + (x4 * kv_head_num + zin) * head_dim; + int strideA = head_num * head_dim; + int strideB = kv_head_num * head_dim; #ifdef OPENCL_PREFILL_ATTENTION - __global const FLOAT *B_offset = input1 + (z * kv_head_num + yin) * head_dim * 4; - const int x4 = x << 2; - const int query_seq_len4 = (query_seq_len + 3) / 4; - const int output_offset = y * query_seq_len4 * key_seq_len * 4; + __global const FLOAT *B_offset = input1 + (x4 * kv_head_num + zin) * head_dim; + int output_offset = (z * query_seq_len + y4) * key_seq_len + x4; float4 out0 = 0; float4 out1 = 0; float4 out2 = 0; float4 out3 = 0; + bool A1_enable = y4 + 1 < query_seq_len; + bool A2_enable = y4 + 2 < query_seq_len; + bool A3_enable = y4 + 3 < query_seq_len; + + bool B1_enable = x4 + 1 < key_seq_len; + bool B2_enable = x4 + 2 < key_seq_len; + bool B3_enable = x4 + 3 < key_seq_len; + const int head_dim4 = (head_dim + 3) / 4; -#ifdef HEADDIM_LEAVE + #ifdef HEADDIM_LEAVE for(int i = 0; i < head_dim4 - 1; ++i){ - float16 A = convert_float16(vload16(i, A_offset)); - float16 B = convert_float16(vload16(i, B_offset)); + float4 A0 = convert_float4(vload4(i, A_offset)); + float4 A1 = A1_enable ? convert_float4(vload4(i, A_offset + strideA)) : (float4)0; + float4 A2 = A2_enable ? convert_float4(vload4(i, A_offset + strideA + strideA)) : (float4)0; + float4 A3 = A3_enable ? convert_float4(vload4(i, A_offset + strideA + strideA + strideA)) : (float4)0; + float4 B0 = convert_float4(vload4(i, B_offset)); + float4 B1 = B1_enable ? 
convert_float4(vload4(i, B_offset + strideB)) : (float4)0;
+        float4 B2 = B2_enable ? convert_float4(vload4(i, B_offset + strideB + strideB)) : (float4)0;
+        float4 B3 = B3_enable ? convert_float4(vload4(i, B_offset + strideB + strideB + strideB)) : (float4)0;
-        out0 = mad(A.s0123, (float4)B.s0, out0);
-        out1 = mad(A.s0123, (float4)B.s1, out1);
-        out2 = mad(A.s0123, (float4)B.s2, out2);
-        out3 = mad(A.s0123, (float4)B.s3, out3);
+        out0.x += dot(A0, B0);
+        out0.y += dot(A0, B1);
+        out0.z += dot(A0, B2);
+        out0.w += dot(A0, B3);
-        out0 = mad(A.s4567, (float4)B.s4, out0);
-        out1 = mad(A.s4567, (float4)B.s5, out1);
-        out2 = mad(A.s4567, (float4)B.s6, out2);
-        out3 = mad(A.s4567, (float4)B.s7, out3);
+        out1.x += dot(A1, B0);
+        out1.y += dot(A1, B1);
+        out1.z += dot(A1, B2);
+        out1.w += dot(A1, B3);
-        out0 = mad(A.s89ab, (float4)B.s8, out0);
-        out1 = mad(A.s89ab, (float4)B.s9, out1);
-        out2 = mad(A.s89ab, (float4)B.sa, out2);
-        out3 = mad(A.s89ab, (float4)B.sb, out3);
+        out2.x += dot(A2, B0);
+        out2.y += dot(A2, B1);
+        out2.z += dot(A2, B2);
+        out2.w += dot(A2, B3);
-        out0 = mad(A.scdef, (float4)B.sc, out0);
-        out1 = mad(A.scdef, (float4)B.sd, out1);
-        out2 = mad(A.scdef, (float4)B.se, out2);
-        out3 = mad(A.scdef, (float4)B.sf, out3);
+        out3.x += dot(A3, B0);
+        out3.y += dot(A3, B1);
+        out3.z += dot(A3, B2);
+        out3.w += dot(A3, B3);
-        vstore16(CONVERT_FLOAT16(B), i, Pastkey_offset);
+        vstore4(CONVERT_FLOAT4(B0), i, Pastkey_offset);
+        vstore4(CONVERT_FLOAT4(B1), i, Pastkey_offset + strideB);
+        vstore4(CONVERT_FLOAT4(B2), i, Pastkey_offset + strideB + strideB);
+        vstore4(CONVERT_FLOAT4(B3), i, Pastkey_offset + strideB + strideB + strideB);
     }
     for(int i = (head_dim4 - 1) * 4; i < head_dim; ++i){
-        float4 A = convert_float4(vload4(i, A_offset));
-        float4 B = convert_float4(vload4(i, B_offset));
+        float A0 = A_offset[i];
+        float A1 = A1_enable ? A_offset[i + strideA] : 0;
+        float A2 = A2_enable ? A_offset[i + strideA + strideA] : 0;
+        float A3 = A3_enable ? A_offset[i + strideA + strideA + strideA] : 0;
+        float B0 = B_offset[i];
+        float B1 = B1_enable ? B_offset[i + strideB] : 0;
+        float B2 = B2_enable ? B_offset[i + strideB + strideB] : 0;
+        float B3 = B3_enable ? B_offset[i + strideB + strideB + strideB] : 0;
+
+        out0.x += A0 * B0;
+        out0.y += A0 * B1;
+        out0.z += A0 * B2;
+        out0.w += A0 * B3;
-        out0 = mad(A, (float4)B.s0, out0);
-        out1 = mad(A, (float4)B.s1, out1);
-        out2 = mad(A, (float4)B.s2, out2);
-        out3 = mad(A, (float4)B.s3, out3);
+        out1.x += A1 * B0;
+        out1.y += A1 * B1;
+        out1.z += A1 * B2;
+        out1.w += A1 * B3;
-        vstore4(CONVERT_FLOAT4(B), i, Pastkey_offset);
+        out2.x += A2 * B0;
+        out2.y += A2 * B1;
+        out2.z += A2 * B2;
+        out2.w += A2 * B3;
+
+        out3.x += A3 * B0;
+        out3.y += A3 * B1;
+        out3.z += A3 * B2;
+        out3.w += A3 * B3;
+
+        Pastkey_offset[i] = (FLOAT)B0;
+        Pastkey_offset[i + strideB] = (FLOAT)B1;
+        Pastkey_offset[i + strideB + strideB] = (FLOAT)B2;
+        Pastkey_offset[i + strideB + strideB + strideB] = (FLOAT)B3;
     }
-#else
+    #else
     for(int i = 0; i < head_dim4; ++i){
-        float16 A = convert_float16(vload16(i, A_offset));
-        float16 B = convert_float16(vload16(i, B_offset));
+        float4 A0 = convert_float4(vload4(i, A_offset));
+        float4 A1 = A1_enable ? convert_float4(vload4(i, A_offset + strideA)) : (float4)0;
+        float4 A2 = A2_enable ? convert_float4(vload4(i, A_offset + strideA + strideA)) : (float4)0;
+        float4 A3 = A3_enable ? convert_float4(vload4(i, A_offset + strideA + strideA + strideA)) : (float4)0;
+        float4 B0 = convert_float4(vload4(i, B_offset));
+        float4 B1 = B1_enable ?
convert_float4(vload4(i, B_offset + strideB)) : (float4)0; + float4 B2 = B2_enable ? convert_float4(vload4(i, B_offset + strideB + strideB)) : (float4)0; + float4 B3 = B3_enable ? convert_float4(vload4(i, B_offset + strideB + strideB + strideB)) : (float4)0; - out0 = mad(A.s0123, (float4)B.s0, out0); - out1 = mad(A.s0123, (float4)B.s1, out1); - out2 = mad(A.s0123, (float4)B.s2, out2); - out3 = mad(A.s0123, (float4)B.s3, out3); + out0.x += dot(A0, B0); + out0.y += dot(A0, B1); + out0.z += dot(A0, B2); + out0.w += dot(A0, B3); - out0 = mad(A.s4567, (float4)B.s4, out0); - out1 = mad(A.s4567, (float4)B.s5, out1); - out2 = mad(A.s4567, (float4)B.s6, out2); - out3 = mad(A.s4567, (float4)B.s7, out3); + out1.x += dot(A1, B0); + out1.y += dot(A1, B1); + out1.z += dot(A1, B2); + out1.w += dot(A1, B3); - out0 = mad(A.s89ab, (float4)B.s8, out0); - out1 = mad(A.s89ab, (float4)B.s9, out1); - out2 = mad(A.s89ab, (float4)B.sa, out2); - out3 = mad(A.s89ab, (float4)B.sb, out3); + out2.x += dot(A2, B0); + out2.y += dot(A2, B1); + out2.z += dot(A2, B2); + out2.w += dot(A2, B3); - out0 = mad(A.scdef, (float4)B.sc, out0); - out1 = mad(A.scdef, (float4)B.sd, out1); - out2 = mad(A.scdef, (float4)B.se, out2); - out3 = mad(A.scdef, (float4)B.sf, out3); - - vstore16(CONVERT_FLOAT16(B), i, Pastkey_offset); + out3.x += dot(A3, B0); + out3.y += dot(A3, B1); + out3.z += dot(A3, B2); + out3.w += dot(A3, B3); + + vstore4(CONVERT_FLOAT4(B0), i, Pastkey_offset); + vstore4(CONVERT_FLOAT4(B1), i, Pastkey_offset + strideB); + vstore4(CONVERT_FLOAT4(B2), i, Pastkey_offset + strideB + strideB); + vstore4(CONVERT_FLOAT4(B3), i, Pastkey_offset + strideB + strideB + strideB); } -#endif - - out0 *= Vscale; - out1 *= Vscale; - out2 *= Vscale; - out3 *= Vscale; - - float4 mask0, mask1, mask2, mask3; - mask = mask + (x4 * key_seq_len + z4) * 4; - mask0.s0 = mask[0]; mask1.s0 = mask[4]; mask2.s0 = mask[8]; mask3.s0 = mask[12]; mask += key_seq_len * 4; - mask0.s1 = mask[0]; mask1.s1 = mask[4]; mask2.s1 = mask[8]; mask3.s1 = mask[12]; mask += key_seq_len * 4; - mask0.s2 = mask[0]; mask1.s2 = mask[4]; mask2.s2 = mask[8]; mask3.s2 = mask[12]; mask += key_seq_len * 4; - mask0.s3 = mask[0]; mask1.s3 = mask[4]; mask2.s3 = mask[8]; mask3.s3 = mask[12]; -#ifdef ADD_MASK + #endif + out0 *= (float4)scale; + out1 *= (float4)scale; + out2 *= (float4)scale; + out3 *= (float4)scale; + float4 mask0 = convert_float4(vload4(0, mask + y4 * key_seq_len + x4)); + float4 mask1 = convert_float4(vload4(0, mask + (y4 + 1) * key_seq_len + x4)); + float4 mask2 = convert_float4(vload4(0, mask + (y4 + 2) * key_seq_len + x4)); + float4 mask3 = convert_float4(vload4(0, mask + (y4 + 3) * key_seq_len + x4)); + #ifdef ADD_MASK out0 += mask0; out1 += mask1; out2 += mask2; out3 += mask3; -#else + #else out0 = (mask0 == (float4)0) ? (float4)(-FLT_MAX) : out0; out1 = (mask1 == (float4)0) ? (float4)(-FLT_MAX) : out1; out2 = (mask2 == (float4)0) ? (float4)(-FLT_MAX) : out2; out3 = (mask3 == (float4)0) ? 
(float4)(-FLT_MAX) : out3; -#endif - - vstore4(CONVERT_FLOAT4(out0), 0, output + output_offset + x * key_seq_len * 4 + z4 * 4); - if(z4 + 1 >= key_seq_len) return; - vstore4(CONVERT_FLOAT4(out1), 0, output + output_offset + x * key_seq_len * 4 + (z4 + 1) * 4); - if(z4 + 2 >= key_seq_len) return; - vstore4(CONVERT_FLOAT4(out2), 0, output + output_offset + x * key_seq_len * 4 + (z4 + 2) * 4); - if(z4 + 3 >= key_seq_len) return; - vstore4(CONVERT_FLOAT4(out3), 0, output + output_offset + x * key_seq_len * 4 + (z4 + 3) * 4); + #endif + if(B3_enable){ + vstore4(CONVERT_FLOAT4(out0), 0, output + output_offset); + if(!A1_enable) return; + output_offset += key_seq_len; + vstore4(CONVERT_FLOAT4(out1), 0, output + output_offset); + if(!A2_enable) return; + output_offset += key_seq_len; + vstore4(CONVERT_FLOAT4(out2), 0, output + output_offset); + if(!A3_enable) return; + output_offset += key_seq_len; + vstore4(CONVERT_FLOAT4(out3), 0, output + output_offset); + } else if(B2_enable){ + vstore3(CONVERT_FLOAT3((float3)(out0.x, out0.y, out0.z)), 0, output + output_offset); + if(!A1_enable) return; + output_offset += key_seq_len; + vstore3(CONVERT_FLOAT3((float3)(out1.x, out1.y, out1.z)), 0, output + output_offset); + if(!A2_enable) return; + output_offset += key_seq_len; + vstore3(CONVERT_FLOAT3((float3)(out2.x, out2.y, out2.z)), 0, output + output_offset); + if(!A3_enable) return; + output_offset += key_seq_len; + vstore3(CONVERT_FLOAT3((float3)(out3.x, out3.y, out3.z)), 0, output + output_offset); + } else if(B1_enable){ + vstore2(CONVERT_FLOAT2((float2)(out0.x, out0.y)), 0, output + output_offset); + if(!A1_enable) return; + output_offset += key_seq_len; + vstore2(CONVERT_FLOAT2((float2)(out1.x, out1.y)), 0, output + output_offset); + if(!A2_enable) return; + output_offset += key_seq_len; + vstore2(CONVERT_FLOAT2((float2)(out2.x, out2.y)), 0, output + output_offset); + if(!A3_enable) return; + output_offset += key_seq_len; + vstore2(CONVERT_FLOAT2((float2)(out3.x, out3.y)), 0, output + output_offset); + } else { + output[output_offset] = out0.x; + if(!A1_enable) return; + output[output_offset + key_seq_len] = out1.x; + if(!A2_enable) return; + output[output_offset + key_seq_len + key_seq_len] = out2.x; + if(!A3_enable) return; + output[output_offset + key_seq_len + key_seq_len + key_seq_len] = out3.x; + } #else - __global const FLOAT *B_offset = input1 + yin * head_dim * 4; - const int key_seq_len4 = (key_seq_len + 3) / 4; float4 out = 0; const int head_dim4 = (head_dim + 3) / 4; - -#ifdef HEADDIM_LEAVE + int key_seq_len4 = (key_seq_len + 3) / 4; + #ifdef HEADDIM_LEAVE for(int i = 0; i < head_dim4 - 1; ++i){ - float16 A = convert_float16(vload16(i, A_offset)); - float16 B = convert_float16(vload16(i, Pastkey_offset)); - - out = mad((float4)A.s0, B.s0123, out); - out = mad((float4)A.s4, B.s4567, out); - out = mad((float4)A.s8, B.s89ab, out); - out = mad((float4)A.sc, B.scdef, out); + float4 A = convert_float4(vload4(i, A_offset)); + float4 B0 = convert_float4(vload4(i, Pastkey_offset)); + float4 B1 = convert_float4(vload4(i, Pastkey_offset + strideB)); + float4 B2 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB)); + float4 B3 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB + strideB)); + + out.x += dot(A, B0); + out.y += dot(A, B1); + out.z += dot(A, B2); + out.w += dot(A, B3); } for(int i = (head_dim4 - 1) * 4; i < head_dim; ++i){ - float4 A = convert_float4(vload4(i, A_offset)); - float4 B = convert_float4(vload4(i, Pastkey_offset)); - - out = mad((float4)A.s0, B, out); + 
float A = A_offset[i];
+        float B0 = Pastkey_offset[i];
+        float B1 = Pastkey_offset[i + strideB];
+        float B2 = Pastkey_offset[i + strideB + strideB];
+        float B3 = Pastkey_offset[i + strideB + strideB + strideB];
+        out.x += A * B0;
+        out.y += A * B1;
+        out.z += A * B2;
+        out.w += A * B3;
     }
-#else
+    #else
     for(int i = 0; i < head_dim4; ++i){
-        float16 A = convert_float16(vload16(i, A_offset));
-        float16 B = convert_float16(vload16(i, Pastkey_offset));
+        float4 A = convert_float4(vload4(i, A_offset));
+        float4 B0 = convert_float4(vload4(i, Pastkey_offset));
+        float4 B1 = convert_float4(vload4(i, Pastkey_offset + strideB));
+        float4 B2 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB));
+        float4 B3 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB + strideB));
-        out = mad((float4)A.s0, B.s0123, out);
-        out = mad((float4)A.s4, B.s4567, out);
-        out = mad((float4)A.s8, B.s89ab, out);
-        out = mad((float4)A.sc, B.scdef, out);
+        out.x += dot(A, B0);
+        out.y += dot(A, B1);
+        out.z += dot(A, B2);
+        out.w += dot(A, B3);
     }
-#endif
-    if(z == key_seq_len4 - 1){
-        int remain = key_seq_len - z * 4 - 1;
-        Pastkey_offset += remain;
+    #endif
+    int remain = key_seq_len - x4;
+    if(x == key_seq_len4 - 1){
+        __global const FLOAT *B_offset = input1 + zin * head_dim;
+        Pastkey_offset += (remain - 1) * strideB;
         float tmp = 0;
-        for(int i = 0; i < head_dim; ++i){
-            float A = A_offset[i*4];
-            float B = B_offset[i*4];
-            Pastkey_offset[i * 4] = B;
+        #ifdef HEADDIM_LEAVE
+        for(int i = 0; i < head_dim4 - 1; ++i){
+            float4 A = convert_float4(vload4(i, A_offset));
+            float4 B = convert_float4(vload4(i, B_offset));
+
+            tmp += dot(A, B);
+            vstore4(CONVERT_FLOAT4(B), i, Pastkey_offset);
+        }
+        for(int i = (head_dim4 - 1) * 4; i < head_dim; ++i){
+            float A = A_offset[i];
+            float B = B_offset[i];
             tmp += A * B;
+            Pastkey_offset[i] = B;
         }
+        #else
+        for(int i = 0; i < head_dim4; ++i){
+            float4 A = convert_float4(vload4(i, A_offset));
+            float4 B = convert_float4(vload4(i, B_offset));
+
+            tmp += dot(A, B);
+            vstore4(CONVERT_FLOAT4(B), i, Pastkey_offset);
+        }
+        #endif
         float *out_ptr = (float*)&out;
-        out_ptr[remain] = tmp;
+        out_ptr[remain - 1] = tmp;
+    }
+    out *= (float4)scale;
+    if(remain >= 4){
+        vstore4(CONVERT_FLOAT4(out), 0, output + z * key_seq_len + x4);
+    } else if (remain >= 3){
+        vstore3(CONVERT_FLOAT3((float3)(out.x, out.y, out.z)), 0, output + z * key_seq_len + x4);
+    } else if (remain >= 2){
+        vstore2(CONVERT_FLOAT2((float2)(out.x, out.y)), 0, output + z * key_seq_len + x4);
+    } else {
+        output[z * key_seq_len + x4] = out.x;
     }
-    out *= Vscale;
-    vstore4(CONVERT_FLOAT4(out), 0, output + y * key_seq_len4 * 4 + z4);
 #endif
 }

 __kernel void matmul_qkv(GLOBAL_SIZE_3_DIMS
-                              __global const FLOAT *input0, // qk prefill [1 head_num qk_seq_len/4 value_seq_len 4] decode[1 head_num value_seq_len/4 4]
-                              __global const FLOAT *input1, // [1 value_seq_len/4 head_num head_dim 4]
-                              __global FLOAT *output, // [1 qk_seq_len head_num*head_dim 1 4]
-                              __global FLOAT *past_value, // [1 value_seq_len/4 head_num head_dim 4]
+                              __global const FLOAT *input0, // qk prefill [1 head_num qk_seq_len value_seq_len] decode[1 head_num value_seq_len]
+                              __global const FLOAT *input1, // [1 value_seq_len head_num head_dim]
+                              __global FLOAT *output, // [1 qk_seq_len head_num head_dim]
+                              __global FLOAT *past_value, // [1 value_seq_len head_num head_dim]
                               __private const int qk_seq_len,
                               __private const int value_seq_len,
                               __private const int head_num,
                               __private const int kv_head_num,
                               __private const int head_dim) {
-
-    const int x = get_global_id(0); //
prefill qk_seq_len / 4 decode 1 + + const int x = get_global_id(0); // head_dim << 2 const int y = get_global_id(1); // head_num - const int z = get_global_id(2); // head_dim << 2 - const int z4 = z << 2; + const int z = get_global_id(2); // prefill qk_seq_len decode 1 + + const int x4 = x << 2; DEAL_NON_UNIFORM_DIM3(x, y, z); const int yin = y / NUMHEAD_GROUP_SIZE; #ifdef OPENCL_PREFILL_ATTENTION - const int offset = head_num * head_dim * 4; - const int stride = kv_head_num * head_dim * 4; - const int offset_head = y * head_dim * 4 + z4 * 4; - const int value_seq_len4 = (value_seq_len + 3) / 4; - const int qk_seq_len4 = (qk_seq_len + 3) / 4; - __global const FLOAT *A_offset = input0 + (y * qk_seq_len4 + x) * value_seq_len * 4; - __global const FLOAT *B_offset = input1 + yin * head_dim * 4 + z4 * 4; - __global FLOAT *Pastvalue_offset = past_value + yin * head_dim * 4 + z4 * 4; + int z4 = z << 2; + int value_seq_len4 = (value_seq_len + 3) / 4; + int loop_end = max(value_seq_len4 - 1, 0); + const int stride = kv_head_num * head_dim; + __global const FLOAT *A_offset = input0 + (y * qk_seq_len + z4) * value_seq_len; + __global const FLOAT *B_offset = input1 + yin * head_dim + x4; + __global FLOAT *Pastvalue_offset = past_value + yin * head_dim + x4; COMPUTE_FLOAT4 out0 = 0; COMPUTE_FLOAT4 out1 = 0; COMPUTE_FLOAT4 out2 = 0; COMPUTE_FLOAT4 out3 = 0; - for(int i = 0; i < value_seq_len4 - 1; ++i){ + for(int i = 0; i < loop_end; ++i){ int index = i << 2; - COMPUTE_FLOAT4 A0 = CONVERT_COMPUTE_FLOAT4(vload4(index, A_offset)); - COMPUTE_FLOAT4 A1 = CONVERT_COMPUTE_FLOAT4(vload4(index + 1, A_offset)); - COMPUTE_FLOAT4 A2 = CONVERT_COMPUTE_FLOAT4(vload4(index + 2, A_offset)); - COMPUTE_FLOAT4 A3 = CONVERT_COMPUTE_FLOAT4(vload4(index + 3, A_offset)); - COMPUTE_FLOAT16 B = CONVERT_COMPUTE_FLOAT16(vload16(0, B_offset + i * stride)); + COMPUTE_FLOAT4 A0 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset)); + COMPUTE_FLOAT4 A1 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset + value_seq_len)); + COMPUTE_FLOAT4 A2 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset + value_seq_len + value_seq_len)); + COMPUTE_FLOAT4 A3 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset + value_seq_len + value_seq_len + value_seq_len)); + COMPUTE_FLOAT4 B0 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 0) * stride)); + COMPUTE_FLOAT4 B1 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 1) * stride)); + COMPUTE_FLOAT4 B2 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 2) * stride)); + COMPUTE_FLOAT4 B3 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 3) * stride)); - out0 = mad(A0, (COMPUTE_FLOAT4)B.s0, out0); - out0 = mad(A1, (COMPUTE_FLOAT4)B.s1, out0); - out0 = mad(A2, (COMPUTE_FLOAT4)B.s2, out0); - out0 = mad(A3, (COMPUTE_FLOAT4)B.s3, out0); + out0 = mad(B0, (COMPUTE_FLOAT4)A0.x, out0); + out0 = mad(B1, (COMPUTE_FLOAT4)A0.y, out0); + out0 = mad(B2, (COMPUTE_FLOAT4)A0.z, out0); + out0 = mad(B3, (COMPUTE_FLOAT4)A0.w, out0); - out1 = mad(A0, (COMPUTE_FLOAT4)B.s4, out1); - out1 = mad(A1, (COMPUTE_FLOAT4)B.s5, out1); - out1 = mad(A2, (COMPUTE_FLOAT4)B.s6, out1); - out1 = mad(A3, (COMPUTE_FLOAT4)B.s7, out1); + out1 = mad(B0, (COMPUTE_FLOAT4)A1.x, out1); + out1 = mad(B1, (COMPUTE_FLOAT4)A1.y, out1); + out1 = mad(B2, (COMPUTE_FLOAT4)A1.z, out1); + out1 = mad(B3, (COMPUTE_FLOAT4)A1.w, out1); - out2 = mad(A0, (COMPUTE_FLOAT4)B.s8, out2); - out2 = mad(A1, (COMPUTE_FLOAT4)B.s9, out2); - out2 = mad(A2, (COMPUTE_FLOAT4)B.sa, out2); - out2 = mad(A3, (COMPUTE_FLOAT4)B.sb, out2); + out2 = mad(B0, (COMPUTE_FLOAT4)A2.x, out2); + out2 = mad(B1, 
(COMPUTE_FLOAT4)A2.y, out2); + out2 = mad(B2, (COMPUTE_FLOAT4)A2.z, out2); + out2 = mad(B3, (COMPUTE_FLOAT4)A2.w, out2); - out3 = mad(A0, (COMPUTE_FLOAT4)B.sc, out3); - out3 = mad(A1, (COMPUTE_FLOAT4)B.sd, out3); - out3 = mad(A2, (COMPUTE_FLOAT4)B.se, out3); - out3 = mad(A3, (COMPUTE_FLOAT4)B.sf, out3); - - vstore16(CONVERT_FLOAT16(B), 0, Pastvalue_offset + i * stride); + out3 = mad(B0, (COMPUTE_FLOAT4)A3.x, out3); + out3 = mad(B1, (COMPUTE_FLOAT4)A3.y, out3); + out3 = mad(B2, (COMPUTE_FLOAT4)A3.z, out3); + out3 = mad(B3, (COMPUTE_FLOAT4)A3.w, out3); + vstore4(CONVERT_FLOAT4(B0), 0, Pastvalue_offset + (index + 0) * stride); + vstore4(CONVERT_FLOAT4(B1), 0, Pastvalue_offset + (index + 1) * stride); + vstore4(CONVERT_FLOAT4(B2), 0, Pastvalue_offset + (index + 2) * stride); + vstore4(CONVERT_FLOAT4(B3), 0, Pastvalue_offset + (index + 3) * stride); } - -#ifdef HEADDIM_LEAVE - COMPUTE_FLOAT16 B = CONVERT_COMPUTE_FLOAT16(vload16(0, B_offset + (value_seq_len4 - 1) * stride)); - COMPUTE_FLOAT *B_ptr = (COMPUTE_FLOAT*)&B; - for(int i = (value_seq_len4 - 1) * 4, j = 0; i < value_seq_len; ++i, ++j){ - COMPUTE_FLOAT4 A0 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset)); - out0 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j], out0); - out1 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j + 4], out1); - out2 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j + 8], out2); - out3 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j + 12], out3); + for(int i = loop_end << 2; i < value_seq_len; ++i){ + COMPUTE_FLOAT A0 = A_offset[i]; + COMPUTE_FLOAT A1 = A_offset[i + value_seq_len]; + COMPUTE_FLOAT A2 = A_offset[i + value_seq_len + value_seq_len]; + COMPUTE_FLOAT A3 = A_offset[i + value_seq_len + value_seq_len + value_seq_len]; + COMPUTE_FLOAT4 B = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + i * stride)); + + out0 = mad(B, (COMPUTE_FLOAT4)A0, out0); + out1 = mad(B, (COMPUTE_FLOAT4)A1, out1); + out2 = mad(B, (COMPUTE_FLOAT4)A2, out2); + out3 = mad(B, (COMPUTE_FLOAT4)A3, out3); + vstore4(CONVERT_FLOAT4(B), 0, Pastvalue_offset + i * stride); } - vstore4(CONVERT_FLOAT4(out0), 0, output + x * offset + (y * head_dim + z4) * 4); - vstore4(CONVERT_FLOAT4(B.s0123), 0, Pastvalue_offset + (value_seq_len4 - 1) * stride); - if(z4 + 1 >= head_dim) return; - vstore4(CONVERT_FLOAT4(out1), 0, output + x * offset + (y * head_dim + z4 + 1) * 4); - vstore4(CONVERT_FLOAT4(B.s4567), 1, Pastvalue_offset + (value_seq_len4 - 1) * stride); - if(z4 + 2 >= head_dim) return; - vstore4(CONVERT_FLOAT4(out2), 0, output + x * offset + (y * head_dim + z4 + 2) * 4); - vstore4(CONVERT_FLOAT4(B.s89ab), 2, Pastvalue_offset + (value_seq_len4 - 1) * stride); - if(z4 + 3 >= head_dim) return; - vstore4(CONVERT_FLOAT4(out3), 0, output + x * offset + (y * head_dim + z4 + 3) * 4); - vstore4(CONVERT_FLOAT4(B.scdef), 3, Pastvalue_offset + (value_seq_len4 - 1) * stride); -#else - COMPUTE_FLOAT16 B = CONVERT_COMPUTE_FLOAT16(vload16(0, B_offset + (value_seq_len4 - 1) * stride)); - vstore16(CONVERT_FLOAT16(B), 0, Pastvalue_offset + (value_seq_len4 - 1) * stride); - COMPUTE_FLOAT *B_ptr = (COMPUTE_FLOAT*)&B; - for(int i = (value_seq_len4 - 1) * 4, j = 0; i < value_seq_len; ++i, ++j){ - COMPUTE_FLOAT4 A0 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset)); - out0 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j], out0); - out1 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j + 4], out1); - out2 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j + 8], out2); - out3 = mad(A0, (COMPUTE_FLOAT4)B_ptr[j + 12], out3); + + #ifdef HEADDIM_LEAVE + int remain = head_dim - x4; + int output_offset = (z4 * head_num + y) * head_dim + x4; + if(remain >= 4){ + vstore4(CONVERT_FLOAT4(out0), 0, 
output + output_offset);
+    } else if(remain == 3){
+        vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out0.x, out0.y, out0.z)), 0, output + output_offset);
+    } else if(remain == 2){
+        vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT2)(out0.x, out0.y)), 0, output + output_offset);
+    } else{
+        output[output_offset] = out0.x;
     }
-    vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output + x * offset + (y * head_dim + z4) * 4);
-#endif
+    if(z4 + 1 >= qk_seq_len) return;
+    output_offset += head_num * head_dim;
+    if(remain >= 4){
+        vstore4(CONVERT_FLOAT4(out1), 0, output + output_offset);
+    } else if(remain == 3){
+        vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out1.x, out1.y, out1.z)), 0, output + output_offset);
+    } else if(remain == 2){
+        vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT2)(out1.x, out1.y)), 0, output + output_offset);
+    } else{
+        output[output_offset] = out1.x;
+    }
+    if(z4 + 2 >= qk_seq_len) return;
+    output_offset += head_num * head_dim;
+    if(remain >= 4){
+        vstore4(CONVERT_FLOAT4(out2), 0, output + output_offset);
+    } else if(remain == 3){
+        vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out2.x, out2.y, out2.z)), 0, output + output_offset);
+    } else if(remain == 2){
+        vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT2)(out2.x, out2.y)), 0, output + output_offset);
+    } else{
+        output[output_offset] = out2.x;
+    }
+    if(z4 + 3 >= qk_seq_len) return;
+    output_offset += head_num * head_dim;
+    if(remain >= 4){
+        vstore4(CONVERT_FLOAT4(out3), 0, output + output_offset);
+    } else if(remain == 3){
+        vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out3.x, out3.y, out3.z)), 0, output + output_offset);
+    } else if(remain == 2){
+        vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT2)(out3.x, out3.y)), 0, output + output_offset);
+    } else{
+        output[output_offset] = out3.x;
+    }
+    #else
+    int output_offset = (z4 * head_num + y) * head_dim + x4;
+    vstore4(CONVERT_FLOAT4(out0), 0, output + output_offset);
+    if(z4 + 1 >= qk_seq_len) return;
+    output_offset += head_num * head_dim;
+    vstore4(CONVERT_FLOAT4(out1), 0, output + output_offset);
+    if(z4 + 2 >= qk_seq_len) return;
+    output_offset += head_num * head_dim;
+    vstore4(CONVERT_FLOAT4(out2), 0, output + output_offset);
+    if(z4 + 3 >= qk_seq_len) return;
+    output_offset += head_num * head_dim;
+    vstore4(CONVERT_FLOAT4(out3), 0, output + output_offset);
+    #endif
 #else
-    const int value_seq_len4 = (value_seq_len + 3) / 4;
-    const int stride = kv_head_num * head_dim * 4;
-    const int offset = head_num * head_dim * 4;
-    const int offset_head = y * head_dim * 4 + z4 * 4;
-    const int loop = (value_seq_len + 2) / 4;
-    __global const FLOAT *A_offset = input0 + y * value_seq_len4 * 4;
-    __global const FLOAT *B_offset = input1 + yin * head_dim * 4 + z4 * 4;
-    __global FLOAT *Pastvalue_offset = past_value + yin * head_dim * 4 + z4 * 4;
+    int value_seq_len4 = (value_seq_len - 1 + 3) / 4;
+    int loop_end = max(value_seq_len4 - 1, 0);
+    const int stride = kv_head_num * head_dim;
+    __global const FLOAT *A_offset = input0 + y * value_seq_len;
+    __global const FLOAT *B_offset = input1 + yin * head_dim + x4;
+    __global FLOAT *Pastvalue_offset = past_value + yin * head_dim + x4;
     COMPUTE_FLOAT4 out = 0;
-    for(int i = 0; i < loop - 1; i++){
+    for(int i = 0; i < loop_end; i++){
+        int index = i << 2;
         COMPUTE_FLOAT4 A = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset));
-        COMPUTE_FLOAT16 B = CONVERT_COMPUTE_FLOAT16(vload16(0, Pastvalue_offset + i * stride));
+        COMPUTE_FLOAT4 B0 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 0) * stride));
+        COMPUTE_FLOAT4 B1 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 1) * stride));
+        COMPUTE_FLOAT4 B2 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 2) * stride));
+        COMPUTE_FLOAT4 B3 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 3) * stride));
-        out.s0 += dot(A, B.s0123);
-        out.s1 += dot(A, B.s4567);
-        out.s2 += dot(A, B.s89ab);
-        out.s3 += dot(A, B.scdef);
+        out = mad(B0, (COMPUTE_FLOAT4)A.x, out);
+        out = mad(B1, (COMPUTE_FLOAT4)A.y, out);
+        out = mad(B2, (COMPUTE_FLOAT4)A.z, out);
+        out = mad(B3, (COMPUTE_FLOAT4)A.w, out);
     }
-    int start = (loop - 1) < 0 ? 0 : (loop - 1);
-    COMPUTE_FLOAT16 B_Vec = CONVERT_COMPUTE_FLOAT16(vload16(0, Pastvalue_offset + start * stride));
-    COMPUTE_FLOAT *B_ptr = (COMPUTE_FLOAT *)&B_Vec;
-    for(int i = start * 4; i < value_seq_len - 1; ++i){
+    for(int i = loop_end << 2; i < value_seq_len - 1; i++){
         COMPUTE_FLOAT A = A_offset[i];
+        COMPUTE_FLOAT4 B = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + i * stride));
-        int index = i % 4;
-        out.s0 += A * B_ptr[index];
-        out.s1 += A * B_ptr[index+4];
-        out.s2 += A * B_ptr[index+8];
-        out.s3 += A * B_ptr[index+12];
+        out = mad(B, (COMPUTE_FLOAT4)A, out);
     }
     COMPUTE_FLOAT A = A_offset[value_seq_len - 1];
-    COMPUTE_FLOAT B0 = B_offset[0];
-    COMPUTE_FLOAT B1 = B_offset[4];
-    COMPUTE_FLOAT B2 = B_offset[8];
-    COMPUTE_FLOAT B3 = B_offset[12];
-    out.s0 += A * B0;
-    out.s1 += A * B1;
-    out.s2 += A * B2;
-    out.s3 += A * B3;
-    int index = ((value_seq_len - 1) >> 2) * stride + ((value_seq_len - 1) % 4);
-
-#ifdef HEADDIM_LEAVE
-    Pastvalue_offset[index] = B0;
-    output[(y * head_dim + z4) * 4] = out.s0;
-    if(z4 + 1 >= head_dim) return;
-    Pastvalue_offset[index + 4] = B1;
-    output[(y * head_dim + z4 + 1) * 4] = out.s1;
-    if(z4 + 2 >= head_dim) return;
-    Pastvalue_offset[index + 8] = B2;
-    output[(y * head_dim + z4 + 2) * 4] = out.s2;
-    if(z4 + 3 >= head_dim) return;
-    Pastvalue_offset[index + 12] = B3;
-    output[(y * head_dim + z4 + 3) * 4] = out.s3;
-#else
-    Pastvalue_offset[index] = B0;
-    Pastvalue_offset[index + 4] = B1;
-    Pastvalue_offset[index + 8] = B2;
-    Pastvalue_offset[index + 12] = B3;
+    COMPUTE_FLOAT4 B = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset));
+    out = mad(B, (COMPUTE_FLOAT4)A, out);
-    output[(y * head_dim + z4) * 4] = out.s0;
-    output[(y * head_dim + z4 + 1) * 4] = out.s1;
-    output[(y * head_dim + z4 + 2) * 4] = out.s2;
-    output[(y * head_dim + z4 + 3) * 4] = out.s3;
-#endif
+    #ifdef HEADDIM_LEAVE
+    int remain = head_dim - x4;
+    if(remain >= 4){
+        vstore4(CONVERT_FLOAT4(out), 0, output + y * head_dim + x4);
+        vstore4(CONVERT_FLOAT4(B), 0, Pastvalue_offset + (value_seq_len - 1) * stride);
+    } else if(remain == 3){
+        vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out.x, out.y, out.z)), 0, output + y * head_dim + x4);
+        vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(B.x, B.y, B.z)), 0, Pastvalue_offset + (value_seq_len - 1) * stride);
+    } else if(remain == 2){
+        vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT2)(out.x, out.y)), 0, output + y * head_dim + x4);
+        vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT2)(B.x, B.y)), 0, Pastvalue_offset + (value_seq_len - 1) * stride);
+    } else{
+        output[y * head_dim + x4] = out.x;
+        Pastvalue_offset[(value_seq_len - 1) * stride] = B.x;
+    }
+    #else
+    vstore4(CONVERT_FLOAT4(B), 0, Pastvalue_offset + (value_seq_len - 1) * stride);
+    vstore4(CONVERT_FLOAT4(out), 0, output + y * head_dim + x4);
+    #endif
 #endif
 }

diff --git a/source/backend/opencl/execution/cl/binary_buf.cl b/source/backend/opencl/execution/cl/binary_buf.cl
index 3528882ab..4ad1bad8d 100644
---
a/source/backend/opencl/execution/cl/binary_buf.cl +++ b/source/backend/opencl/execution/cl/binary_buf.cl @@ -5,84 +5,79 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1, __global INPUT_TYPE* input0, __global INPUT_TYPE* input1, __global OUTPUT_TYPE* output, - __private const int4 shape,//[N,H,W,C4] - __private const int2 isFull, + __private const int size, __private const int activationType) { - int2 pos = (int2)(get_global_id(0), get_global_id(1));//NC4, HW + int2 pos = (int2)(get_global_id(0), get_global_id(1));//NCHW, 1 if (pos.x < global_dim0 && pos.y < global_dim1) { - #ifdef WH_PACK4 - int offset = pos.x * (shape.y*shape.z/4) + pos.y; - #ifdef A_SINGLE - float data0 = input0[0]; - float16 in0_16 = (float16)data0; - #else - float16 in0_16 = convert_float16(vload16(offset, input0)); - #endif - - #ifdef B_SINGLE - float data1 = input1[0]; - float16 in1_16 = (float16)data1; - #else - float16 in1_16 = convert_float16(vload16(offset, input1)); - #endif - - float16 out; - float4 in0 = in0_16.s0123; - float4 in1 = in1_16.s0123; - out.s0123 = OPERATOR; - - in0 = in0_16.s4567; - in1 = in1_16.s4567; - out.s4567 = OPERATOR; - - in0 = in0_16.s89ab; - in1 = in1_16.s89ab; - out.s89ab = OPERATOR; - - in0 = in0_16.scdef; - in1 = in1_16.scdef; - out.scdef = OPERATOR; - - if(activationType == 1) { - out = fmax(out, (float16)0); - } - vstore16(CONVERT_OUTPUT16(out), offset, output); - #else - int offset = pos.x * (shape.y*shape.z) + pos.y; - #ifdef A_SINGLE - float data0 = input0[0]; - float4 in0 = (float4)(data0, data0, data0, data0); - #else - float4 in0 = convert_float4(vload4(offset, input0)); - #endif + int offset = pos.x << 2; +#ifdef PACK_LEAVE + if(offset + 3 >= size){ + int remain = size - offset; + float4 in0, in1; + float* in0_ptr = (float*)&in0; + float* in1_ptr = (float*)&in1; + + for(int i = 0; i < remain; ++i){ + #ifdef A_SINGLE + in0_ptr[i] = (float)input0[0]; + #else + in0_ptr[i] = (float)input0[offset + i]; + #endif - #ifdef B_SINGLE - float data1 = input1[0]; - float4 in1 = (float4)(data1, data1, data1, data1); - #else - float4 in1 = convert_float4(vload4(offset, input1)); - #endif + #ifdef B_SINGLE + in1_ptr[i] = (float)input1[0]; + #else + in1_ptr[i] = (float)input1[offset + i]; + #endif + } + float4 out = OPERATOR; + if(activationType == 1) { + out = fmax(out, (float4)0); + } + float* out_ptr = (float*)&out; + for(int i = 0; i < remain; ++i){ + output[offset + i] = (OUTPUT_TYPE)out_ptr[i]; + } + }else { +#endif + #ifdef A_SINGLE + float data0 = input0[0]; + float4 in0 = (float4)(data0, data0, data0, data0); + #else + float4 in0 = convert_float4(vload4(0, input0 + offset)); + #endif - float4 out = OPERATOR; + #ifdef B_SINGLE + float data1 = input1[0]; + float4 in1 = (float4)(data1, data1, data1, data1); + #else + float4 in1 = convert_float4(vload4(0, input1 + offset)); + #endif + + float4 out = OPERATOR; - if(activationType == 1) { - out = fmax(out, (float4)0); + if(activationType == 1) { + out = fmax(out, (float4)0); + } + vstore4(CONVERT_OUTPUT4(out), 0, output + offset); +#ifdef PACK_LEAVE } - vstore4(CONVERT_OUTPUT4(out), offset, output); - #endif +#endif } } __kernel void prelu_buf(__private int global_dim0, __private int global_dim1, __global INPUT_TYPE* input0, __global INPUT_TYPE* input1, __global OUTPUT_TYPE* output, - __private const int4 shape//[N,H,W,C4] + __private const int4 shape ) { int2 pos = (int2)(get_global_id(0), get_global_id(1));//NC4, HW - + if (pos.x < global_dim0 && pos.y < global_dim1) { - int offset = pos.x * 
(shape.y*shape.z) + pos.y; + int b = pos.x / shape.w; + int c = pos.x % shape.w; + int offset = (b + c * shape.x) * (shape.y*shape.z) + pos.y; float4 in0 = convert_float4(vload4(offset, input0)); float4 in1 = convert_float4(vload4(pos.x % shape.w, input1)); float4 out = OPERATOR; diff --git a/source/backend/opencl/execution/cl/binary_subgroup_buf.cl b/source/backend/opencl/execution/cl/binary_subgroup_buf.cl index e3362a21b..54c162120 100644 --- a/source/backend/opencl/execution/cl/binary_subgroup_buf.cl +++ b/source/backend/opencl/execution/cl/binary_subgroup_buf.cl @@ -19,7 +19,7 @@ __kernel void binary_buf_c4_c4_c4(__private int global_dim0, __private int globa const int batch_idx = get_global_id(2); const int channel_idx = get_global_id(1); - const int offset = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; float4 in0 = convert_float4(vload4(0, input0 + offset*isFull.x)); float4 in1 = convert_float4(vload4(0, input1 + offset*isFull.y)); @@ -57,7 +57,7 @@ __kernel void binary_buf_c4_c4_c16(__private int global_dim0, __private int glob const int dst_width = shape.z + output_pad_left + output_pad_right; const int channe_out_idx = channel_idx >> 2; - const int offset = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; const int dst_offset = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left) * 16 + (channel_idx % 4) * 4; float4 in0 = convert_float4(vload4(0, input0 + offset*isFull.x)); @@ -105,7 +105,7 @@ __kernel void binary_buf_c4_c16_c4(__private int global_dim0, __private int glob const int src_width = shape.z + input1_pad_left + input1_pad_right; const int channe_out_idx = channel_idx >> 2; - const int offset0 = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset0 = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; const int offset1 = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input1_pad_left) * 16 + (channel_idx % 4) * 4; float4 in0 = convert_float4(vload4(0, input0 + offset0*isFull.x)); @@ -142,7 +142,7 @@ __kernel void binary_buf_c16_c4_c4(__private int global_dim0, __private int glob const int src_width = shape.z + input0_pad_left + input0_pad_right; const int channe_out_idx = channel_idx >> 2; - const int offset1 = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset1 = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; const int offset0 = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left) * 16 + (channel_idx % 4) * 4; float4 in0 = convert_float4(vload4(0, input0 + offset0*isFull.x)); @@ -181,7 +181,7 @@ __kernel void binary_buf_c4_c16_c16(__private int global_dim0, __private int glo const int dst_width = shape.z + output_pad_left + output_pad_right; const int channe_out_idx = channel_idx >> 2; - const int offset0 = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset0 = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; const int offset1 = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input1_pad_left) * 16 + (channel_idx % 4) * 4; const int dst_offset = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left) * 16 + (channel_idx % 4) * 4; @@ -231,7 
+231,7 @@ __kernel void binary_buf_c16_c4_c16(__private int global_dim0, __private int glo const int dst_width = shape.z + output_pad_left + output_pad_right; const int channe_out_idx = channel_idx >> 2; - const int offset1 = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset1 = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; const int offset0 = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left) * 16 + (channel_idx % 4) * 4; const int dst_offset = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left) * 16 + (channel_idx % 4) * 4; @@ -277,7 +277,7 @@ __kernel void prelu_buf_c4_c4(__private int global_dim0, __private int global_di const int batch_idx = get_global_id(2); const int channel_idx = get_global_id(1); - const int offset0 = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset0 = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; const int offset1 = channel_idx * 4; float4 in0 = convert_float4(vload4(0, input0 + offset0)); @@ -304,7 +304,7 @@ __kernel void prelu_buf_c4_c16(__private int global_dim0, __private int global_d const int dst_width = shape.z + output_pad_left + output_pad_right; const int channe_out_idx = channel_idx >> 2; - const int offset0 = (((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset0 = (((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; const int offset1 = channel_idx * 4; const int offset = (((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left) * 16 + (channel_idx % 4) * 4; @@ -385,11 +385,11 @@ __kernel void prelu_buf_c16_c4(__private int global_dim0, __private int global_d const int channel_idx = get_group_id(1); const int sglid = get_sub_group_local_id(); const int src_width = shape.z + input0_pad_left + input0_pad_right; - const int width_height = shape.z * shape.y * 4; + const int batch_width_height = shape.x * shape.z * shape.y * 4; const int offset0 = (((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left) * 16; const int offset1 = channel_idx * 16; - const int offset = (((batch_idx*channel4+(channel_idx<<2))*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset = (((batch_idx+(channel_idx<<2)*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; float4 in0 = convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0 + offset0)))); float4 in1 = (float4)(AS_INPUT_DATA(INTEL_SUB_GROUP_READ((__global INTEL_DATA*)(input1 + offset1)))); @@ -400,7 +400,7 @@ __kernel void prelu_buf_c16_c4(__private int global_dim0, __private int global_d const int lid_y = sglid / 4; int block_size = w_idx + 4 > shape.z ? 
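The recurring offset change in these subgroup kernels, from batch_idx*channel4+channel_idx to batch_idx+channel_idx*shape.x, reflects the new C4 buffer layout in which channel blocks are the outermost dimension and the batch index varies inside each block. The two formulas side by side, written as hypothetical helpers (shape.x is the batch size):

// Sketch: NC4HW4 element-block offsets before and after this patch.
// Old order: N, C/4, H, W   -> batch stride is C4*H*W*4
// New order: C/4, N, H, W   -> batch stride is H*W*4, block stride is N*H*W*4
inline int c4_offset_old(int n, int c4, int h, int w, int C4, int H, int W) {
    return (((n * C4 + c4) * H + h) * W + w) * 4;
}
inline int c4_offset_new(int n, int c4, int h, int w, int N, int H, int W) {
    return (((n + c4 * N) * H + h) * W + w) * 4;
}

The same relayout is why width_height becomes batch_width_height (shape.x*shape.z*shape.y*4): stepping to the next C4 block now skips over every batch's H*W plane instead of just one.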
(shape.z % 4) : 4; for (int i = 0; i < block_size; i++) { - output[offset + i * 4 + lid_y * width_height + lid_x] = (OUTPUT_TYPE)out[i]; + output[offset + i * 4 + lid_y * batch_width_height + lid_x] = (OUTPUT_TYPE)out[i]; } } @@ -478,11 +478,11 @@ __kernel void binary_buf_c16_c16_c4(__private int global_dim0, __private int glo const int sglid = get_sub_group_local_id(); const int src0_width = shape.z + input0_pad_left + input0_pad_right; const int src1_width = shape.z + input1_pad_left + input1_pad_right; - const int width_height = shape.z * shape.y * 4; + const int batch_width_height = shape.x * shape.z * shape.y * 4; const int offset0 = (((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src0_width+w_idx+input0_pad_left) * 16; const int offset1 = (((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src1_width+w_idx+input1_pad_left) * 16; - const int offset = (((batch_idx*channel4+(channel_idx << 2))*shape.y+h_idx)*shape.z+w_idx) * 4; + const int offset = (((batch_idx+(channel_idx << 2)*shape.x)*shape.y+h_idx)*shape.z+w_idx) * 4; float4 in0 = isFull.x ? convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0 + offset0)))) : (float4)(input0[0]); float4 in1 = isFull.y ? convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input1 + offset1)))) : (float4)(input1[0]); @@ -496,6 +496,6 @@ __kernel void binary_buf_c16_c16_c4(__private int global_dim0, __private int glo const int lid_y = sglid / 4; int block_size = w_idx + 4 > shape.z ? (shape.z % 4) : 4; for (int i = 0; i < block_size; i++) { - output[offset + i * 4 + lid_y * width_height + lid_x] = (OUTPUT_TYPE)out[i]; + output[offset + i * 4 + lid_y * batch_width_height + lid_x] = (OUTPUT_TYPE)out[i]; } } diff --git a/source/backend/opencl/execution/cl/buffer_convert_buf.cl b/source/backend/opencl/execution/cl/buffer_convert_buf.cl index 1563c65ea..6a4b4e220 100644 --- a/source/backend/opencl/execution/cl/buffer_convert_buf.cl +++ b/source/backend/opencl/execution/cl/buffer_convert_buf.cl @@ -7,231 +7,71 @@ if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \ return; \ } -// convert data from buffer(nhwc) to buffer(nc4hw4) -__kernel void nhwc_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS - __global const INPUT_TYPE *input_ptr, - __private const int height, - __private const int width, __private const int channels, - __global OUTPUT_TYPE *output) { - int image_width_idx = get_global_id(0); - int image_height_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); - - const int batch_idx = image_height_idx / height; - const int height_idx = image_height_idx % height; - const int width_idx = image_width_idx % width; - const int channel_4_idx = (image_width_idx / width) << 2; - const int buffer_offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_4_idx; - - const int remain_channel = channels - channel_4_idx; - float4 values = convert_float4(vload4(0, input_ptr + buffer_offset)); - - if (remain_channel == 3) { - values.w = 0; - } else if (remain_channel == 2) { - values.z = 0; - values.w = 0; - } else if (remain_channel == 1) { - values.y = 0; - values.z = 0; - values.w = 0; - } - const int out_offset = (((batch_idx * ((channels+3)/4) + channel_4_idx/4) * height + height_idx) * width + width_idx)*4; - vstore4(CONVERT_OUTPUT4(values), 0, output+out_offset); -} - -// convert data from buffer(nchw) to buffer(nc4hw4) -__kernel void nchw_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS - __global const INPUT_TYPE *input_ptr, - __private const 
int height, __private const int width, __private const int channels, - __global OUTPUT_TYPE *output) { - int image_width_idx = get_global_id(0); - int image_height_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); - - const int batch_idx = image_height_idx / height; - const int height_idx = image_height_idx % height; - const int width_idx = image_width_idx % width; - const int channel_4_idx = image_width_idx / width << 2; - const int buffer_offset = ((batch_idx * channels + channel_4_idx) * height + height_idx) * width + width_idx; - - const int remain_channel = channels - channel_4_idx; - const int height_width_size = height * width; - float4 output_values = 0; - - if (remain_channel >= 4) { - int offset = buffer_offset; - output_values.x = (float)*(input_ptr + offset); - offset += height_width_size; - output_values.y = (float)*(input_ptr + offset); - offset += height_width_size; - output_values.z = (float)*(input_ptr + offset); - offset += height_width_size; - output_values.w = (float)*(input_ptr + offset); - } else if (remain_channel == 3) { - int offset = buffer_offset; - output_values.x = (float)*(input_ptr + offset); - offset += height_width_size; - output_values.y = (float)*(input_ptr + offset); - offset += height_width_size; - output_values.z = (float)*(input_ptr + offset); - } else if (remain_channel == 2) { - int offset = buffer_offset; - output_values.x = (float)*(input_ptr + offset); - offset += height_width_size; - output_values.y = (float)*(input_ptr + offset); - } else if (remain_channel == 1) { - int offset = buffer_offset; - output_values.x = (float)*(input_ptr + offset); +#define GLOBAL_SIZE_3_DIMS __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, +#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \ + if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \ + return; \ } - const int out_offset = (((batch_idx * ((channels+3)/4) + channel_4_idx/4) * height + height_idx) * width + width_idx)*4; - vstore4(CONVERT_OUTPUT4(output_values), 0, output+out_offset); -} - - -__kernel void nchw_buffer_to_nchw_buffer(GLOBAL_SIZE_2_DIMS - __global INPUT_TYPE *input_ptr, - __private const int height, __private const int width, __private const int channels, - __private const int input_pad_left, __private const int input_pad_right, - __private const int output_pad_left, __private const int output_pad_right, - __global OUTPUT_TYPE *output) { - int image_width_idx = get_global_id(0); - int image_height_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); - - const int src_width = width + input_pad_left + input_pad_right; - const int dst_width = width + output_pad_left + output_pad_right; - const int batch_idx = image_height_idx / height; - const int height_idx = image_height_idx % height; - const int width_idx = image_width_idx % width; - const int channel_idx = image_width_idx / width; - const int in_offset = ((batch_idx * channels + channel_idx) * height + height_idx) * src_width + width_idx + input_pad_left; - const int out_offset = ((batch_idx * channels + channel_idx) * height + height_idx) * dst_width + width_idx + output_pad_left; - - output[out_offset] = (OUTPUT_TYPE)input_ptr[in_offset]; -} - -// convert data from image(b h, ic/4 w ic4) to buffer(nhwc) -__kernel void nc4hw4_buffer_to_nhwc_buffer(GLOBAL_SIZE_2_DIMS - __global OUTPUT_TYPE *output, - __private const int height, __private const int width, - 
__private const int channels, - __global INPUT_TYPE *input_ptr) { - int image_width_idx = get_global_id(0); - int image_height_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); +#define MNN_DATA_FORMAT_NCHW 0 +#define MNN_DATA_FORMAT_NHWC 1 +#define MNN_DATA_FORMAT_NC4HW4 2 +#define MNN_DATA_FORMAT_C4NHW4 3 +__kernel void buffer_convert_to_buffer(GLOBAL_SIZE_3_DIMS + __global const INPUT_TYPE *input_ptr, + __private const int4 shape, // N C H W + __global OUTPUT_TYPE *output_ptr +) { - const int batch_idx = image_height_idx / height; - const int height_idx = image_height_idx % height; - const int width_idx = image_width_idx % width; - const int channel_4_idx = (image_width_idx / width) << 2; - const int buffer_offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_4_idx; + int wh = get_global_id(0); + int c = get_global_id(1); + int n = get_global_id(2); - const int in_offset = (((batch_idx * ((channels+3)/4) + channel_4_idx/4) * height + height_idx) * width + width_idx)*4; + DEAL_NON_UNIFORM_DIM3(wh, c, n); + int w = wh % shape.w; + int h = wh / shape.w; - float4 values = convert_float4(vload4(0, input_ptr+in_offset)); - const int remain_channel = channels - channel_4_idx; - if (remain_channel >= 4) { - vstore4(CONVERT_OUTPUT4(values), 0, output + buffer_offset); - } else if (remain_channel == 3) { - int offset = buffer_offset; - output[offset] = (OUTPUT_TYPE)values.x; - offset++; - output[offset] = (OUTPUT_TYPE)values.y; - offset++; - output[offset] = (OUTPUT_TYPE)values.z; - } else if (remain_channel == 2) { - int offset = buffer_offset; - output[offset] = (OUTPUT_TYPE)values.x; - offset++; - output[offset] = (OUTPUT_TYPE)values.y; - } else if (remain_channel == 1) { - int offset = buffer_offset; - output[offset] = (OUTPUT_TYPE)values.x; - } -} +#if INPUT_FORMAT == MNN_DATA_FORMAT_NCHW + int input_offset = ((n * shape.y + c) * shape.z + h) * shape.w + w; +#elif INPUT_FORMAT == MNN_DATA_FORMAT_NHWC + int input_offset = ((n * shape.z + h) * shape.w + w) * shape.y + c; +#elif INPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4 + int input_offset = ((((c / 4) * shape.x + n) * shape.z + h) * shape.w + w) * 4 + (c % 4); +#endif -// convert data from buffer(nc4hw4) to buffer(nchw) -__kernel void nc4hw4_buffer_to_nchw_buffer(GLOBAL_SIZE_2_DIMS - __global OUTPUT_TYPE *output, - __private const int height, __private const int width, - __private const int channels, - __global INPUT_TYPE *input_ptr) { - int image_width_idx = get_global_id(0); - int image_height_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); +#if OUTPUT_FORMAT == MNN_DATA_FORMAT_NCHW + int output_offset = ((n * shape.y + c) * shape.z + h) * shape.w + w; +#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NHWC + int output_offset = ((n * shape.z + h) * shape.w + w) * shape.y + c; +#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4 + int output_offset = ((((c / 4) * shape.x + n) * shape.z + h) * shape.w + w) * 4 + (c % 4); +#endif - const int batch_idx = image_height_idx / height; - const int height_idx = image_height_idx % height; - const int width_idx = image_width_idx % width; - int channel_4_idx = (image_width_idx / width) * 4; - int buffer_offset = ((batch_idx * channels + channel_4_idx) * height + height_idx) * width + width_idx; - - const int in_offset = (((batch_idx * ((channels+3)/4) + channel_4_idx/4) * height + height_idx) * width + width_idx)*4; - float4 values = convert_float4(vload4(0, input_ptr+in_offset)); - - const int height_width_size = 
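The per-pair conversion kernels removed above are replaced by a single buffer_convert_to_buffer kernel whose source and destination layouts are chosen at compile time via INPUT_FORMAT and OUTPUT_FORMAT. The three offset formulas it switches between can be restated as plain helpers (an illustrative restatement, with shape packed as N,C,H,W):

// Sketch: the per-element offsets selected by INPUT_FORMAT / OUTPUT_FORMAT above.
// s = (N, C, H, W); the NC4HW4 form stores channel blocks outermost with batch inside.
inline int off_nchw(int n, int c, int h, int w, int4 s) {
    return ((n * s.y + c) * s.z + h) * s.w + w;
}
inline int off_nhwc(int n, int c, int h, int w, int4 s) {
    return ((n * s.z + h) * s.w + w) * s.y + c;
}
inline int off_nc4hw4(int n, int c, int h, int w, int4 s) {
    return ((((c / 4) * s.x + n) * s.z + h) * s.w + w) * 4 + (c % 4);
}

Each work-item copies exactly one element over a {W*H, C, N} dispatch, so correctness only depends on the two selected formulas agreeing on which logical (n,c,h,w) they address.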
height * width; - - const int remain_channel = channels - channel_4_idx; - - if (remain_channel >= 4) { - int offset = buffer_offset; - output[offset] = (OUTPUT_TYPE)values.x; - offset += height_width_size; - output[offset] = (OUTPUT_TYPE)values.y; - offset += height_width_size; - output[offset] = (OUTPUT_TYPE)values.z; - offset += height_width_size; - output[offset] = (OUTPUT_TYPE)values.w; - } else if (remain_channel == 3) { - int offset = buffer_offset; - output[offset] = (OUTPUT_TYPE)values.x; - offset += height_width_size; - output[offset] = (OUTPUT_TYPE)values.y; - offset += height_width_size; - output[offset] = (OUTPUT_TYPE)values.z; - } else if (remain_channel == 2) { - int offset = buffer_offset; - output[offset] = (OUTPUT_TYPE)values.x; - offset += height_width_size; - output[offset] = (OUTPUT_TYPE)values.y; - } else if (remain_channel == 1) { - int offset = buffer_offset; - output[offset] = (OUTPUT_TYPE)values.x; - } + output_ptr[output_offset] = input_ptr[input_offset]; } -__kernel void nc4hw4_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS +__kernel void buffer_copy_to_buffer(GLOBAL_SIZE_2_DIMS __global const INPUT_TYPE *input_ptr, - __private const int2 output_shape, - __private const int2 src_stride, - __private const int2 dst_stride, - __global OUTPUT_TYPE *output + __global OUTPUT_TYPE *output_ptr, + __private const int size // N C H W ) { - int image_width_idx = get_global_id(0); - int image_height_idx = get_global_id(1); + const int x = get_global_id(0); + const int y = get_global_id(1); - DEAL_NON_UNIFORM_DIM2(image_width_idx, image_height_idx); - - const int batch_idx = image_height_idx / output_shape.x; - const int height_idx = image_height_idx % output_shape.x; - const int width_idx = image_width_idx % output_shape.y; - const int channel_block_idx = image_width_idx / output_shape.y; - int2 src_bc_offset = src_stride * (int2)(batch_idx, channel_block_idx); - int2 dst_bc_offset = dst_stride * (int2)(batch_idx, channel_block_idx); - int src_buffer_offset = - (((src_bc_offset.x + src_bc_offset.y) * output_shape.x + height_idx) * output_shape.y + width_idx) * 4; - int dst_buffer_offset = - (((dst_bc_offset.x + dst_bc_offset.y) * output_shape.x + height_idx) * output_shape.y + width_idx) * 4; - - vstore4(CONVERT_OUTPUT4(vload4(0, input_ptr + src_buffer_offset)), 0, output+dst_buffer_offset); + DEAL_NON_UNIFORM_DIM2(x, y); + const int offset = x << 2; +#ifdef PACK_LEAVE + if(offset + 3 >= size){ + for(int i = 0; i < size - offset; ++i){ + output_ptr[offset + i] = (OUTPUT_TYPE)input_ptr[offset + i]; + } + } else { +#endif + vstore4(CONVERT_OUTPUT4(vload4(0, input_ptr+offset)), 0, output_ptr+offset); +#ifdef PACK_LEAVE + } +#endif } // convert kernel : from buffer(oihw) to image(oc/4 h w , ic oc4) diff --git a/source/backend/opencl/execution/cl/buffer_convert_quant.cl b/source/backend/opencl/execution/cl/buffer_convert_quant.cl index 5043e1418..8062b1a49 100644 --- a/source/backend/opencl/execution/cl/buffer_convert_quant.cl +++ b/source/backend/opencl/execution/cl/buffer_convert_quant.cl @@ -155,28 +155,29 @@ __kernel void conv2d_1x1_weight_quant_image(GLOBAL_SIZE_2_DIMS __private const int input_channel, __private const int output_channel) { - int x = get_global_id(0); // ic / 16 + int x = get_global_id(0); // ic / 32 int y = get_global_id(1); // oc DEAL_NON_UNIFORM_DIM2(x, y); - const int xin = x << 4; #ifdef USE_LOW_BIT_WEIGHT_INT4 + const int xin = x << 5; #ifdef CHANNEL_LEAVE - uchar8 out = 0; + uchar16 out = 0; uchar *out_ptr = (uchar*)&out; - for(int i = 0; i < 8; ++i){ + 
for(int i = 0; i < 16; ++i){ int index0 = y * input_channel + xin + i * 2; int index1 = y * input_channel + xin + i * 2 + 1; uchar s0 = input_ptr[index0/2]; uchar s1 = input_ptr[index1/2]; out_ptr[i] = ((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f)); } - write_imageui(output, (int2)(y, x), convert_uint4(as_ushort4(out))); + write_imagei(output, (int2)(y, x), as_int4(out)); #else const int inputOffset = (y * input_channel + xin)/2; - write_imageui(output, (int2)(y, x), convert_uint4(as_ushort4(vload8(0, input_ptr + inputOffset)))); + write_imagei(output, (int2)(y, x), as_int4(vload16(0, input_ptr + inputOffset))); #endif #else + const int xin = x << 4; const int inputOffset = y * input_channel + xin; write_imagei(output, (int2)(y, x), as_int4(vload16(0, input_ptr + inputOffset))); #endif @@ -205,7 +206,6 @@ __kernel void conv2d_1x1_ic_oc_weight_quant_buffer(GLOBAL_SIZE_2_DIMS #ifdef USE_LOW_BIT_WEIGHT_INT4 const int inputOffset = (yin * input_channel + xin) / 2; const int outputOffset = ((x * outputChannelC4 + y) * icPack * ocPack) / 2; -#ifdef CHANNEL_LEAVE for(int i = 0; i < icPack; ++i){ for(int j = 0; j < ocPack / 2; ++j){ int index0 = (yin + j * 2) * input_channel + xin + i; @@ -217,18 +217,6 @@ __kernel void conv2d_1x1_ic_oc_weight_quant_buffer(GLOBAL_SIZE_2_DIMS output_ptr[outputOffset + i * (ocPack / 2) + j] = s0 | s1; } } -#else - for(int i = 0; i < icPack/2; ++i){ - for(int j = 0; j < ocPack / 2; ++j){ - char s0 = input_ptr[inputOffset + (j * 2) * (input_channel / 2) + i]; - char s1 = input_ptr[inputOffset + (j * 2 + 1) * (input_channel / 2) + i]; - char d0 = (s0 & 0xf0) | ((s1 & 0xf0) >> 4); - char d1 = ((s0 & 0x0f) << 4) | (s1 & 0x0f); - output_ptr[outputOffset + (i * 2) * (ocPack / 2) + j] = d0; - output_ptr[outputOffset + (i * 2 + 1) * (ocPack / 2) + j] = d1; - } - } -#endif #else const int inputOffset = yin * input_channel + xin; const int outputOffset = (x * outputChannelC4 + y) * icPack * ocPack; diff --git a/source/backend/opencl/execution/cl/cast_buf.cl b/source/backend/opencl/execution/cl/cast_buf.cl index d9c1fb8e7..247071e8b 100644 --- a/source/backend/opencl/execution/cl/cast_buf.cl +++ b/source/backend/opencl/execution/cl/cast_buf.cl @@ -2,36 +2,46 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif -#define GLOBAL_SIZE_3_DIMS \ -__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, +#define GLOBAL_SIZE_2_DIMS \ +__private const int global_size_dim0, __private const int global_size_dim1, -#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \ - if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \ +#define DEAL_NON_UNIFORM_DIM2(input1, input2) \ + if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \ return; \ } -__kernel void cast_buf(GLOBAL_SIZE_3_DIMS +__kernel void cast_buf(GLOBAL_SIZE_2_DIMS __global INPUT_TYPE* input, __global OUTPUT_TYPE* output, - __private const int width, - __private const int height, - __private const int channelBlock + __private const int size ) { - const int width_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_channel_idx = get_global_id(2); + const int idx = get_global_id(0); + const int idy = get_global_id(1); - DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx); - - const int batch_idx = batch_channel_idx / channelBlock; - const int channel_idx = batch_channel_idx % channelBlock; - - const int inp_offset = ((((batch_idx * 
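With USE_LOW_BIT_WEIGHT_INT4 every byte of the weight buffer carries two 4-bit values, and each work-item now packs 32 input channels (x << 5) into one image texel instead of 16. A tiny helper, not part of the patch, showing the nibble convention the CHANNEL_LEAVE path above assumes (even logical index in the high nibble, odd in the low nibble):

// Sketch: fetching the 4-bit weight at logical index i from a nibble-packed buffer.
inline uchar load_weight_int4(__global const uchar* packed, int i) {
    const uchar b = packed[i / 2];            // two weights per byte
    return (i % 2) == 0 ? (uchar)(b >> 4)     // even index lives in the high nibble
                        : (uchar)(b & 0x0f);  // odd index lives in the low nibble
}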
channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4; -#ifdef TO_BOOL - int4 value = convert_int4(vload4(0, input + inp_offset)); - value = value == (int4)0 ? (int4)0 : (int4)1; - vstore4(CONVERT_OUTPUT4(value), 0, output + inp_offset); -#else - vstore4(CONVERT_OUTPUT4(vload4(0, input + inp_offset)), 0, output + inp_offset); + DEAL_NON_UNIFORM_DIM2(idx, idy); + const int inp_offset = idx * 4; +#ifdef PACK_LEAVE + if(inp_offset + 3 >= size){ + int remain = size - inp_offset; + for(int i = 0; i < remain; ++i){ + #ifdef TO_BOOL + int value = (int)input[inp_offset + i]; + value = value == 0 ? 0 : 1; + output[inp_offset + i] = (OUTPUT_TYPE)value; + #else + output[inp_offset + i] = (OUTPUT_TYPE)input[inp_offset + i]; + #endif + } + }else { +#endif + #ifdef TO_BOOL + int4 value = convert_int4(vload4(0, input + inp_offset)); + value = value == (int4)0 ? (int4)0 : (int4)1; + vstore4(CONVERT_OUTPUT4(value), 0, output + inp_offset); + #else + vstore4(CONVERT_OUTPUT4(vload4(0, input + inp_offset)), 0, output + inp_offset); + #endif +#ifdef PACK_LEAVE + } #endif } diff --git a/source/backend/opencl/execution/cl/conv_2d_buf.cl b/source/backend/opencl/execution/cl/conv_2d_buf.cl index 9aed2e670..07f8d96fe 100644 --- a/source/backend/opencl/execution/cl/conv_2d_buf.cl +++ b/source/backend/opencl/execution/cl/conv_2d_buf.cl @@ -9,6 +9,77 @@ return; \ } +#ifdef CONV_LOCAL_SIZE +__kernel +void conv_2d_1x1_local(__private const int out_w_blocks, + __global const FLOAT *input, + __global const FLOAT *kernel_ptr, + __global const FLOAT *bias_ptr, + __global FLOAT *output, + __private const int in_c_block, + __private const int batch, + __private const int out_h, + __private const int out_w, + __private const int out_c_block, + __private const int out_c_pack) { + + const int lid = get_local_id(0); + const int out_c_w_idx = get_global_id(1); //c/4 w + const int out_b_h_idx = get_global_id(2); //b h + + COMPUTE_FLOAT4 local sum[CONV_LOCAL_SIZE]; + + const int out_c_idx = out_c_w_idx / out_w_blocks; + const int out_w_idx = out_c_w_idx % out_w_blocks; + const int out_b_idx = out_b_h_idx / out_h; // equal to in_b_idx + const int out_h_idx = out_b_h_idx % out_h; // equal to in_h_idx + + COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias_ptr)); + COMPUTE_FLOAT4 out0 = (COMPUTE_FLOAT4)0; + + int offset = out_c_idx*4; + int inp_offset = (((out_b_idx+in_c_block*batch)*out_h + out_h_idx)* out_w + out_w_idx) << 2; + + const int inp_add = batch*out_h*out_w*4; + for (ushort in_channel_block_idx = lid; in_channel_block_idx < in_c_block; in_channel_block_idx+=CONV_LOCAL_SIZE) { + + int offset = mad24(in_channel_block_idx*4, out_c_pack, out_c_idx*4); + + COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset+in_channel_block_idx*inp_add)); + COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset)); + COMPUTE_FLOAT4 weights1 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack)); + COMPUTE_FLOAT4 weights2 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack)); + COMPUTE_FLOAT4 weights3 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset + out_c_pack + out_c_pack + out_c_pack)); + + out0 = mad(in0.x, weights0, out0); + out0 = mad(in0.y, weights1, out0); + out0 = mad(in0.z, weights2, out0); + out0 = mad(in0.w, weights3, out0); + } + + sum[lid] = out0; + barrier(CLK_LOCAL_MEM_FENCE); + for(int i = CONV_LOCAL_SIZE/2; i > 0; i /= 2){ + if (lid < i) + sum[lid] = sum[lid] + sum[lid + i]; + 
barrier(CLK_LOCAL_MEM_FENCE); + } + out0 = sum[0] + bias0; + if(lid == 0){ +#ifdef RELU + out0 = fmax(out0, (COMPUTE_FLOAT4)0); +#endif + +#ifdef RELU6 + out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); +#endif + + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_h + out_h_idx)* out_w + out_w_idx)*4; + vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); + } +} +#endif + __kernel void conv_2d_1x1_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, __global const FLOAT *input, @@ -18,6 +89,7 @@ void conv_2d_1x1_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, __private const int in_c_block, __private const int out_h, __private const int out_w, + __private const int out_b, __private const int out_c_block, __private const int out_c_pack) { @@ -38,15 +110,11 @@ void conv_2d_1x1_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, COMPUTE_FLOAT4 out3 = out0; const int intput_width_idx0 = out_w4_idx; - - + int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + intput_width_idx0) << 2; int offset = out_c_idx*4; - int inp_offset = (((out_b_idx*in_c_block)*out_h + out_h_idx)* out_w + intput_width_idx0) << 2; - - const int inp_add = out_h*out_w*4; + const int inp_add = out_b*out_h*out_w*4; for (ushort in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) { - int offset = mad24(in_channel_block_idx*4, out_c_pack, out_c_idx*4); COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset)); @@ -95,7 +163,7 @@ void conv_2d_1x1_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_block + out_c_idx)*out_h + out_h_idx)* out_w + out_w4_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx * out_b)*out_h + out_h_idx)* out_w + out_w4_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_w - out_w4_idx; if (remain >= 4) { @@ -123,6 +191,7 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, __private const int in_c_block, __private const int out_h, __private const int out_w, + __private const int out_b, __private const int out_c_block, __private const int out_c_pack) { @@ -148,12 +217,12 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, COMPUTE_FLOAT4 out7 = out4; const int intput_width_idx0 = out_w4_idx; + int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + intput_width_idx0)<<2; + int offset = out_c_idx*8; + const int inp_add = out_b*out_h*out_w*4; for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) { - int offset = mad24(in_channel_block_idx*4, out_c_pack, out_c_idx*8); - const int inp_offset = - (((out_b_idx*in_c_block + in_channel_block_idx)*out_h + out_h_idx)* out_w + intput_width_idx0)*4; COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset)); @@ -208,6 +277,9 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out7 = mad(in3.y, weights3, out7); out7 = mad(in3.z, weights5, out7); out7 = mad(in3.w, weights7, out7); + + offset += 4 * out_c_pack; + inp_offset += inp_add; } #ifdef RELU @@ -234,10 +306,10 @@ void conv_2d_1x1_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_block + 
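conv_2d_1x1_local splits the reduction over input channel blocks across a whole work-group: each lane accumulates a strided subset, the partial sums go to local memory, and a tree reduction collapses them before lane 0 adds the bias and writes the result. The reduction skeleton in isolation, as a sketch with a plain float accumulator (LOCAL_SIZE stands in for the CONV_LOCAL_SIZE build define and must be set at compile time, e.g. -DLOCAL_SIZE=64):

// Sketch: work-group tree reduction over per-lane partial sums.
__kernel void wg_reduce_sketch(__global const float* partials, __global float* result) {
    __local float sum[LOCAL_SIZE];               // one slot per lane, as in conv_2d_1x1_local
    const int lid = get_local_id(0);
    sum[lid] = partials[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = LOCAL_SIZE / 2; i > 0; i >>= 1) {
        if (lid < i) {
            sum[lid] += sum[lid + i];
        }
        barrier(CLK_LOCAL_MEM_FENCE);            // every lane must reach the barrier each round
    }
    if (lid == 0) {
        result[get_group_id(0)] = sum[0];        // lane 0 publishes the group's sum
    }
}

As in the kernel above, the barrier sits outside the if so that lanes which skip the add still synchronize, and the halving tree only covers every lane when the local size is a power of two.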
out_c_idx*2)*out_h + out_h_idx)* out_w + out_w4_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*2*out_b)*out_h + out_h_idx)* out_w + out_w4_idx)*4; __global FLOAT * _tempoutput = output + out_offset; - __global FLOAT * _tempoutput1 = _tempoutput + 4*out_h*out_w; + __global FLOAT * _tempoutput1 = _tempoutput + 4*out_h*out_w*out_b; #ifdef BLOCK_LEAVE const int remain = out_w - out_w4_idx; @@ -287,6 +359,7 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, __private const int in_c_block, __private const int out_h, __private const int out_w, + __private const int out_b, __private const int out_c_block, __private const int out_c_pack) { @@ -308,11 +381,10 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, COMPUTE_FLOAT4 out5 = out4; const int intput_width_idx0 = out_w2_idx; + int inp_offset = ((out_b_idx * out_h + out_h_idx)* out_w + intput_width_idx0)<<2; + int offset = out_c_idx*8; + const int inp_add = out_b*out_h*out_w*4; for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) { - - int offset = mad24(in_channel_block_idx*4, out_c_pack, out_c_idx*8); - const int inp_offset = - (((out_b_idx*in_c_block + in_channel_block_idx)*out_h + out_h_idx)* out_w + intput_width_idx0)*4; COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset)); @@ -344,6 +416,9 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out5 = mad(in1.y, weights3, out5); out5 = mad(in1.z, weights5, out5); out5 = mad(in1.w, weights7, out5); + + offset += 4 * out_c_pack; + inp_offset += inp_add; } #ifdef RELU @@ -362,11 +437,11 @@ void conv_2d_1x1_c8h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out5 = clamp(out5, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_block + out_c_idx*2)*out_h + out_h_idx)* out_w + out_w2_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*2*out_b)*out_h + out_h_idx)* out_w + out_w2_idx)*4; __global FLOAT * _tempoutput = output + out_offset; - __global FLOAT * _tempoutput1 = _tempoutput + 4*out_h*out_w; + __global FLOAT * _tempoutput1 = _tempoutput + 4*out_h*out_w*out_b; #ifdef BLOCK_LEAVE const int remain = out_w - out_w2_idx; @@ -405,6 +480,7 @@ void conv_2d_1x1_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, __private const int in_c_block, __private const int out_h, __private const int out_w, + __private const int out_b, __private const int out_c_block, __private const int out_c_pack) { @@ -420,12 +496,12 @@ void conv_2d_1x1_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, COMPUTE_FLOAT4 out0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias_ptr)); const int intput_width_idx0 = out_w_idx; + int offset = out_c_idx*4; + int inp_offset = ((out_b_idx * out_h + out_h_idx) * out_w + intput_width_idx0)*4; + const int inp_add = out_b*out_h*out_w*4; for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) { - int offset = mad24(in_channel_block_idx*4, out_c_pack, out_c_idx*4); - const int inp_offset = - (((out_b_idx*in_c_block + in_channel_block_idx)*out_h + out_h_idx)* out_w + intput_width_idx0)*4; COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); COMPUTE_FLOAT4 weights0 = CONVERT_COMPUTE_FLOAT4(vload4(0, kernel_ptr + offset)); @@ -437,6 +513,9 @@ void conv_2d_1x1_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out0 = mad(in0.y, weights1, 
out0); out0 = mad(in0.z, weights2, out0); out0 = mad(in0.w, weights3, out0); + + offset += 4 * out_c_pack; + inp_offset += inp_add; } #ifdef RELU @@ -447,7 +526,7 @@ void conv_2d_1x1_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_block + out_c_idx)*out_h + out_h_idx)* out_w + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*out_b)*out_h + out_h_idx)* out_w + out_w_idx)*4; vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); } @@ -462,6 +541,7 @@ void conv_2d_1x1_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, __private const int in_c_block, __private const int out_h, __private const int out_w, + __private const int out_b, __private const int out_c_block, __private const int out_c_pack) { @@ -481,12 +561,11 @@ void conv_2d_1x1_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, COMPUTE_FLOAT4 out1 = out0; const int intput_width_idx0 = out_w2_idx; + int offset = out_c_idx*4; + int inp_offset = ((out_b_idx*out_h + out_h_idx)* out_w + intput_width_idx0)*4; + const int inp_add = out_b*out_h*out_w*4; for (int in_channel_block_idx = 0; in_channel_block_idx < in_c_block; ++in_channel_block_idx) { - - int offset = mad24(in_channel_block_idx*4, out_c_pack, out_c_idx*4); - const int inp_offset = - (((out_b_idx*in_c_block + in_channel_block_idx)*out_h + out_h_idx)* out_w + intput_width_idx0)*4; COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); COMPUTE_FLOAT4 in1 = CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset)); @@ -505,6 +584,9 @@ void conv_2d_1x1_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out1 = mad(in1.y, weights1, out1); out1 = mad(in1.z, weights2, out1); out1 = mad(in1.w, weights3, out1); + + offset += 4 * out_c_pack; + inp_offset += inp_add; } #ifdef RELU @@ -517,7 +599,7 @@ void conv_2d_1x1_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int out_w_blocks, out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_block + out_c_idx)*out_h + out_h_idx)* out_w + out_w2_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*out_b)*out_h + out_h_idx)* out_w + out_w2_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_w - out_w2_idx; @@ -541,6 +623,7 @@ void conv_2d_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -580,7 +663,7 @@ void conv_2d_c4h1w1(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + kw_start) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { for(int ix = in_w_idx_start; ix < in_w_idx_end; ix += dilate_hw.y) { - int inp_offset = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + ix) * 4; + int inp_offset = (((out_b_idx + in_c_idx * batch) * in_hw.x + iy) * in_hw.y + ix) * 4; COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); const int filter_w_inc = (ix-in_w_idx_start)/dilate_hw.y; @@ -606,8 +689,7 @@ void conv_2d_c4h1w1(GLOBAL_SIZE_2_DIMS #ifdef RELU6 out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - - const int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + 
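Across the 1x1 kernels the per-iteration mad24 index computation is replaced by offsets that are initialized once and then advanced by constant strides (offset += 4*out_c_pack, inp_offset += inp_add). The same strength-reduction idea in isolation, as an illustrative helper with scalar loads:

// Sketch: accumulate over channel blocks by stepping two offsets with fixed strides
// instead of recomputing them from the loop index each iteration.
inline float dot_channel_blocks(__global const float* input,
                                __global const float* weights,
                                int inp_offset, int wgt_offset,
                                int in_c_blocks, int inp_stride, int wgt_stride) {
    float acc = 0.0f;
    for (int ic = 0; ic < in_c_blocks; ++ic) {
        acc += input[inp_offset] * weights[wgt_offset];
        inp_offset += inp_stride;    // next input channel block (batch*out_h*out_w*4 above)
        wgt_offset += wgt_stride;    // next weight group (4*out_c_pack above)
    }
    return acc;
}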
out_w_idx)*4; vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); } @@ -621,6 +703,7 @@ void conv_2d_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -658,7 +741,7 @@ void conv_2d_c4h1w2(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { - const int inp_offset_base = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + 0) * 4; + const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4; for(int fw = 0; fw < filter_hw.y; fw++) { const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base; @@ -696,7 +779,7 @@ void conv_2d_c4h1w2(GLOBAL_SIZE_2_DIMS out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); if(out_w_idx + 1 >= out_hw.y) return; @@ -715,6 +798,7 @@ void conv_2d_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -756,7 +840,7 @@ void conv_2d_c4h1w4(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { - const int inp_offset_base = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + 0) * 4; + const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4; for(int fw = 0; fw < filter_hw.y; fw++) { const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base; @@ -812,7 +896,7 @@ void conv_2d_c4h1w4(GLOBAL_SIZE_2_DIMS out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.y - out_w_idx; @@ -840,6 +924,7 @@ void conv_2d_c4h4w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -879,7 +964,7 @@ void conv_2d_c4h4w1(GLOBAL_SIZE_2_DIMS for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) { //weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4 //index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0] - const int inp_offset_base = (out_b_idx * in_c_blocks + in_c_idx) * in_hw.x * in_hw.y * 4; + const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4; for(int iy = 0; iy < filter_hw.x; iy++) { int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4; @@ -937,7 +1022,7 @@ void conv_2d_c4h4w1(GLOBAL_SIZE_2_DIMS out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = 
(((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.x - out_h_idx; if(remain >= 4){ @@ -972,6 +1057,7 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -1016,7 +1102,7 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) { //weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4 //index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0] - const int inp_offset_base = (out_b_idx * in_c_blocks + in_c_idx) * in_hw.x * in_hw.y * 4; + const int inp_offset_base = (out_b_idx + in_c_idx * batch) * in_hw.x * in_hw.y * 4; for(int iy = 0; iy < filter_hw.x; iy++) { int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4; @@ -1107,7 +1193,7 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.x - out_h_idx; if(remain >= 4){ @@ -1125,12 +1211,12 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS }else if(remain == 1){ vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); } -#ifdef CHANNEL_LEAVE + #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks){ return; } -#endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + #endif + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; if(remain >= 4){ vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset); @@ -1151,12 +1237,12 @@ void conv_2d_c8h4w1(GLOBAL_SIZE_2_DIMS vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset); vstore4(CONVERT_FLOAT4(out2), 2 * out_hw.y, output+out_offset); vstore4(CONVERT_FLOAT4(out3), 3 * out_hw.y, output+out_offset); -#ifdef CHANNEL_LEAVE + #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks){ return; } -#endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + #endif + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset); vstore4(CONVERT_FLOAT4(out6), 2 * out_hw.y, output+out_offset); @@ -1173,6 +1259,7 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -1212,7 +1299,7 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS for(ushort in_c_idx = 0; in_c_idx < in_c_blocks; in_c_idx++) { //weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4 //index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0] - const int inp_offset_base = (out_b_idx * in_c_blocks + in_c_idx) * in_hw.x * in_hw.y * 4; + const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4; for(int iy = 
0; iy < filter_hw.x; iy++) { int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4; @@ -1270,7 +1357,7 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.x - out_h_idx; if(remain >= 2){ @@ -1279,12 +1366,12 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS }else if(remain == 1){ vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); } -#ifdef CHANNEL_LEAVE + #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks){ return; } -#endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + #endif + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; if(remain >= 2){ vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset); @@ -1294,12 +1381,12 @@ void conv_2d_c8h2w1(GLOBAL_SIZE_2_DIMS #else vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out1), out_hw.y, output+out_offset); -#ifdef CHANNEL_LEAVE + #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks){ return; } -#endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + #endif + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset); #endif @@ -1314,6 +1401,7 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -1361,7 +1449,7 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { - const int inp_offset_base = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + 0) * 4; + const int inp_offset_base = (((out_b_idx + in_c_idx * batch) * in_hw.x + iy) * in_hw.y + 0) * 4; for(int fw = 0; fw < filter_hw.y; fw++) { const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base; @@ -1450,7 +1538,7 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.y - out_w_idx; if(remain >= 4){ @@ -1463,10 +1551,10 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS }else if(remain == 1){ vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); } -#ifdef CHANNEL_LEAVE + #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks)return; -#endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + #endif + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; if(remain >= 4){ vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset); }else if(remain == 3){ @@ 
-1479,10 +1567,10 @@ void conv_2d_c8h1w4(GLOBAL_SIZE_2_DIMS } #else vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0, out1, out2, out3)), 0, output+out_offset); -#ifdef CHANNEL_LEAVE + #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks)return; -#endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + #endif + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset); #endif } diff --git a/source/backend/opencl/execution/cl/conv_2d_c16_subgroup_buf.cl b/source/backend/opencl/execution/cl/conv_2d_c16_subgroup_buf.cl index a64f6d9ab..2167f8014 100644 --- a/source/backend/opencl/execution/cl/conv_2d_c16_subgroup_buf.cl +++ b/source/backend/opencl/execution/cl/conv_2d_c16_subgroup_buf.cl @@ -48,6 +48,7 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b2( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -82,10 +83,10 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b2( const uint output_x_pitch = 4; const uint output_y_pitch = output_x_pitch * output_width; const uint output_fs_pitch = output_y_pitch * output_height; - const uint output_b_pitch = output_fs_pitch * ((output_channel + 3) / 4); + const uint output_b_pitch = output_fs_pitch * batch; - const uint output_offset = b * output_b_pitch + - (feature_block << 2) * output_fs_pitch + + const uint output_offset = b * output_fs_pitch + + (feature_block << 2) * output_b_pitch + y * output_y_pitch + x * output_x_pitch; @@ -242,13 +243,13 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b2( if ((feature_block+1)*16 >= output_channel) { for (int i = 0; i < 2 && (x + i) < output_width; i++) { if ((feature_block*16 + lid_y * 4 + lid_x < output_channel)) - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } else { for (int i = 0; i < 2 && (x + i) < output_width; i++) { - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } #if SLM_DIV_FACTOR > 1 @@ -269,6 +270,7 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b4( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -303,10 +305,10 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b4( const uint output_x_pitch = 4; const uint output_y_pitch = output_x_pitch * output_width; const uint output_fs_pitch = output_y_pitch * output_height; - const uint output_b_pitch = output_fs_pitch * ((output_channel + 3) / 4); + const uint output_b_pitch = output_fs_pitch * batch; - const uint output_offset = b * output_b_pitch + - (feature_block << 2) * output_fs_pitch + + const uint output_offset = b * output_fs_pitch + + (feature_block << 2) * output_b_pitch + y * output_y_pitch + x * output_x_pitch; @@ -463,13 +465,13 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b4( if ((feature_block+1)*16 >= output_channel) { for (int i = 0; i < 4 && (x + i) < output_width; i++) { if ((feature_block*16 + lid_y * 4 + 
lid_x < output_channel)) - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } else { for (int i = 0; i < 4 && (x + i) < output_width; i++) { - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } #if SLM_DIV_FACTOR > 1 @@ -490,6 +492,7 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b8( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -524,10 +527,10 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b8( const uint output_x_pitch = 4; const uint output_y_pitch = output_x_pitch * output_width; const uint output_fs_pitch = output_y_pitch * output_height; - const uint output_b_pitch = output_fs_pitch * ((output_channel + 3) / 4); + const uint output_b_pitch = output_fs_pitch * batch; - const uint output_offset = b * output_b_pitch + - (feature_block << 2) * output_fs_pitch + + const uint output_offset = b * output_fs_pitch + + (feature_block << 2) * output_b_pitch + y * output_y_pitch + x * output_x_pitch; @@ -684,13 +687,13 @@ __kernel void conv_2d_buf_subgroup_c16_c4_b8( if ((feature_block+1)*16 >= output_channel) { for (int i = 0; i < 8 && (x + i) < output_width; i++) { if ((feature_block*16 + lid_y * 4 + lid_x < output_channel)) - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } else { for (int i = 0; i < 8 && (x + i) < output_width; i++) { - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } #if SLM_DIV_FACTOR > 1 @@ -711,6 +714,7 @@ __kernel void conv_2d_buf_subgroup_c16_c16_b2( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -944,6 +948,7 @@ __kernel void conv_2d_buf_subgroup_c16_c16_b4( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -1177,6 +1182,7 @@ __kernel void conv_2d_buf_subgroup_c16_c16_b8( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, diff --git a/source/backend/opencl/execution/cl/conv_2d_c1_subgroup_buf.cl b/source/backend/opencl/execution/cl/conv_2d_c1_subgroup_buf.cl index 6e4f81324..2e40d99e4 100644 --- a/source/backend/opencl/execution/cl/conv_2d_c1_subgroup_buf.cl +++ b/source/backend/opencl/execution/cl/conv_2d_c1_subgroup_buf.cl @@ -47,6 +47,7 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b2( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const 
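The subgroup convolution kernels encode the same relayout through their pitches: output_b_pitch now spans all batches of one channel block (output_fs_pitch * batch), the feature-block term is multiplied by that pitch, and the batch term only steps one H*W plane. Restated as a helper with illustrative names, matching the formula used above:

// Sketch: output offset for the c4 output path of the subgroup kernels after this patch.
inline int subgroup_c4_out_offset(int b, int feature_block, int y, int x,
                                  int out_w, int out_h, int batch) {
    const int x_pitch  = 4;                      // 4 packed channels per pixel
    const int y_pitch  = x_pitch * out_w;        // one output row
    const int fs_pitch = y_pitch * out_h;        // one H*W plane (one batch of one block)
    const int b_pitch  = fs_pitch * batch;       // one channel block across all batches
    return b * fs_pitch + (feature_block << 2) * b_pitch + y * y_pitch + x * x_pitch;
}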
int input_pad_left, __private const int input_pad_right, @@ -80,11 +81,11 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b2( const uint output_x_pitch = 4; const uint output_y_pitch = output_x_pitch * output_width; const uint output_fs_pitch = output_y_pitch * output_height; - const uint output_b_pitch = output_fs_pitch * output_pack; + const uint output_b_pitch = output_fs_pitch * batch; - const uint output_offset = b * output_b_pitch + - f_block * 4 * output_fs_pitch + + const uint output_offset = b * output_fs_pitch + + f_block * 4 * output_b_pitch + y * output_y_pitch + x * output_x_pitch; @@ -160,13 +161,13 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b2( if ((f_block+1)*16 >= output_channel) { for (int i = 0; i < 2 && (x + i) < output_width; i++) { if ((f_block*16 + lid_y * 4 < output_pack * 4)) - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } else { for (int i = 0; i < 2 && (x + i) < output_width; i++) { - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } } @@ -184,6 +185,7 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b4( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -217,11 +219,11 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b4( const uint output_x_pitch = 4; const uint output_y_pitch = output_x_pitch * output_width; const uint output_fs_pitch = output_y_pitch * output_height; - const uint output_b_pitch = output_fs_pitch * output_pack; + const uint output_b_pitch = output_fs_pitch * batch; - const uint output_offset = b * output_b_pitch + - f_block * 4 * output_fs_pitch + + const uint output_offset = b * output_fs_pitch + + f_block * 4 * output_b_pitch + y * output_y_pitch + x * output_x_pitch; @@ -297,13 +299,13 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b4( if ((f_block+1)*16 >= output_channel) { for (int i = 0; i < 4 && (x + i) < output_width; i++) { if ((f_block*16 + lid_y * 4 < output_pack * 4)) - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } else { for (int i = 0; i < 4 && (x + i) < output_width; i++) { - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } } @@ -321,6 +323,7 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b8( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -354,11 +357,11 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b8( const uint output_x_pitch = 4; const uint output_y_pitch = output_x_pitch * output_width; const uint output_fs_pitch = output_y_pitch * output_height; - const uint output_b_pitch = output_fs_pitch * output_pack; + const uint output_b_pitch = output_fs_pitch * batch; - const uint output_offset = b * output_b_pitch + - f_block * 4 * output_fs_pitch + + const uint output_offset = b * output_fs_pitch + + 
f_block * 4 * output_b_pitch + y * output_y_pitch + x * output_x_pitch; @@ -434,13 +437,13 @@ __kernel void conv_2d_buf_subgroup_c1_c4_b8( if ((f_block+1)*16 >= output_channel) { for (int i = 0; i < 8 && (x + i) < output_width; i++) { if ((f_block*16 + lid_y * 4 < output_pack * 4)) - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } else { for (int i = 0; i < 8 && (x + i) < output_width; i++) { - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = (FLOAT)dst[i]; } } } @@ -458,6 +461,7 @@ __kernel void conv_2d_buf_subgroup_c1_c16_b2( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -607,6 +611,7 @@ __kernel void conv_2d_buf_subgroup_c1_c16_b4( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -756,6 +761,7 @@ __kernel void conv_2d_buf_subgroup_c1_c16_b8( __private const int output_width, __private const int output_height, __private const int output_channel, + __private const int batch, __private const int x_blocks, __private const int input_pad_left, __private const int input_pad_right, @@ -890,4 +896,4 @@ __kernel void conv_2d_buf_subgroup_c1_c16_b8( } } } -} \ No newline at end of file +} diff --git a/source/backend/opencl/execution/cl/conv_2d_int_buf.cl b/source/backend/opencl/execution/cl/conv_2d_int_buf.cl index aeed184ce..e42398c63 100644 --- a/source/backend/opencl/execution/cl/conv_2d_int_buf.cl +++ b/source/backend/opencl/execution/cl/conv_2d_int_buf.cl @@ -34,6 +34,7 @@ void conv_2d_int_c4h1w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -77,7 +78,7 @@ void conv_2d_int_c4h1w1(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + kw_start) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { for(int ix = in_w_idx_start; ix < in_w_idx_end; ix += dilate_hw.y) { - int inp_offset = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + ix) * 4; + int inp_offset = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + ix) * 4; COMPUTE_FLOAT4 in0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); const int filter_w_inc = (ix-in_w_idx_start)/dilate_hw.y; @@ -141,7 +142,7 @@ void conv_2d_int_c4h1w1(GLOBAL_SIZE_2_DIMS out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); } @@ -160,6 +161,7 @@ void conv_2d_int_c4h1w2(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, 
__private const int2 filter_hw, __private const int2 stride_hw, @@ -203,7 +205,7 @@ void conv_2d_int_c4h1w2(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { - const int inp_offset_base = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + 0) * 4; + const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4; for(int fw = 0; fw < filter_hw.y; fw++) { const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base; @@ -278,7 +280,7 @@ void conv_2d_int_c4h1w2(GLOBAL_SIZE_2_DIMS out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); if(out_w_idx + 1 >= out_hw.y) return; @@ -302,6 +304,7 @@ void conv_2d_int_c4h1w4(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -349,7 +352,7 @@ void conv_2d_int_c4h1w4(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { - const int inp_offset_base = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + 0) * 4; + const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4; for(int fw = 0; fw < filter_hw.y; fw++) { const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base; @@ -442,7 +445,7 @@ void conv_2d_int_c4h1w4(GLOBAL_SIZE_2_DIMS out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.y - out_w_idx; @@ -475,6 +478,7 @@ void conv_2d_int_c4h4w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -520,7 +524,7 @@ void conv_2d_int_c4h4w1(GLOBAL_SIZE_2_DIMS COMPUTE_FLOAT4 offset = (COMPUTE_FLOAT4)(ScaleOffset.s1, ScaleOffset.s3, ScaleOffset.s5, ScaleOffset.s7); //weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4 //index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0] - const int inp_offset_base = (out_b_idx * in_c_blocks + in_c_idx) * in_hw.x * in_hw.y * 4; + const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4; for(int iy = 0; iy < filter_hw.x; iy++) { int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4; @@ -615,7 +619,7 @@ void conv_2d_int_c4h4w1(GLOBAL_SIZE_2_DIMS out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int 
remain = out_hw.x - out_h_idx; if(remain >= 4){ @@ -655,6 +659,7 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -709,7 +714,7 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS COMPUTE_FLOAT4 offset1 = (COMPUTE_FLOAT4)(ScaleOffset1.s1, ScaleOffset1.s3, ScaleOffset1.s5, ScaleOffset1.s7); //weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4 //index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0] - const int inp_offset_base = (out_b_idx * in_c_blocks + in_c_idx) * in_hw.x * in_hw.y * 4; + const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4; for(int iy = 0; iy < filter_hw.x; iy++) { int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4; @@ -873,7 +878,7 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.x - out_h_idx; if(remain >= 4){ @@ -896,7 +901,7 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS return; } #endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; if(remain >= 4){ vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset); @@ -922,7 +927,7 @@ void conv_2d_int_c8h4w1(GLOBAL_SIZE_2_DIMS return; } #endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore4(CONVERT_FLOAT4(out4), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out5), out_hw.y, output+out_offset); vstore4(CONVERT_FLOAT4(out6), 2 * out_hw.y, output+out_offset); @@ -944,6 +949,7 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -993,7 +999,7 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS COMPUTE_FLOAT4 offset1 = (COMPUTE_FLOAT4)(ScaleOffset1.s1, ScaleOffset1.s3, ScaleOffset1.s5, ScaleOffset1.s7); //weights NC4HW4 [1, 4*icC4, ocC4*kh*kw, 1] xic4 //index: [0, 4*in_c_idx, out_c_idx*kh*kw + kh_start*kw + kw_start, 0] - const int inp_offset_base = (out_b_idx * in_c_blocks + in_c_idx) * in_hw.x * in_hw.y * 4; + const int inp_offset_base = (out_b_idx + in_c_idx*batch) * in_hw.x * in_hw.y * 4; for(int iy = 0; iy < filter_hw.x; iy++) { int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + iy)*filter_hw.y + kw_start) * 4; @@ -1122,7 +1128,7 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.x - out_h_idx; if(remain >= 2){ @@ 
-1136,7 +1142,7 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS return; } #endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; if(remain >= 2){ vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset); @@ -1151,7 +1157,7 @@ void conv_2d_int_c8h2w1(GLOBAL_SIZE_2_DIMS return; } #endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset); vstore4(CONVERT_FLOAT4(out3), out_hw.y, output+out_offset); #endif @@ -1171,6 +1177,7 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS __private const int2 in_hw, __private const int inChannel, __private const int in_c_blocks, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 stride_hw, @@ -1227,7 +1234,7 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS int weight_offset = ((((4*in_c_idx+0)* out_c_blocks + out_c_idx) *filter_hw.x + kh_start)*filter_hw.y + 0) * 4; for(int iy = in_h_idx_start; iy < in_h_idx_end; iy += dilate_hw.x) { - const int inp_offset_base = (((out_b_idx * in_c_blocks + in_c_idx) * in_hw.x + iy) * in_hw.y + 0) * 4; + const int inp_offset_base = (((out_b_idx + in_c_idx*batch) * in_hw.x + iy) * in_hw.y + 0) * 4; for(int fw = 0; fw < filter_hw.y; fw++) { const int in_w0_idx = fw * dilate_hw.y + in_w0_idx_base; @@ -1389,7 +1396,7 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS out7 = clamp(out7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((out_b_idx*out_c_blocks + out_c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + int out_offset = (((out_b_idx + out_c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; #ifdef BLOCK_LEAVE const int remain = out_hw.y - out_w_idx; if(remain >= 4){ @@ -1405,7 +1412,7 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks)return; #endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; if(remain >= 4){ vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset); }else if(remain == 3){ @@ -1421,7 +1428,7 @@ void conv_2d_int_c8h1w4(GLOBAL_SIZE_2_DIMS #ifdef CHANNEL_LEAVE if(out_c_idx + 1 >= out_c_blocks)return; #endif - out_offset = (((out_b_idx*out_c_blocks + out_c_idx + 1)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + out_offset = (((out_b_idx + (out_c_idx + 1)*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4, out5, out6, out7)), 0, output+out_offset); #endif } diff --git a/source/backend/opencl/execution/cl/deconv_2d.cl b/source/backend/opencl/execution/cl/deconv_2d.cl index 195806221..80fdf982b 100644 --- a/source/backend/opencl/execution/cl/deconv_2d.cl +++ b/source/backend/opencl/execution/cl/deconv_2d.cl @@ -17,7 +17,7 @@ __kernel void deconv_2d(GLOBAL_SIZE_3_DIMS #ifdef BIAS __global FLOAT* bias, #endif - __global FLOAT* output, + __global FLOAT* output, __private const int batch, #else __read_only image2d_t input, __read_only image2d_t weights, @@ -82,7 +82,7 @@ __kernel void deconv_2d(GLOBAL_SIZE_3_DIMS weights3 = 
vload4(kernel_x_3*(out_channel_blocks*kernel_shape.x*kernel_shape.y)+kernel_y, weights); bool outBoundry = (idx_h < 0 || idx_h >= input_shape.x || kernel_start_x < 0 || in_width0 >= input_shape.y); - int inp_offset = (((out_b_idx * in_channel_blocks + ic) * input_shape.x + idx_h) * input_shape.y + in_width0) * 4; + int inp_offset = (((out_b_idx + ic * batch) * input_shape.x + idx_h) * input_shape.y + in_width0) * 4; in0 = outBoundry ? (FLOAT4)0 : vload4(0, input+inp_offset); out0 = mad(in0.x, weights0, out0); @@ -127,7 +127,7 @@ __kernel void deconv_2d(GLOBAL_SIZE_3_DIMS #endif #ifdef USE_BUFFER - const int out_offset = (((out_b_idx*out_channel_blocks + out_channel_blocks_idx)*output_shape.x + out_h_idx)*output_shape.y + out_w_idx)*4; + const int out_offset = (((out_b_idx + out_channel_blocks_idx*batch)*output_shape.x + out_h_idx)*output_shape.y + out_w_idx)*4; vstore4(out0, 0, output+out_offset); #else int out_image_width_idx = mad24(out_channel_blocks_idx, output_shape.y, out_w_idx); diff --git a/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl b/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl index 586315962..c32400af9 100644 --- a/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl +++ b/source/backend/opencl/execution/cl/depthwise_conv2d_buf.cl @@ -23,7 +23,7 @@ void depthwise_conv2d_c4h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -58,7 +58,7 @@ void depthwise_conv2d_c4h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, const int in_h_cur = in_h_start + kh * dilate_hw.x; if(in_h_cur < 0 || in_h_cur >= in_hw.x) continue; - int inp_offset = (((b_idx*c_blocks + c_idx)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset = (((b_idx + c_idx*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; for (int kw = 0; kw < filter_hw.y; kw++) { const int filter_idx = mad24(kh, filter_hw.y, kw); const int kw_dilate = kw * dilate_hw.y; @@ -92,7 +92,7 @@ void depthwise_conv2d_c4h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, outValue3 = clamp(outValue3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w4_idx)*4; + const int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w4_idx)*4; const int remain = out_hw.y - out_w4_idx; if (remain >= 4) { @@ -119,7 +119,7 @@ void depthwise_conv2d_c4h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -150,7 +150,7 @@ void depthwise_conv2d_c4h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, const int in_h_cur = in_h_start + kh * dilate_hw.x; if(in_h_cur < 0 || in_h_cur >= in_hw.x) continue; - int inp_offset = (((b_idx*c_blocks + c_idx)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset = (((b_idx + c_idx*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; for (int kw = 0; kw < filter_hw.y; kw++) { const int filter_idx = mad24(kh, filter_hw.y, kw); const int kw_dilate = kw * dilate_hw.y; @@ -176,7 +176,7 @@ void depthwise_conv2d_c4h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, outValue1 = clamp(outValue1, 
(COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w2_idx)*4; + const int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w2_idx)*4; const int remain = out_hw.y - out_w2_idx; if (remain >= 2) { @@ -194,7 +194,7 @@ void depthwise_conv2d_c4h1w1(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -222,7 +222,7 @@ void depthwise_conv2d_c4h1w1(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, const int in_h_cur = in_h_start + kh * dilate_hw.x; if(in_h_cur < 0 || in_h_cur >= in_hw.x) continue; - int inp_offset = (((b_idx*c_blocks + c_idx)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset = (((b_idx + c_idx*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; for (int kw = 0; kw < filter_hw.y; kw++) { const int filter_idx = mad24(kh, filter_hw.y, kw); const int kw_dilate = kw * dilate_hw.y; @@ -244,7 +244,7 @@ void depthwise_conv2d_c4h1w1(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, outValue0 = clamp(outValue0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; + const int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w_idx)*4; vstore4(CONVERT_FLOAT4(outValue0), 0, output+out_offset); } @@ -255,7 +255,7 @@ void depthwise_conv2d_s1_c8h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -294,8 +294,8 @@ void depthwise_conv2d_s1_c8h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, const int in_h_cur = in_h_start + kh; if(in_h_cur < 0 || in_h_cur >= in_hw.x) continue; - int inp_offset_c0 = (((b_idx*c_blocks + c_idx+0)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; - int inp_offset_c1 = (((b_idx*c_blocks + c_idx+1)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset_c0 = (((b_idx + c_idx*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset_c1 = (((b_idx + (c_idx+1)*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; for (int kw = 0; kw < filter_hw.y; kw++) { const int filter_idx = mad24(kh, filter_hw.y, kw); COMPUTE_FLOAT4 inValue0 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y) ? 
(COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset_c0)); @@ -349,7 +349,7 @@ void depthwise_conv2d_s1_c8h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, outValue7 = clamp(outValue7, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w4_idx)*4; + int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w4_idx)*4; const int remain = out_hw.y - out_w4_idx; if (remain >= 4) { @@ -370,7 +370,7 @@ void depthwise_conv2d_s1_c8h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, if(c_idx + 1 >= c_blocks) return; - out_offset += out_hw.x * out_hw.y * 4; + out_offset += batch * out_hw.x * out_hw.y * 4; if (remain >= 4) { vstore4(CONVERT_FLOAT4(outValue4), 0, output+out_offset); vstore4(CONVERT_FLOAT4(outValue5), 1, output+out_offset); @@ -395,7 +395,7 @@ void depthwise_conv2d_s1_c8h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -428,8 +428,8 @@ void depthwise_conv2d_s1_c8h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, const int in_h_cur = in_h_start + kh; if(in_h_cur < 0 || in_h_cur >= in_hw.x) continue; - int inp_offset_c0 = (((b_idx*c_blocks + c_idx+0)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; - int inp_offset_c1 = (((b_idx*c_blocks + c_idx+1)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset_c0 = (((b_idx + c_idx*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset_c1 = (((b_idx + (c_idx+1)*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; for (int kw = 0; kw < filter_hw.y; kw++) { const int filter_idx = mad24(kh, filter_hw.y, kw); COMPUTE_FLOAT4 inValue0 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y) ? 
(COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset_c0)); @@ -467,7 +467,7 @@ void depthwise_conv2d_s1_c8h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, outValue5 = clamp(outValue5, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w2_idx)*4; + int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w2_idx)*4; const int remain = out_hw.y - out_w2_idx; if (remain >= 2) { @@ -479,7 +479,7 @@ void depthwise_conv2d_s1_c8h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, if(c_idx + 1 >= c_blocks) return; - out_offset += out_hw.x * out_hw.y * 4; + out_offset += batch * out_hw.x * out_hw.y * 4; if (remain >= 2) { vstore4(CONVERT_FLOAT4(outValue4), 0, output+out_offset); vstore4(CONVERT_FLOAT4(outValue5), 1, output+out_offset); @@ -494,7 +494,7 @@ void depthwise_conv2d_s1_c4h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -530,7 +530,7 @@ void depthwise_conv2d_s1_c4h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, const int in_h_cur = in_h_start + kh; if(in_h_cur < 0 || in_h_cur >= in_hw.x) continue; - int inp_offset = (((b_idx*c_blocks + c_idx)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; + int inp_offset = (((b_idx + c_idx*batch)*in_hw.x + in_h_cur)* in_hw.y + in_w_start_0)*4; for (int kw = 0; kw < filter_hw.y; kw++) { const int filter_idx = mad24(kh, filter_hw.y, kw); inValue0 = (in_w_start_0+kw < 0 || in_w_start_0+kw >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0, input+inp_offset)); @@ -563,7 +563,7 @@ void depthwise_conv2d_s1_c4h1w4(GLOBAL_SIZE_2_DIMS __global const FLOAT *input, outValue3 = clamp(outValue3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w4_idx)*4; + const int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w4_idx)*4; const int remain = out_hw.y - out_w4_idx; if (remain >= 4) { @@ -590,7 +590,7 @@ void depthwise_conv2d_k3s1p1_c4h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *inp __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -617,7 +617,7 @@ void depthwise_conv2d_k3s1p1_c4h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *inp const int in_h_start = out_h_idx - pad_hw.x; COMPUTE_FLOAT4 inValue0, inValue1, inValue2, inValue3; //first line - const int inp_offset = (((b_idx*c_blocks + c_idx)*in_hw.x + in_h_start)* in_hw.y + in_w_start_0)*4; + const int inp_offset = (((b_idx + c_idx*batch)*in_hw.x + in_h_start)* in_hw.y + in_w_start_0)*4; inValue0 = (in_h_start < 0 || in_w_start_0 < 0 ) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); inValue1 = (in_h_start < 0 || in_w_start_0+1 >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset)); inValue2 = (in_h_start < 0 || in_w_start_0+2 >= in_hw.y) ? 
(COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(2, input+inp_offset)); @@ -690,7 +690,7 @@ void depthwise_conv2d_k3s1p1_c4h1w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *inp outValue1 = clamp(outValue1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h_idx)*out_hw.y + out_w2_idx)*4; + const int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h_idx)*out_hw.y + out_w2_idx)*4; const int remain = out_hw.y - out_w2_idx; if (remain >= 2) { @@ -708,7 +708,7 @@ void depthwise_conv2d_k3s1p1_c4h2w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *inp __global const FLOAT *bias, __global FLOAT *output, __private const int2 in_hw, - __private const int channel, + __private const int batch, __private const int2 out_hw, __private const int2 filter_hw, __private const int2 pad_hw, @@ -739,7 +739,7 @@ void depthwise_conv2d_k3s1p1_c4h2w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *inp const int in_h_start = out_h2_idx - pad_hw.x; COMPUTE_FLOAT4 inValue0, inValue1, inValue2, inValue3; //first line - const int inp_offset = (((b_idx*c_blocks + c_idx)*in_hw.x + in_h_start)* in_hw.y + in_w_start)*4; + const int inp_offset = (((b_idx + c_idx*batch)*in_hw.x + in_h_start)* in_hw.y + in_w_start)*4; inValue0 = (in_h_start < 0 || in_w_start < 0 ) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0, input+inp_offset)); inValue1 = (in_h_start < 0 || in_w_start+1 >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(1, input+inp_offset)); inValue2 = (in_h_start < 0 || in_w_start+2 >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(2, input+inp_offset)); @@ -830,7 +830,7 @@ void depthwise_conv2d_k3s1p1_c4h2w2(GLOBAL_SIZE_2_DIMS __global const FLOAT *inp outValue3 = clamp(outValue3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif - const int out_offset = (((b_idx*c_blocks + c_idx)*out_hw.x + out_h2_idx)*out_hw.y + out_w2_idx)*4; + const int out_offset = (((b_idx + c_idx*batch)*out_hw.x + out_h2_idx)*out_hw.y + out_w2_idx)*4; const int remain_w = out_hw.y - out_w2_idx; const int remain_h = out_hw.x - out_h2_idx; diff --git a/source/backend/opencl/execution/cl/depthwise_conv2d_subgroup_buf.cl b/source/backend/opencl/execution/cl/depthwise_conv2d_subgroup_buf.cl index 7d7698059..1bed1618e 100644 --- a/source/backend/opencl/execution/cl/depthwise_conv2d_subgroup_buf.cl +++ b/source/backend/opencl/execution/cl/depthwise_conv2d_subgroup_buf.cl @@ -12,6 +12,7 @@ __kernel void depthwise_conv_2d_buf_c16_c16( __private const int inputHeight, __private const int inputWidth, __private const int Channel, + __private const int Batch, __private const int input_pad_left, __private const int input_pad_right, __private const int outputHeight, @@ -130,6 +131,7 @@ __kernel void depthwise_conv_2d_buf_c16_c4( __private const int inputHeight, __private const int inputWidth, __private const int Channel, + __private const int Batch, __private const int input_pad_left, __private const int input_pad_right, __private const int outputHeight, @@ -167,10 +169,10 @@ __kernel void depthwise_conv_2d_buf_c16_c4( const uint output_x_pitch = 4; const uint output_y_pitch = output_x_pitch * outputWidth; const uint output_fs_pitch = output_y_pitch * outputHeight; - const uint output_b_pitch = output_fs_pitch * ((Channel + 3) / 4); + const uint output_b_pitch = output_fs_pitch * Batch; - const uint output_offset = b * output_b_pitch + - (c << 2) * output_fs_pitch + + const uint output_offset = (c << 2) * output_b_pitch + + b * output_fs_pitch + y * output_y_pitch + x * 
output_x_pitch; @@ -223,6 +225,6 @@ __kernel void depthwise_conv_2d_buf_c16_c4( const uint lid_x = sglid % 4; const uint lid_y = sglid / 4; for (int i = 0; i < 8 && (x + i) < outputWidth; i++) { - output[output_offset + lid_y * output_fs_pitch + i * output_x_pitch + lid_x] = dst[i]; + output[output_offset + lid_y * output_b_pitch + i * output_x_pitch + lid_x] = dst[i]; } -} \ No newline at end of file +} diff --git a/source/backend/opencl/execution/cl/gather_buf.cl b/source/backend/opencl/execution/cl/gather_buf.cl index 22b23dbe0..8af02b080 100644 --- a/source/backend/opencl/execution/cl/gather_buf.cl +++ b/source/backend/opencl/execution/cl/gather_buf.cl @@ -17,8 +17,6 @@ __kernel void batch_gather_buf(__private int global_dim0, __private int global_d __private const int4 stride_dst, __private const int2 steps, __private const int2 iters, - __private const int4 dst_c4size,// w, h, c, n - __private const int4 src_c4size,// w, h, c, n __private const int inputSize) { int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); @@ -28,91 +26,22 @@ __kernel void batch_gather_buf(__private int global_dim0, __private int global_d int y = pos.x / x_size; int2 index = (int2)(pos.z, pos.z); - - #ifdef OFFSET_DST - { - int offset_value = pos.z; - int off_c4_size = (offset_dst_shape.z + 3) >> 2; - #ifdef GATHER_INPUT_NHWC - int off_c = offset_value % offset_dst_shape.z; offset_value /= offset_dst_shape.z; - int off_w = offset_value % offset_dst_shape.x; offset_value /= offset_dst_shape.x; - int off_h = offset_value % offset_dst_shape.y; - int off_b = offset_value / offset_dst_shape.y; - #else - int off_w = offset_value % offset_dst_shape.x; offset_value /= offset_dst_shape.x; - int off_h = offset_value % offset_dst_shape.y; offset_value /= offset_dst_shape.y; - int off_c = offset_value % offset_dst_shape.z; - int off_b = offset_value / offset_dst_shape.z; - #endif - int real_dst_offset = (((off_b * off_c4_size + off_c / 4) * offset_dst_shape.y + off_h) * offset_dst_shape.x + off_w) * 4 + off_c % 4; - index.x = offset_dst_ptr[real_dst_offset]; - } - #endif - - #ifdef OFFSET_SRC - { - int offset_value = pos.z; - int off_c4_size = (offset_src_shape.z + 3) >> 2; - #ifdef GATHER_INPUT_NHWC - int off_c = offset_value % offset_src_shape.z; offset_value /= offset_src_shape.z; - int off_w = offset_value % offset_src_shape.x; offset_value /= offset_src_shape.x; - int off_h = offset_value % offset_src_shape.y; - int off_b = offset_value / offset_src_shape.y; - #else - int off_w = offset_value % offset_src_shape.x; offset_value /= offset_src_shape.x; - int off_h = offset_value % offset_src_shape.y; offset_value /= offset_src_shape.y; - int off_c = offset_value % offset_src_shape.z; - int off_b = offset_value / offset_src_shape.z; - #endif - int real_src_offset = (((off_b * off_c4_size + off_c / 4) * offset_src_shape.y + off_h) * offset_src_shape.x + off_w) * 4 + off_c % 4; - index.y = offset_src_ptr[real_src_offset]; - } - #endif - +#ifdef OFFSET_DST + index.x = offset_dst_ptr[pos.z]; +#endif + +#ifdef OFFSET_SRC + index.y = offset_src_ptr[pos.z]; +#endif int2 offset = index * steps; int src_offset = offset.y + stride_src.w + x * stride_src.x + y * stride_src.y + pos.y * stride_src.z; int dst_offset = offset.x + stride_dst.w + x * stride_dst.x + y * stride_dst.y + pos.y * stride_dst.z; - int src_offsetC4, dst_offsetC4; - { -#ifdef GATHER_INPUT_NHWC - int c = src_offset % src_c4size.z; src_offset /= src_c4size.z; - int w = src_offset % src_c4size.x; src_offset /= src_c4size.x; - int h = src_offset % 
src_c4size.y; - int b = src_offset / src_c4size.y; - int c4_size = (src_c4size.z + 3) / 4; - src_offsetC4 = (((b * c4_size + (c / 4)) * src_c4size.y + h) * src_c4size.x + w) * 4 + (c % 4); -#else - int w = src_offset % src_c4size.x; src_offset /= src_c4size.x; - int h = src_offset % src_c4size.y; src_offset /= src_c4size.y; - int c = src_offset % src_c4size.z; - int b = src_offset / src_c4size.z; - int c4_size = (src_c4size.z + 3) / 4; - src_offsetC4 = (((b * c4_size + (c / 4)) * src_c4size.y + h) * src_c4size.x + w) * 4 + (c % 4); -#endif - } - { -#ifdef GATHER_OUTPUT_NHWC - int c = dst_offset % dst_c4size.z; dst_offset /= dst_c4size.z; - int w = dst_offset % dst_c4size.x; dst_offset /= dst_c4size.x; - int h = dst_offset % dst_c4size.y; - int b = dst_offset / dst_c4size.y; - int c4_size = (dst_c4size.z + 3) / 4; - dst_offsetC4 = (((b * c4_size + (c / 4)) * dst_c4size.y + h) * dst_c4size.x + w) * 4 + (c % 4); -#else - int w = dst_offset % dst_c4size.x; dst_offset /= dst_c4size.x; - int h = dst_offset % dst_c4size.y; dst_offset /= dst_c4size.y; - int c = dst_offset % dst_c4size.z; - int b = dst_offset / dst_c4size.z; - int c4_size = (dst_c4size.z + 3) / 4; - dst_offsetC4 = (((b * c4_size + (c / 4)) * dst_c4size.y + h) * dst_c4size.x + w) * 4 + (c % 4); -#endif - } if(offset.x >= 0){ if(offset.y >= 0 && offset.y < inputSize){ - output[dst_offsetC4] = (OUTPUT_TYPE)input[src_offsetC4]; + output[dst_offset] = (OUTPUT_TYPE)input[src_offset]; }else{ - output[dst_offsetC4] = (OUTPUT_TYPE)(0); + output[dst_offset] = (OUTPUT_TYPE)(0); } } } diff --git a/source/backend/opencl/execution/cl/gemm_buf.cl b/source/backend/opencl/execution/cl/gemm_buf.cl index 903b62252..0e4fe3d46 100644 --- a/source/backend/opencl/execution/cl/gemm_buf.cl +++ b/source/backend/opencl/execution/cl/gemm_buf.cl @@ -10,118 +10,7 @@ return; \ } -__kernel void gemm_buf(GLOBAL_SIZE_DIM2 - __global const FLOAT* input0, - __global const FLOAT* input1, - __global FLOAT* output, - __private const int width,//UP_DIV(wUnit*hUnit,4) - __private const int height,//dstChannelC4 - __private const int srcChannelC4, - __private const int alpha2) { - int2 pos = (int2)(get_global_id(0), get_global_id(1)); - UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); - - const int pos_x = pos.x % width; - const int pos_y = pos.x / width; - const int pos_z = pos.y; - - COMPUTE_FLOAT16 o = (COMPUTE_FLOAT16)0; - - int kenerlY = mad24(pos_z, height, pos_y); - - for (int k = 0; k < srcChannelC4; ++k) { - //NHWC [1, 1, alpha2*height, srcChannelC4*4] x 4 - //index:[0, 0, pos_z*width+pos_y, index+0] - //int inp1_offset = (((k * (alpha2*height) + kenerlY) * (srcChannelC4*4) + index)*4 + 0)*4; - - COMPUTE_FLOAT16 k_v16 = CONVERT_COMPUTE_FLOAT16(vload16(kenerlY*(srcChannelC4) + k, input1)); - - //NC4HW4 [alpha*alpha, srcChannelC4, width, 4] x 4 - //index: [pos_z, k, pos_x, 0] - - COMPUTE_FLOAT16 s = CONVERT_COMPUTE_FLOAT16(vload16(((pos_z*srcChannelC4 + k) * width + pos_x), input0)); - - o = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s0, (COMPUTE_FLOAT4)s.s4, (COMPUTE_FLOAT4)s.s8, (COMPUTE_FLOAT4)s.sc), (COMPUTE_FLOAT16)(k_v16.s0123, k_v16.s0123, k_v16.s0123, k_v16.s0123), o); - o = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s1, (COMPUTE_FLOAT4)s.s5, (COMPUTE_FLOAT4)s.s9, (COMPUTE_FLOAT4)s.sd), (COMPUTE_FLOAT16)(k_v16.s4567, k_v16.s4567, k_v16.s4567, k_v16.s4567), o); - o = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s2, (COMPUTE_FLOAT4)s.s6, (COMPUTE_FLOAT4)s.sa, (COMPUTE_FLOAT4)s.se), (COMPUTE_FLOAT16)(k_v16.s89ab, k_v16.s89ab, k_v16.s89ab, k_v16.s89ab), o); - o = 
mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s3, (COMPUTE_FLOAT4)s.s7, (COMPUTE_FLOAT4)s.sb, (COMPUTE_FLOAT4)s.sf), (COMPUTE_FLOAT16)(k_v16.scdef, k_v16.scdef, k_v16.scdef, k_v16.scdef), o); - } - - //index: [pos_y, pos_z, 0, pos_x] - int out_offset = (((pos_y * alpha2 + pos_z) * 4 + 0) * width + pos_x) * 4; - - vstore4(CONVERT_FLOAT4(o.s0123), 0, output+out_offset); - vstore4(CONVERT_FLOAT4(o.s4567), 0, output+out_offset+4*width); - vstore4(CONVERT_FLOAT4(o.s89ab), 0, output+out_offset+8*width); - vstore4(CONVERT_FLOAT4(o.scdef), 0, output+out_offset+12*width); -} - - - -__kernel void gemm_buf2(GLOBAL_SIZE_DIM2 - __global const FLOAT* input0, - __global const FLOAT* input1, - __global FLOAT* output, - __private const int width,//UP_DIV(wUnit*hUnit,8) - __private const int height,//dstChannelC4 - __private const int srcChannelC4, - __private const int alpha2) { - int2 pos = (int2)(get_global_id(0), get_global_id(1)); - UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); - - const int width_block = (width+1) >> 1; - const int pos_x = (pos.x % width_block) << 1; - const int pos_y = pos.x / width_block; - const int pos_z = pos.y; - - COMPUTE_FLOAT16 o0 = (COMPUTE_FLOAT16)0; - COMPUTE_FLOAT16 o1 = (COMPUTE_FLOAT16)0; - - const int kenerlY = mad24(pos_z, height, pos_y); - const int kernel_base = mul24(kenerlY, srcChannelC4); - const int inp_base = (pos_z*srcChannelC4 + 0) * width + pos_x; - - for (int k = 0; k < srcChannelC4; ++k) { - //NHWC [1, 1, alpha2*height, srcChannelC4*4] x 4 - //index:[0, 0, pos_z*width+pos_y, index+0] - //int inp1_offset = (((k * (alpha2*height) + kenerlY) * (srcChannelC4*4) + index)*4 + 0)*4; - - COMPUTE_FLOAT16 k_v16 = CONVERT_COMPUTE_FLOAT16(vload16(kernel_base + k, input1)); - - //NC4HW4 [alpha*alpha, srcChannelC4, width, 4] x 4 - //index: [pos_z, k, pos_x, 0] - - const int inp_offset = mad24(k, width, inp_base); - COMPUTE_FLOAT16 s = CONVERT_COMPUTE_FLOAT16(vload16(inp_offset, input0)); - - o0 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s0, (COMPUTE_FLOAT4)s.s4, (COMPUTE_FLOAT4)s.s8, (COMPUTE_FLOAT4)s.sc), (COMPUTE_FLOAT16)(k_v16.s0123, k_v16.s0123, k_v16.s0123, k_v16.s0123), o0); - o0 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s1, (COMPUTE_FLOAT4)s.s5, (COMPUTE_FLOAT4)s.s9, (COMPUTE_FLOAT4)s.sd), (COMPUTE_FLOAT16)(k_v16.s4567, k_v16.s4567, k_v16.s4567, k_v16.s4567), o0); - o0 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s2, (COMPUTE_FLOAT4)s.s6, (COMPUTE_FLOAT4)s.sa, (COMPUTE_FLOAT4)s.se), (COMPUTE_FLOAT16)(k_v16.s89ab, k_v16.s89ab, k_v16.s89ab, k_v16.s89ab), o0); - o0 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s3, (COMPUTE_FLOAT4)s.s7, (COMPUTE_FLOAT4)s.sb, (COMPUTE_FLOAT4)s.sf), (COMPUTE_FLOAT16)(k_v16.scdef, k_v16.scdef, k_v16.scdef, k_v16.scdef), o0); - - s = CONVERT_COMPUTE_FLOAT16(vload16(inp_offset + 1, input0)); - o1 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s0, (COMPUTE_FLOAT4)s.s4, (COMPUTE_FLOAT4)s.s8, (COMPUTE_FLOAT4)s.sc), (COMPUTE_FLOAT16)(k_v16.s0123, k_v16.s0123, k_v16.s0123, k_v16.s0123), o1); - o1 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s1, (COMPUTE_FLOAT4)s.s5, (COMPUTE_FLOAT4)s.s9, (COMPUTE_FLOAT4)s.sd), (COMPUTE_FLOAT16)(k_v16.s4567, k_v16.s4567, k_v16.s4567, k_v16.s4567), o1); - o1 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s2, (COMPUTE_FLOAT4)s.s6, (COMPUTE_FLOAT4)s.sa, (COMPUTE_FLOAT4)s.se), (COMPUTE_FLOAT16)(k_v16.s89ab, k_v16.s89ab, k_v16.s89ab, k_v16.s89ab), o1); - o1 = mad((COMPUTE_FLOAT16)((COMPUTE_FLOAT4)s.s3, (COMPUTE_FLOAT4)s.s7, (COMPUTE_FLOAT4)s.sb, (COMPUTE_FLOAT4)s.sf), (COMPUTE_FLOAT16)(k_v16.scdef, k_v16.scdef, k_v16.scdef, k_v16.scdef), o1); - } - - 
//index: [pos_y, pos_z, 0, pos_x] - int out_offset = (((pos_y * alpha2 + pos_z) * 4 + 0) * width + pos_x) * 4; - - vstore4(CONVERT_FLOAT4(o0.s0123), 0, output+out_offset); - vstore4(CONVERT_FLOAT4(o0.s4567), 0, output+out_offset+4*width); - vstore4(CONVERT_FLOAT4(o0.s89ab), 0, output+out_offset+8*width); - vstore4(CONVERT_FLOAT4(o0.scdef), 0, output+out_offset+12*width); - - if(pos_x + 1 >= width) return; - vstore4(CONVERT_FLOAT4(o1.s0123), 1, output+out_offset); - vstore4(CONVERT_FLOAT4(o1.s4567), 1, output+out_offset+4*width); - vstore4(CONVERT_FLOAT4(o1.s89ab), 1, output+out_offset+8*width); - vstore4(CONVERT_FLOAT4(o1.scdef), 1, output+out_offset+12*width); -} - -// [B, K/4, area, 4] -> [alignK, alignM] (M = B * area) +// [K/4, M, 4] -> [alignK, alignM] __kernel void transpose_pad(GLOBAL_SIZE_DIM2 const int alignM, const int alignK, @@ -131,7 +20,6 @@ __kernel void transpose_pad(GLOBAL_SIZE_DIM2 __global const FLOAT* input, __global FLOAT* output ) { -#ifdef AREA_EQUAL_1 const int idx_m4 = get_global_id(0); // idx M const int idx_k4 = get_global_id(1); // idx K UNIFORM_BOUNDRY_CHECK(idx_m4, idx_k4); @@ -139,71 +27,25 @@ __kernel void transpose_pad(GLOBAL_SIZE_DIM2 const int idx_m = idx_m4 << 2; const int idx_k = idx_k4 << 2; const int K_4 = (K + 3) >> 2; - const int in_offset_base = (idx_m * K_4 + idx_k4) * 4; + const int in_offset_base = (idx_k4 * M + idx_m) * 4; const int out_offset_base = idx_k * alignM + idx_m; - FLOAT4 m0k4 = (idx_k4 >= K_4 || idx_m + 0 >= M) ? (FLOAT4)0 : vload4(0, input + in_offset_base); - FLOAT4 m1k4 = (idx_k4 >= K_4 || idx_m + 1 >= M) ? (FLOAT4)0 : vload4(0, input + in_offset_base + (K_4 << 2)); - FLOAT4 m2k4 = (idx_k4 >= K_4 || idx_m + 2 >= M) ? (FLOAT4)0 : vload4(0, input + in_offset_base + (K_4 << 2) * 2); - FLOAT4 m3k4 = (idx_k4 >= K_4 || idx_m + 3 >= M) ? (FLOAT4)0 : vload4(0, input + in_offset_base + (K_4 << 2) * 3); - - vstore4((FLOAT4)(m0k4.x, m1k4.x, m2k4.x, m3k4.x), 0, output + out_offset_base); - vstore4((FLOAT4)(m0k4.y, m1k4.y, m2k4.y, m3k4.y), 0, output + out_offset_base + alignM); - vstore4((FLOAT4)(m0k4.z, m1k4.z, m2k4.z, m3k4.z), 0, output + out_offset_base + alignM + alignM); - vstore4((FLOAT4)(m0k4.w, m1k4.w, m2k4.w, m3k4.w), 0, output + out_offset_base + alignM + alignM + alignM); -#elif defined BATCH_EQUAL_1 - - const int idx_m4 = get_global_id(0); // idx M - const int idx_k4 = get_global_id(1); // idx K - UNIFORM_BOUNDRY_CHECK(idx_m4, idx_k4); - - const int idx_m = idx_m4 << 2; - const int idx_k = idx_k4 << 2; - const int K_4 = (K + 3) >> 2; - const int in_offset_base = (idx_k4 * area + idx_m) * 4; - const int out_offset_base = idx_k * alignM + idx_m; - FLOAT4 m0k4 = (idx_k4 >= K_4 || idx_m + 0 >= M) ? (FLOAT4)0 : vload4(0, input + in_offset_base); FLOAT4 m1k4 = (idx_k4 >= K_4 || idx_m + 1 >= M) ? (FLOAT4)0 : vload4(0, input + in_offset_base + 4); FLOAT4 m2k4 = (idx_k4 >= K_4 || idx_m + 2 >= M) ? (FLOAT4)0 : vload4(0, input + in_offset_base + 8); FLOAT4 m3k4 = (idx_k4 >= K_4 || idx_m + 3 >= M) ? 
(FLOAT4)0 : vload4(0, input + in_offset_base + 12); - + vstore4((FLOAT4)(m0k4.x, m1k4.x, m2k4.x, m3k4.x), 0, output + out_offset_base); vstore4((FLOAT4)(m0k4.y, m1k4.y, m2k4.y, m3k4.y), 0, output + out_offset_base + alignM); vstore4((FLOAT4)(m0k4.z, m1k4.z, m2k4.z, m3k4.z), 0, output + out_offset_base + alignM + alignM); vstore4((FLOAT4)(m0k4.w, m1k4.w, m2k4.w, m3k4.w), 0, output + out_offset_base + alignM + alignM + alignM); +} -#else - - const int idx_m = get_global_id(0); // idx M - const int idx_k4 = get_global_id(1); // idx K - UNIFORM_BOUNDRY_CHECK(idx_m, idx_k4); - - const int K_4 = (K + 3) >> 2; - const int idx_k = idx_k4 << 2; - const int out_offset_base = idx_k * alignM + idx_m; - - if(idx_k4 >= K_4 || idx_m >= M) { - output[out_offset_base] = (FLOAT)0; - output[out_offset_base + alignM] = (FLOAT)0; - output[out_offset_base + alignM + alignM] = (FLOAT)0; - output[out_offset_base + alignM + alignM + alignM] = (FLOAT)0; - return; - } - const int idx_b = idx_m / area; - const int idx_area = idx_m % area; - - const int in_offset_base = ((idx_b * K_4 + idx_k4) * area + idx_area) * 4; - FLOAT4 data = vload4(0, input + in_offset_base); - - output[out_offset_base] = data.x; - output[out_offset_base + alignM] = data.y; - output[out_offset_base + alignM + alignM] = data.z; - output[out_offset_base + alignM + alignM + alignM] = data.w; +#ifndef M_VEC +#define M_VEC 1 #endif -} -// [alignM, alignN] -> [B, N/4, area, 4] (M = B * area) +// [alignM, alignN] -> [N/4, B, area, N4] (M = B * area) __kernel void transpose_bias(GLOBAL_SIZE_DIM2 const int alignM, const int alignN, @@ -214,133 +56,24 @@ __kernel void transpose_bias(GLOBAL_SIZE_DIM2 __global const FLOAT* input1, __global FLOAT* output ) { -#ifdef AREA_EQUAL_1 - const int idx_m = get_global_id(0); // idx M - const int idx_n_16 = get_global_id(1); // idx N - UNIFORM_BOUNDRY_CHECK(idx_m, idx_n_16); + int idx_m = get_global_id(0); // idx M + int idx_n4 = get_global_id(1); // idx N + UNIFORM_BOUNDRY_CHECK(idx_m, idx_n4); - const int N_4 = (N + 3) >> 2; - const int N_16 = (N + 15) >> 4; - const int N_left = N & 15; - bool canVec16 = (N_left == 0 || (N_left != 0 && idx_n_16 < N_16 - 1)); - if(canVec16) { - FLOAT16 res0 = vload16(0, input0 + idx_m * alignN + (idx_n_16 << 4)); - FLOAT16 res1 = vload16(0, input1 + (idx_n_16 << 4)); - FLOAT16 res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT16)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT16)0, (FLOAT16)6); - #endif - vstore16(res, 0, output + ((idx_m * N_4 + (idx_n_16 << 2)) << 2)); - } else { + const int idx_n = idx_n4 << 2; - FLOAT4 res0 = vload4(0, input0 + idx_m * alignN + (idx_n_16 << 4)); - FLOAT4 res1 = vload4(0, input1 + (idx_n_16 << 4)); + idx_m = idx_m * M_VEC; + FLOAT4 res1 = vload4(0, input1 + idx_n); + #pragma unroll + for(int i = 0; i < M_VEC; i++) { + FLOAT4 res0 = vload4(0, input0 + (idx_m + i) * alignN + idx_n); FLOAT4 res = res0 + res1; #ifdef RELU - res = fmax(res, (FLOAT4)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); - #endif - vstore4(res, 0, output + ((idx_m * N_4 + (idx_n_16 << 2)) << 2)); - - if(idx_n_16 * 4 + 1 >= N_4) return; - res0 = vload4(0, input0 + idx_m * alignN + (idx_n_16 << 4) + 4); - res1 = vload4(0, input1 + (idx_n_16 << 4) + 4); - res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT4)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); - #endif - vstore4(res, 0, output + ((idx_m * N_4 + (idx_n_16 << 2)) << 2) + 4); - - if(idx_n_16 * 4 + 2 >= N_4) return; - res0 = vload4(0, input0 + idx_m * alignN + 
(idx_n_16 << 4) + 8); - res1 = vload4(0, input1 + (idx_n_16 << 4) + 8); - res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT4)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); - #endif - vstore4(res, 0, output + ((idx_m * N_4 + (idx_n_16 << 2)) << 2) + 8); - - if(idx_n_16 * 4 + 3 >= N_4) return; - res0 = vload4(0, input0 + idx_m * alignN + (idx_n_16 << 4) + 12); - res1 = vload4(0, input1 + (idx_n_16 << 4) + 12); - res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT4)0); + res = fmax(res, (FLOAT4)0); #endif #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); + res = clamp(res, (FLOAT4)0, (FLOAT4)6); #endif - vstore4(res, 0, output + ((idx_m * N_4 + (idx_n_16 << 2)) << 2) + 12); + vstore4(res, 0, output + ((idx_n4 * M + idx_m + i) << 2)); } -#else - const int idx_m = get_global_id(0); // idx M - const int idx_n_16 = get_global_id(1); // idx N - UNIFORM_BOUNDRY_CHECK(idx_m, idx_n_16); - - const int N_4 = (N + 3) >> 2; - - const int idx_b = idx_m / area; - const int idx_area = idx_m % area; - - const int inp_base_offset = idx_m * alignN + (idx_n_16 << 4); - const int out_base_offset = ((idx_b * N_4 + idx_n_16 * 4) * area + idx_area) * 4; - - FLOAT4 res0 = vload4(0, input0 + inp_base_offset); - FLOAT4 res1 = vload4(0, input1 + (idx_n_16 << 4)); - FLOAT4 res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT4)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); - #endif - vstore4(res, 0, output + out_base_offset); - - if(idx_n_16 * 4 + 1 >= N_4) return; - res0 = vload4(0, input0 + inp_base_offset + 4); - res1 = vload4(0, input1 + (idx_n_16 << 4) + 4); - res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT4)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); - #endif - vstore4(res, 0, output + out_base_offset + area * 4); - - if(idx_n_16 * 4 + 2 >= N_4) return; - res0 = vload4(0, input0 + inp_base_offset + 8); - res1 = vload4(0, input1 + (idx_n_16 << 4) + 8); - res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT4)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); - #endif - vstore4(res, 0, output + out_base_offset + area * 8); - - if(idx_n_16 * 4 + 3 >= N_4) return; - res0 = vload4(0, input0 + inp_base_offset + 12); - res1 = vload4(0, input1 + (idx_n_16 << 4) + 12); - res = res0 + res1; - #ifdef RELU - res = fmax(res, (FLOAT4)0); - #endif - #ifdef RELU6 - res = clamp(res, (FLOAT4)0, (FLOAT4)6); - #endif - vstore4(res, 0, output + out_base_offset + area * 12); -#endif } diff --git a/source/backend/opencl/execution/cl/gemm_conv1x1_buf.cl b/source/backend/opencl/execution/cl/gemm_conv1x1_buf.cl new file mode 100644 index 000000000..35304f433 --- /dev/null +++ b/source/backend/opencl/execution/cl/gemm_conv1x1_buf.cl @@ -0,0 +1,760 @@ +#ifdef MNN_SUPPORT_FP16 +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif + +#define GLOBAL_SIZE_DIM2 \ + __private int global_size_dim0, __private int global_size_dim1, + +#define UNIFORM_BOUNDRY_CHECK(index0, index1) \ + if(index0 >= global_size_dim0 || index1 >= global_size_dim1) { \ + return; \ + } + +#define GLOBAL_SIZE_DIM3 \ + __private int global_size_dim0, __private int global_size_dim1, __private int global_size_dim2, + +#define UNIFORM_BOUNDRY_CHECK3(index0, index1, index2) \ + if(index0 >= global_size_dim0 || index1 >= global_size_dim1 || index2 >= global_size_dim2) { \ + return; \ + } + +#define UCHAR16_TO_2CHAR16(a, b, c) \ + a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 
= (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; \ + a.s8 = (c.s4 >> 4) - 8; a.s9 = (c.s4 & 15) - 8; a.sa = (c.s5 >> 4) - 8; a.sb = (c.s5 & 15) - 8; a.sc = (c.s6 >> 4) - 8; a.sd = (c.s6 & 15) - 8; a.se = (c.s7 >> 4) - 8; a.sf = (c.s7 & 15) - 8; \ + b.s0 = (c.s8 >> 4) - 8; b.s1 = (c.s8 & 15) - 8; b.s2 = (c.s9 >> 4) - 8; b.s3 = (c.s9 & 15) - 8; b.s4 = (c.sa >> 4) - 8; b.s5 = (c.sa & 15) - 8; b.s6 = (c.sb >> 4) - 8; b.s7 = (c.sb & 15) - 8; \ + b.s8 = (c.sc >> 4) - 8; b.s9 = (c.sc & 15) - 8; b.sa = (c.sd >> 4) - 8; b.sb = (c.sd & 15) - 8; b.sc = (c.se >> 4) - 8; b.sd = (c.se & 15) - 8; b.se = (c.sf >> 4) - 8; b.sf = (c.sf & 15) - 8; + +#define UCHAR8_TO_CHAR16(a, c) \ + a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; \ + a.s8 = (c.s4 >> 4) - 8; a.s9 = (c.s4 & 15) - 8; a.sa = (c.s5 >> 4) - 8; a.sb = (c.s5 & 15) - 8; a.sc = (c.s6 >> 4) - 8; a.sd = (c.s6 & 15) - 8; a.se = (c.s7 >> 4) - 8; a.sf = (c.s7 & 15) - 8; + +#define DOT16X16(a, b, c) \ + c += dot(a.s0123, b.s0123); \ + c += dot(a.s4567, b.s4567); \ + c += dot(a.s89ab, b.s89ab); \ + c += dot(a.scdef, b.scdef); + +#if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) +#define CHANNEL_PACK 32 +#else +#define CHANNEL_PACK 16 +#endif + +#if (defined USE_LOW_BIT_WEIGHT_INT8) +#define WEIGHT_STRIDE 16 +#elif (defined USE_LOW_BIT_WEIGHT_INT4) +#define WEIGHT_STRIDE 8 +#endif + +__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef USE_IMAGE +inline COMPUTE_FLOAT16 readWeight(__read_only image2d_t weight, int ix, int iy, COMPUTE_FLOAT scale, COMPUTE_FLOAT offset){ + return CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(ix, iy)))) * scale + offset; +} +#else + +#if (defined USE_LOW_BIT_WEIGHT_INT8) +inline COMPUTE_FLOAT16 readWeight(__global const char *weight, int ix, int iy, COMPUTE_FLOAT scale, COMPUTE_FLOAT offset){ + return CONVERT_COMPUTE_FLOAT16(vload16(0, weight)) * scale + offset; +} +#elif (defined USE_LOW_BIT_WEIGHT_INT4) +inline COMPUTE_FLOAT16 readWeight(__global const uchar *weight, int ix, int iy, COMPUTE_FLOAT scale, COMPUTE_FLOAT offset){ + uchar16 charWeightsInt40 = vload16(0, weight); + uchar8 charWeightsInt4 = vload8(0, weight); + char16 charWeights = 0; + UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); + return CONVERT_COMPUTE_FLOAT16(charWeights) * scale + offset; +} +#endif +#endif + +__kernel void inverse_quant_weight(GLOBAL_SIZE_DIM2 + #ifdef USE_IMAGE + __read_only image2d_t weight, + #else + #if (defined USE_LOW_BIT_WEIGHT_INT8) + __global const char *weight, + #elif (defined USE_LOW_BIT_WEIGHT_INT4) + __global const uchar *weight, + #endif + #endif + __global const float *dequantScaleOffset, + __global FLOAT* output, + __private const int outputChannelAlign, + __private const int outputChannel4Align, + __private const int blockDim){ + const int x = get_global_id(0); //ic + const int y = get_global_id(1); //oc + + UNIFORM_BOUNDRY_CHECK(x, y); + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + + const int ic = x << 5; + const int oc = y << 2; + const int output_offset = ic * outputChannelAlign + oc; + + int kindex = (ic / blockDim) * outputChannel4Align * 2; + COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(0, dequantScaleOffset + kindex + oc * 2)); + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11, weights20, weights21, weights30, weights31; + { + 
uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(oc, x))); + uchar16 charWeightsInt41 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(oc + 1, x))); + uchar16 charWeightsInt42 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(oc + 2, x))); + uchar16 charWeightsInt43 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(oc + 3, x))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); + weights10 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; + weights11 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt42); + weights20 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; + weights21 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s4 + ScaleOffset.s5; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt43); + weights30 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s6 + ScaleOffset.s7; + weights31 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; + } + COMPUTE_FLOAT *weights00_ptr = (COMPUTE_FLOAT *)&weights00; + COMPUTE_FLOAT *weights10_ptr = (COMPUTE_FLOAT *)&weights10; + COMPUTE_FLOAT *weights20_ptr = (COMPUTE_FLOAT *)&weights20; + COMPUTE_FLOAT *weights30_ptr = (COMPUTE_FLOAT *)&weights30; + COMPUTE_FLOAT *weights01_ptr = (COMPUTE_FLOAT *)&weights01; + COMPUTE_FLOAT *weights11_ptr = (COMPUTE_FLOAT *)&weights11; + COMPUTE_FLOAT *weights21_ptr = (COMPUTE_FLOAT *)&weights21; + COMPUTE_FLOAT *weights31_ptr = (COMPUTE_FLOAT *)&weights31; + #pragma unroll + for (int i = 0; i < 16; ++i){ + FLOAT4 out = CONVERT_FLOAT4((COMPUTE_FLOAT4)(weights00_ptr[i], weights10_ptr[i], weights20_ptr[i], weights30_ptr[i])); + vstore4(out, 0, output+output_offset+i*outputChannelAlign); + } + #pragma unroll + for (int i = 0; i < 16; ++i){ + FLOAT4 out = CONVERT_FLOAT4((COMPUTE_FLOAT4)(weights01_ptr[i], weights11_ptr[i], weights21_ptr[i], weights31_ptr[i])); + vstore4(out, 0, output+output_offset+(i + 16)*outputChannelAlign); + } + #else + const int ic = x << 4; + const int oc = y << 2; +#ifndef USE_IMAGE + #if (defined USE_LOW_BIT_WEIGHT_INT4) + int weight_offset = oc * 8; + int weight_oc_offset = outputChannel4Align * 8; + int weight_stride = 8; + #else + int weight_offset = oc * 16; + int weight_oc_offset = outputChannel4Align * 16; + int weight_stride = 16; + #endif +#endif + const int output_offset = ic * outputChannelAlign + oc; + + int kindex = (ic / blockDim) * outputChannel4Align * 2; + COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(0, dequantScaleOffset + kindex + oc * 2)); + #ifdef USE_IMAGE + COMPUTE_FLOAT16 weights0 = readWeight(weight, oc, x, ScaleOffset.s0, ScaleOffset.s1); + COMPUTE_FLOAT16 weights1 = readWeight(weight, oc + 1, x, ScaleOffset.s2, ScaleOffset.s3); + COMPUTE_FLOAT16 weights2 = readWeight(weight, oc + 2, x, ScaleOffset.s4, ScaleOffset.s5); + COMPUTE_FLOAT16 weights3 = readWeight(weight, oc + 3, x, ScaleOffset.s6, ScaleOffset.s7); + #else + COMPUTE_FLOAT16 weights0 = readWeight(weight + weight_offset + x * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + COMPUTE_FLOAT16 weights1 = readWeight(weight + weight_offset + x * weight_oc_offset + weight_stride, 0, 0, 
ScaleOffset.s2, ScaleOffset.s3); + COMPUTE_FLOAT16 weights2 = readWeight(weight + weight_offset + x * weight_oc_offset + 2 * weight_stride, 0, 0, ScaleOffset.s4, ScaleOffset.s5); + COMPUTE_FLOAT16 weights3 = readWeight(weight + weight_offset + x * weight_oc_offset + 3 * weight_stride, 0, 0, ScaleOffset.s6, ScaleOffset.s7); + #endif + COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT*)&weights0; + COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT*)&weights1; + COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT*)&weights2; + COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT*)&weights3; + #pragma unroll + for (int i = 0; i < 16; ++i){ + FLOAT4 out = CONVERT_FLOAT4((COMPUTE_FLOAT4)(weights0_ptr[i], weights1_ptr[i], weights2_ptr[i], weights3_ptr[i])); + vstore4(out, 0, output+output_offset+i*outputChannelAlign); + } + #endif +} + +__kernel void reshape_nchw4_nhwc4(GLOBAL_SIZE_DIM2 +__global const FLOAT* input, +__global FLOAT* output, +__private const int bhw, +__private const int channel, +__private const int channelAlign){ + const int x = get_global_id(0); //c + const int y = get_global_id(1); //bhw + + UNIFORM_BOUNDRY_CHECK(x, y); + + const int x4 = x << 2; + const int y4 = y << 2; + const int input_offset = (x * bhw + y4) * 4; + FLOAT4 in0 = vload4(0, input + input_offset); + FLOAT4 in1 = (y4 + 1 < bhw) ? vload4(0, input + input_offset + 4) : (FLOAT4)0; + FLOAT4 in2 = (y4 + 2 < bhw) ? vload4(0, input + input_offset + 8) : (FLOAT4)0; + FLOAT4 in3 = (y4 + 3 < bhw) ? vload4(0, input + input_offset + 12) : (FLOAT4)0; + +#ifdef INPUT_CHANNEL_LEAVE + if(x4 + 3 >= channel){ + FLOAT *in0_ptr = (FLOAT*)&in0; + FLOAT *in1_ptr = (FLOAT*)&in1; + FLOAT *in2_ptr = (FLOAT*)&in2; + FLOAT *in3_ptr = (FLOAT*)&in3; + int remain = x4 + 3 - channel; + for(int i = remain; i >= 0; i--){ + in0_ptr[3 - i] = 0; + in1_ptr[3 - i] = 0; + in2_ptr[3 - i] = 0; + in3_ptr[3 - i] = 0; + } + } +#endif + +#ifdef FORMAT_CNHW + int idx = x / 4; + int idy = x % 4; + const int bhw4 = (bhw + 3) / 4 * 4; + int output_offset = ((idx * bhw4 + y4) * 4 + idy) * 4; // [c/16 b 4 4] + vstore4(in0, 0, output+output_offset); + vstore4(in1, 0, output+output_offset+16); + vstore4(in2, 0, output+output_offset+32); + vstore4(in3, 0, output+output_offset+48); +#else + FLOAT16 out = (FLOAT16)(in0.s0, in1.s0, in2.s0, in3.s0, in0.s1, in1.s1, in2.s1, in3.s1, in0.s2, in1.s2, in2.s2, in3.s2, in0.s3, in1.s3, in2.s3, in3.s3); + const int output_offset = (y * channelAlign + x4) * 4; + vstore16(out, 0, output+output_offset); +#endif +} + +__kernel void reshape_nhwc4_nchw4(GLOBAL_SIZE_DIM2 +__global const FLOAT* input, +__global FLOAT* output, +__private const int bhw, +__private const int channelAlign){ + const int x = get_global_id(0); //c + const int y = get_global_id(1); //bhw + + UNIFORM_BOUNDRY_CHECK(x, y); + + const int x4 = x << 2; + const int y4 = y << 2; + const int output_offset = (x * bhw + y4) * 4; + + + const int input_offset = (y * channelAlign + x4) * 4; + FLOAT16 in = vload16(0, input + input_offset); + + FLOAT4 out0 = (FLOAT4)(in.s0, in.s4, in.s8, in.sc); + FLOAT4 out1 = (FLOAT4)(in.s1, in.s5, in.s9, in.sd); + FLOAT4 out2 = (FLOAT4)(in.s2, in.s6, in.sa, in.se); + FLOAT4 out3 = (FLOAT4)(in.s3, in.s7, in.sb, in.sf); + + vstore4(out0, 0, output+output_offset); + if(y4 + 1 >= bhw) return; + vstore4(out1, 0, output+output_offset+4); + if(y4 + 2 >= bhw) return; + vstore4(out2, 0, output+output_offset+8); + if(y4 + 3 >= bhw) return; + vstore4(out3, 0, output+output_offset+12); +} + + +__kernel void gemm_b4_c4_buf(GLOBAL_SIZE_DIM2 + __global const FLOAT* input, 
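+                        // (note, inferred from reshape_nchw4_nhwc4 above: "input" holds activations
+                        //  repacked into 4-batch x 4-channel tiles along srcChannelAlign, or the
+                        //  [c/16, bhw4, 4, 4] tiling when FORMAT_CNHW is defined)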
+#ifdef USE_IMAGE + __read_only image2d_t weight, +#else +#if (defined USE_LOW_BIT_WEIGHT_INT8) + __global const char *weight, +#elif (defined USE_LOW_BIT_WEIGHT_INT4) + __global const uchar *weight, +#endif +#endif + __global const float *dequantScaleOffset, + __global const FLOAT *bias, + __global FLOAT* output, + __private const int bhw4, + __private const int dstChannelAlign, + __private const int srcChannelAlign, + __private const int blockNum, + __private const int blockDim) { + const int x = get_global_id(0); //c + const int y = get_global_id(1); //b + + UNIFORM_BOUNDRY_CHECK(x, y); + + const int out_c_idx = x << 2; + const int out_b_idx = y << 2; + + COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(0, bias + out_c_idx)); + COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; + COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1, out2 = (COMPUTE_FLOAT4)bias0.s2, out3 = (COMPUTE_FLOAT4)bias0.s3; + +#ifdef FORMAT_CNHW + int input_offset = out_b_idx * 16; +#else + int input_offset = out_b_idx * srcChannelAlign; +#endif + int out_offset = out_b_idx * dstChannelAlign + out_c_idx * 4; + +#ifndef USE_IMAGE + int weight_offset = out_c_idx * WEIGHT_STRIDE; + int weight_oc_offset = dstChannelAlign * WEIGHT_STRIDE; +#endif + + const int loop = (blockDim + CHANNEL_PACK - 1) / CHANNEL_PACK; + + for (int i = 0; i < blockNum; i++){ + int kindex = i * dstChannelAlign * 2; + COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(0, dequantScaleOffset + kindex + out_c_idx * 2)); + for (int j = 0; j < loop; j++) { + int k = i * loop + j; + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11, weights20, weights21, weights30, weights31; + { + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); + uchar16 charWeightsInt41 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k))); + uchar16 charWeightsInt42 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 2, k))); + uchar16 charWeightsInt43 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 3, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); + weights10 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; + weights11 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt42); + weights20 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; + weights21 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s4 + ScaleOffset.s5; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt43); + weights30 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s6 + ScaleOffset.s7; + weights31 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; + } + #ifdef FORMAT_CNHW + int k2 = k << 1; + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16)); + DOT16X16(in, weights00, out.s0); + DOT16X16(in, weights10, out1.s0); + DOT16X16(in, weights20, out2.s0); + DOT16X16(in, weights30, out3.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 16)); + DOT16X16(in, weights00, out.s1); + DOT16X16(in, weights10, out1.s1); 
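/* Editor's note (illustrative, not part of the patch): the int4 image path above reads one
   uchar16 per output channel, i.e. 32 packed 4-bit weights. UCHAR16_TO_2CHAR16 splits every
   byte into its high and low nibble and subtracts the fixed zero point 8, and the result is
   dequantized per block as w = q * scale + offset. A minimal host-side C sketch of that step;
   the name unpack_dequant_int4 and its parameters are illustrative only. */
static void unpack_dequant_int4(const unsigned char *packed, int n_bytes,
                                float scale, float offset, float *out /* holds 2 * n_bytes */) {
    for (int i = 0; i < n_bytes; ++i) {
        int hi = (packed[i] >> 4) - 8;                 /* high nibble first, as in the macro */
        int lo = (packed[i] & 15) - 8;                 /* low nibble second */
        out[2 * i + 0] = (float)hi * scale + offset;   /* per-block dequantization */
        out[2 * i + 1] = (float)lo * scale + offset;
    }
}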
+ DOT16X16(in, weights20, out2.s1); + DOT16X16(in, weights30, out3.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 32)); + DOT16X16(in, weights00, out.s2); + DOT16X16(in, weights10, out1.s2); + DOT16X16(in, weights20, out2.s2); + DOT16X16(in, weights30, out3.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 48)); + DOT16X16(in, weights00, out.s3); + DOT16X16(in, weights10, out1.s3); + DOT16X16(in, weights20, out2.s3); + DOT16X16(in, weights30, out3.s3); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16)); + DOT16X16(in, weights01, out.s0); + DOT16X16(in, weights11, out1.s0); + DOT16X16(in, weights21, out2.s0); + DOT16X16(in, weights31, out3.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 16)); + DOT16X16(in, weights01, out.s1); + DOT16X16(in, weights11, out1.s1); + DOT16X16(in, weights21, out2.s1); + DOT16X16(in, weights31, out3.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 32)); + DOT16X16(in, weights01, out.s2); + DOT16X16(in, weights11, out1.s2); + DOT16X16(in, weights21, out2.s2); + DOT16X16(in, weights31, out3.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 48)); + DOT16X16(in, weights01, out.s3); + DOT16X16(in, weights11, out1.s3); + DOT16X16(in, weights21, out2.s3); + DOT16X16(in, weights31, out3.s3); + #else + int k32 = k << 5; + COMPUTE_FLOAT *weights00_ptr = (COMPUTE_FLOAT *)&weights00; + COMPUTE_FLOAT *weights10_ptr = (COMPUTE_FLOAT *)&weights10; + COMPUTE_FLOAT *weights20_ptr = (COMPUTE_FLOAT *)&weights20; + COMPUTE_FLOAT *weights30_ptr = (COMPUTE_FLOAT *)&weights30; + COMPUTE_FLOAT *weights01_ptr = (COMPUTE_FLOAT *)&weights01; + COMPUTE_FLOAT *weights11_ptr = (COMPUTE_FLOAT *)&weights11; + COMPUTE_FLOAT *weights21_ptr = (COMPUTE_FLOAT *)&weights21; + COMPUTE_FLOAT *weights31_ptr = (COMPUTE_FLOAT *)&weights31; + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); + out = mad(in, weights00_ptr[i], out); + out1 = mad(in, weights10_ptr[i], out1); + out2 = mad(in, weights20_ptr[i], out2); + out3 = mad(in, weights30_ptr[i], out3); + } + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i + 16) * 4)); + out = mad(in, weights01_ptr[i], out); + out1 = mad(in, weights11_ptr[i], out1); + out2 = mad(in, weights21_ptr[i], out2); + out3 = mad(in, weights31_ptr[i], out3); + } + #endif + #else + COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; + #ifdef USE_IMAGE + weights0 = readWeight(weight, out_c_idx, k, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight, out_c_idx + 1, k, ScaleOffset.s2, ScaleOffset.s3); + weights2 = readWeight(weight, out_c_idx + 2, k, ScaleOffset.s4, ScaleOffset.s5); + weights3 = readWeight(weight, out_c_idx + 3, k, ScaleOffset.s6, ScaleOffset.s7); + #else + weights0 = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight + weight_offset + k * weight_oc_offset + WEIGHT_STRIDE, 0, 0, ScaleOffset.s2, ScaleOffset.s3); + weights2 = readWeight(weight + weight_offset + k * weight_oc_offset + 2 * WEIGHT_STRIDE, 0, 0, ScaleOffset.s4, ScaleOffset.s5); + weights3 = readWeight(weight + weight_offset + k * weight_oc_offset + 3 * WEIGHT_STRIDE, 0, 0, ScaleOffset.s6, 
ScaleOffset.s7); + #endif + #ifdef FORMAT_CNHW + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16)); + DOT16X16(in, weights0, out.s0); + DOT16X16(in, weights1, out1.s0); + DOT16X16(in, weights2, out2.s0); + DOT16X16(in, weights3, out3.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 16)); + DOT16X16(in, weights0, out.s1); + DOT16X16(in, weights1, out1.s1); + DOT16X16(in, weights2, out2.s1); + DOT16X16(in, weights3, out3.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 32)); + DOT16X16(in, weights0, out.s2); + DOT16X16(in, weights1, out1.s2); + DOT16X16(in, weights2, out2.s2); + DOT16X16(in, weights3, out3.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 48)); + DOT16X16(in, weights0, out.s3); + DOT16X16(in, weights1, out1.s3); + DOT16X16(in, weights2, out2.s3); + DOT16X16(in, weights3, out3.s3); + #else + int k16 = k << 4; + COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; + COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; + COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; + COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); + out = mad(in, weights0_ptr[i], out); + out1 = mad(in, weights1_ptr[i], out1); + out2 = mad(in, weights2_ptr[i], out2); + out3 = mad(in, weights3_ptr[i], out3); + } + #endif + #endif + } + } +#ifdef RELU + out = fmax(out, (COMPUTE_FLOAT4)0); + out1 = fmax(out1, (COMPUTE_FLOAT4)0); + out2 = fmax(out2, (COMPUTE_FLOAT4)0); + out3 = fmax(out3, (COMPUTE_FLOAT4)0); +#endif + +#ifdef RELU6 + out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); + out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); + out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); + out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); +#endif + + vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); + vstore4(CONVERT_FLOAT4(out1), 0, output+out_offset + 4); + vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset + 8); + vstore4(CONVERT_FLOAT4(out3), 0, output+out_offset + 12); +} + +__kernel void gemm_b4_c2_buf(GLOBAL_SIZE_DIM2 + __global const FLOAT* input, +#ifdef USE_IMAGE + __read_only image2d_t weight, +#else +#if (defined USE_LOW_BIT_WEIGHT_INT8) + __global const char *weight, +#elif (defined USE_LOW_BIT_WEIGHT_INT4) + __global const uchar *weight, +#endif +#endif + __global const float *dequantScaleOffset, + __global const FLOAT *bias, + __global FLOAT* output, + __private const int bhw4, + __private const int dstChannelAlign, + __private const int srcChannelAlign, + __private const int blockNum, + __private const int blockDim) { + const int x = get_global_id(0); //c + const int y = get_global_id(1); //b + + UNIFORM_BOUNDRY_CHECK(x, y); + + const int out_c_idx = x << 1; + const int out_b_idx = y << 2; + + COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(0, bias + out_c_idx)); + COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; + COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1; + +#ifdef FORMAT_CNHW + int input_offset = out_b_idx * 16; +#else + int input_offset = out_b_idx * srcChannelAlign; +#endif + int out_offset = out_b_idx * dstChannelAlign + out_c_idx * 4; + +#ifndef USE_IMAGE + int weight_offset = out_c_idx * WEIGHT_STRIDE; + int weight_oc_offset = dstChannelAlign * WEIGHT_STRIDE; +#endif + + const int loop = (blockDim + CHANNEL_PACK - 1) / 
CHANNEL_PACK; + + for (int i = 0; i < blockNum; i++){ + int kindex = i * dstChannelAlign * 2; + COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + kindex + out_c_idx * 2)); + for (int j = 0; j < loop; j++) { + int k = i * loop + j; + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11; + { + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); + uchar16 charWeightsInt41 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); + weights10 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; + weights11 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + } + #ifdef FORMAT_CNHW + int k2 = k << 1; + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16)); + DOT16X16(in, weights00, out.s0); + DOT16X16(in, weights10, out1.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 16)); + DOT16X16(in, weights00, out.s1); + DOT16X16(in, weights10, out1.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 32)); + DOT16X16(in, weights00, out.s2); + DOT16X16(in, weights10, out1.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 48)); + DOT16X16(in, weights00, out.s3); + DOT16X16(in, weights10, out1.s3); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16)); + DOT16X16(in, weights01, out.s0); + DOT16X16(in, weights11, out1.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 16)); + DOT16X16(in, weights01, out.s1); + DOT16X16(in, weights11, out1.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 32)); + DOT16X16(in, weights01, out.s2); + DOT16X16(in, weights11, out1.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 48)); + DOT16X16(in, weights01, out.s3); + DOT16X16(in, weights11, out1.s3); + #else + int k32 = k << 5; + COMPUTE_FLOAT *weights00_ptr = (COMPUTE_FLOAT *)&weights00; + COMPUTE_FLOAT *weights10_ptr = (COMPUTE_FLOAT *)&weights10; + COMPUTE_FLOAT *weights01_ptr = (COMPUTE_FLOAT *)&weights01; + COMPUTE_FLOAT *weights11_ptr = (COMPUTE_FLOAT *)&weights11; + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); + out = mad(in, weights00_ptr[i], out); + out1 = mad(in, weights10_ptr[i], out1); + } + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i + 16) * 4)); + out = mad(in, weights01_ptr[i], out); + out1 = mad(in, weights11_ptr[i], out1); + } + #endif + #else + COMPUTE_FLOAT16 weights0, weights1; + #ifdef USE_IMAGE + weights0 = readWeight(weight, out_c_idx, k, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight, out_c_idx + 1, k, ScaleOffset.s2, ScaleOffset.s3); + #else + weights0 = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + weights1 
= readWeight(weight + weight_offset + k * weight_oc_offset + WEIGHT_STRIDE, 0, 0, ScaleOffset.s2, ScaleOffset.s3); + #endif + #ifdef FORMAT_CNHW + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16)); + DOT16X16(in, weights0, out.s0); + DOT16X16(in, weights1, out1.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 16)); + DOT16X16(in, weights0, out.s1); + DOT16X16(in, weights1, out1.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 32)); + DOT16X16(in, weights0, out.s2); + DOT16X16(in, weights1, out1.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 48)); + DOT16X16(in, weights0, out.s3); + DOT16X16(in, weights1, out1.s3); + #else + int k16 = k << 4; + COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; + COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); + out = mad(in, weights0_ptr[i], out); + out1 = mad(in, weights1_ptr[i], out1); + } + #endif + #endif + } + } + +#ifdef RELU + out = fmax(out, (COMPUTE_FLOAT4)0); + out1 = fmax(out1, (COMPUTE_FLOAT4)0); +#endif + +#ifdef RELU6 + out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); + out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); +#endif + + vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); + vstore4(CONVERT_FLOAT4(out1), 0, output+out_offset+4); +} + +__kernel void gemm_b4_c1_buf(GLOBAL_SIZE_DIM2 + __global const FLOAT* input, +#ifdef USE_IMAGE + __read_only image2d_t weight, +#else +#if (defined USE_LOW_BIT_WEIGHT_INT8) + __global const char *weight, +#elif (defined USE_LOW_BIT_WEIGHT_INT4) + __global const uchar *weight, +#endif +#endif + __global const float *dequantScaleOffset, + __global const FLOAT *bias, + __global FLOAT* output, + __private const int bhw4, + __private const int dstChannelAlign, + __private const int srcChannelAlign, + __private const int blockNum, + __private const int blockDim) { + const int x = get_global_id(0); //c + const int y = get_global_id(1); //b + + UNIFORM_BOUNDRY_CHECK(x, y); + + const int out_c_idx = x; + const int out_b_idx = y << 2; + + COMPUTE_FLOAT bias0 = bias[out_c_idx]; + COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0; + +#ifdef FORMAT_CNHW + int input_offset = out_b_idx * 16; +#else + int input_offset = out_b_idx * srcChannelAlign; +#endif + int out_offset = out_b_idx * dstChannelAlign + out_c_idx * 4; + +#ifndef USE_IMAGE + int weight_offset = out_c_idx * WEIGHT_STRIDE; + int weight_oc_offset = dstChannelAlign * WEIGHT_STRIDE; +#endif + + const int loop = (blockDim + CHANNEL_PACK - 1) / CHANNEL_PACK; + + for (int i = 0; i < blockNum; i++){ + int kindex = i * dstChannelAlign * 2; + COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex)); + for (int j = 0; j < loop; j++) { + int k = i * loop + j; + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11; + { + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + } + #ifdef FORMAT_CNHW + int k2 = k << 1; + 
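/* Editor's note (illustrative, not part of the patch): under FORMAT_CNHW the reshape kernel
   above stores the activation as [channel/16][bhw4][16], with bhw4 = bhw rounded up to a
   multiple of 4, so each vload16 below fetches the 16 channels of one bhw element
   contiguously; in this int4 image path a single k covers 32 channels, hence k2 = k << 1
   converts it to 16-channel packs before indexing. The offset arithmetic as I read it
   (cnhw_offset is an illustrative name, not from the patch): */
static int cnhw_offset(int c, int b, int bhw) {
    int bhw4 = (bhw + 3) / 4 * 4;   /* bhw padded to a multiple of 4, as in the kernels */
    int c16  = c / 16;              /* 16-channel pack index */
    int cin  = c % 16;              /* lane within the pack, covered by one vload16 */
    return (c16 * bhw4 + b) * 16 + cin;
}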
COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16)); + DOT16X16(in, weights00, out.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 16)); + DOT16X16(in, weights00, out.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 32)); + DOT16X16(in, weights00, out.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k2 * bhw4 * 16 + 48)); + DOT16X16(in, weights00, out.s3); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16)); + DOT16X16(in, weights01, out.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 16)); + DOT16X16(in, weights01, out.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 32)); + DOT16X16(in, weights01, out.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + (k2 + 1) * bhw4 * 16 + 48)); + DOT16X16(in, weights01, out.s3); + #else + int k32 = k << 5; + COMPUTE_FLOAT *weights00_ptr = (COMPUTE_FLOAT *)&weights00; + COMPUTE_FLOAT *weights01_ptr = (COMPUTE_FLOAT *)&weights01; + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i) * 4)); + out = mad(in, weights00_ptr[i], out); + } + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k32 + i + 16) * 4)); + out = mad(in, weights01_ptr[i], out); + } + #endif + #else + COMPUTE_FLOAT16 weights; + #ifdef USE_IMAGE + weights = readWeight(weight, out_c_idx, k, ScaleOffset.s0, ScaleOffset.s1); + #else + weights = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + #endif + #ifdef FORMAT_CNHW + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16)); + DOT16X16(in, weights, out.s0); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 16)); + DOT16X16(in, weights, out.s1); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 32)); + DOT16X16(in, weights, out.s2); + in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * bhw4 * 16 + 48)); + DOT16X16(in, weights, out.s3); + #else + int k16 = k << 4; + COMPUTE_FLOAT *weights_ptr = (COMPUTE_FLOAT *)&weights; + #pragma unroll + for (int i = 0; i < 16; ++i){ + COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); + out = mad(in, weights_ptr[i], out); + } + #endif + #endif + } + } + +#ifdef RELU + out = fmax(out, (COMPUTE_FLOAT4)0); +#endif + +#ifdef RELU6 + out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); +#endif + vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); +} diff --git a/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl b/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl deleted file mode 100644 index 083268503..000000000 --- a/source/backend/opencl/execution/cl/gemm_quant_batch_buf.cl +++ /dev/null @@ -1,821 +0,0 @@ -#ifdef MNN_SUPPORT_FP16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif - -#define GLOBAL_SIZE_DIM2 \ - __private int global_size_dim0, __private int global_size_dim1, - -#define UNIFORM_BOUNDRY_CHECK(index0, index1) \ - if(index0 >= global_size_dim0 || index1 >= global_size_dim1) { \ - return; \ - } - -#define GLOBAL_SIZE_DIM3 \ - __private int global_size_dim0, __private int global_size_dim1, __private int global_size_dim2, - -#define 
UNIFORM_BOUNDRY_CHECK3(index0, index1, index2) \ - if(index0 >= global_size_dim0 || index1 >= global_size_dim1 || index2 >= global_size_dim2) { \ - return; \ - } - -#define UCHAR16_TO_2CHAR16(a, b, c) \ - a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; \ - a.s8 = (c.s4 >> 4) - 8; a.s9 = (c.s4 & 15) - 8; a.sa = (c.s5 >> 4) - 8; a.sb = (c.s5 & 15) - 8; a.sc = (c.s6 >> 4) - 8; a.sd = (c.s6 & 15) - 8; a.se = (c.s7 >> 4) - 8; a.sf = (c.s7 & 15) - 8; \ - b.s0 = (c.s8 >> 4) - 8; b.s1 = (c.s8 & 15) - 8; b.s2 = (c.s9 >> 4) - 8; b.s3 = (c.s9 & 15) - 8; b.s4 = (c.sa >> 4) - 8; b.s5 = (c.sa & 15) - 8; b.s6 = (c.sb >> 4) - 8; b.s7 = (c.sb & 15) - 8; \ - b.s8 = (c.sc >> 4) - 8; b.s9 = (c.sc & 15) - 8; b.sa = (c.sd >> 4) - 8; b.sb = (c.sd & 15) - 8; b.sc = (c.se >> 4) - 8; b.sd = (c.se & 15) - 8; b.se = (c.sf >> 4) - 8; b.sf = (c.sf & 15) - 8; - -#define UCHAR8_TO_CHAR16(a, c) \ - a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; \ - a.s8 = (c.s4 >> 4) - 8; a.s9 = (c.s4 & 15) - 8; a.sa = (c.s5 >> 4) - 8; a.sb = (c.s5 & 15) - 8; a.sc = (c.s6 >> 4) - 8; a.sd = (c.s6 & 15) - 8; a.se = (c.s7 >> 4) - 8; a.sf = (c.s7 & 15) - 8; - -#define DOT16X16(a, b, c) \ - c += dot(a.s0123, b.s0123); \ - c += dot(a.s4567, b.s4567); \ - c += dot(a.s89ab, b.s89ab); \ - c += dot(a.scdef, b.scdef); - -__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - -__kernel void reshape_nchw4_nhwc4(GLOBAL_SIZE_DIM3 -__global const FLOAT* input, -__global FLOAT* output, -__private const int width_height, -__private const int batch, -__private const int channel, -__private const int channelC4){ - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - const int wh = get_global_id(2); // w*h - - UNIFORM_BOUNDRY_CHECK3(x, y, wh); - - const int x4 = x << 2; - const int y4 = y << 2; - const int channel4 = channelC4 * 4; - const int stride = channel4 * width_height; - const int input_offset = (y4 * channel4 + x4) * width_height + wh * 4; - const int output_offset = ((y * width_height + wh) * channel4 + x4) * 4; - FLOAT4 in0 = vload4(0, input + input_offset); - FLOAT4 in1 = (y4 + 1 < batch) ? vload4(0, input + input_offset + stride) : (FLOAT4)0; - FLOAT4 in2 = (y4 + 2 < batch) ? vload4(0, input + input_offset + 2 * stride) : (FLOAT4)0; - FLOAT4 in3 = (y4 + 3 < batch) ? 
vload4(0, input + input_offset + 3 * stride) : (FLOAT4)0; - -#ifdef INPUT_CHANNEL_LEAVE - if(x4 + 3 >= channel){ - FLOAT *in0_ptr = (FLOAT*)&in0; - FLOAT *in1_ptr = (FLOAT*)&in1; - FLOAT *in2_ptr = (FLOAT*)&in2; - FLOAT *in3_ptr = (FLOAT*)&in3; - int remain = x4 + 3 - channel; - for(int i = remain; i >= 0; i--){ - in0_ptr[3 - remain] = 0; - in1_ptr[3 - remain] = 0; - in2_ptr[3 - remain] = 0; - in3_ptr[3 - remain] = 0; - } - } -#endif - - FLOAT16 out = (FLOAT16)(in0.s0, in1.s0, in2.s0, in3.s0, in0.s1, in1.s1, in2.s1, in3.s1, in0.s2, in1.s2, in2.s2, in3.s2, in0.s3, in1.s3, in2.s3, in3.s3); - - vstore16(out, 0, output+output_offset); -} - -__kernel void reshape_nhwc4_nchw4(GLOBAL_SIZE_DIM3 -__global const FLOAT* input, -__global FLOAT* output, -__private const int width_height, -__private const int batch, -__private const int channelC4){ - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - const int wh = get_global_id(2); //w*h - - UNIFORM_BOUNDRY_CHECK3(x, y, wh); - - const int x4 = x << 2; - const int y4 = y << 2; - const int channel4 = channelC4 * 4; - const int stride = channel4 * width_height; - const int input_offset = ((y * width_height + wh) * channel4 + x4) * 4; - const int output_offset = (y4 * channel4 + x4) * width_height + wh * 4; - FLOAT16 in = vload16(0, input + input_offset); - - FLOAT4 out0 = (FLOAT4)(in.s0, in.s4, in.s8, in.sc); - FLOAT4 out1 = (FLOAT4)(in.s1, in.s5, in.s9, in.sd); - FLOAT4 out2 = (FLOAT4)(in.s2, in.s6, in.sa, in.se); - FLOAT4 out3 = (FLOAT4)(in.s3, in.s7, in.sb, in.sf); - - vstore4(out0, 0, output+output_offset); - if(y4 + 1 >= batch) return; - vstore4(out1, 0, output+output_offset+stride); - if(y4 + 2 >= batch) return; - vstore4(out2, 0, output+output_offset+2*stride); - if(y4 + 3 >= batch) return; - vstore4(out3, 0, output+output_offset+3*stride); -} - - -__kernel void gemm_b4_c4_buf(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, -#if (defined USE_LOW_BIT_WEIGHT_INT8) - __global const char *weight, -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - __global const uchar *weight, -#endif - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int blockNum, - __private const int blockDim) { - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - - UNIFORM_BOUNDRY_CHECK(x, y); - - const int out_c_idx = x; - const int out_b_idx = y << 2; - - COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias)); - COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; - COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1, out2 = (COMPUTE_FLOAT4)bias0.s2, out3 = (COMPUTE_FLOAT4)bias0.s3; - - int input_offset = out_b_idx * srcChannelC4 * 4; - int out_offset = (out_b_idx * dstChannelC4 + out_c_idx * 4) * 4; - -#if (defined USE_LOW_BIT_WEIGHT_INT4) - int weight_offset = out_c_idx * 4 * 8; - int weight_oc_offset = dstChannelC4 * 32; -#else - int weight_offset = out_c_idx * 4 * 16; - int weight_oc_offset = dstChannelC4 * 64; -#endif - - const int loop = (blockDim + 15) / 16; -#ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*16; -#else - const int loop_end = loop; -#endif - - for (int i = 0; i < blockNum; i++){ - int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx, dequantScaleOffset + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - int k16 = k << 4; - 
COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; -#if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; - weights2 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 32)) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 48)) * ScaleOffset.s6 + ScaleOffset.s7; -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - { - uchar16 charWeightsInt40 = vload16(0, weight + weight_offset + k * weight_oc_offset); - uchar16 charWeightsInt41 = vload16(0, weight + weight_offset + k * weight_oc_offset + 16); - { - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; - } - } -#endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; - COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - out2 = mad(in, weights2_ptr[i], out2); - out3 = mad(in, weights3_ptr[i], out3); - } - } -#ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k16 = k << 4; - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; -#if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; - weights2 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 32)) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 48)) * ScaleOffset.s6 + ScaleOffset.s7; -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - { - uchar16 charWeightsInt40 = vload16(0, weight + weight_offset + k * weight_oc_offset); - uchar16 charWeightsInt41 = vload16(0, weight + weight_offset + k * weight_oc_offset + 16); - { - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; - } - } -#endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT 
*weights1_ptr = (COMPUTE_FLOAT *)&weights1; - COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; - COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; - for (int i = 0; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - out2 = mad(in, weights2_ptr[i], out2); - out3 = mad(in, weights3_ptr[i], out3); - } - } -#endif - } -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT4)0); - out1 = fmax(out1, (COMPUTE_FLOAT4)0); - out2 = fmax(out2, (COMPUTE_FLOAT4)0); - out3 = fmax(out3, (COMPUTE_FLOAT4)0); -#endif - -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - - vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); - vstore4(CONVERT_FLOAT4(out1), 0, output+out_offset + 4); - vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset + 8); - vstore4(CONVERT_FLOAT4(out3), 0, output+out_offset + 12); -} - -__kernel void gemm_b4_c2_buf(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, -#if (defined USE_LOW_BIT_WEIGHT_INT8) - __global const char *weight, -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - __global const uchar *weight, -#endif - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int blockNum, - __private const int blockDim) { - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - - UNIFORM_BOUNDRY_CHECK(x, y); - - const int out_c_idx = x; - const int out_b_idx = y << 2; - - COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, bias)); - COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; - COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1; - - int input_offset = out_b_idx * srcChannelC4 * 4; - int out_offset = (out_b_idx * dstChannelC4 + out_c_idx * 2) * 4; - -#if (defined USE_LOW_BIT_WEIGHT_INT4) - int weight_offset = out_c_idx * 2 * 8; - int weight_oc_offset = dstChannelC4 * 32; -#else - int weight_offset = out_c_idx * 2 * 16; - int weight_oc_offset = dstChannelC4 * 64; -#endif - - const int loop = (blockDim + 15) / 16; -#ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*16; -#else - const int loop_end = loop; -#endif - - for (int i = 0; i < blockNum; i++){ - int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, dequantScaleOffset + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - int k16 = k << 4; - COMPUTE_FLOAT16 weights0, weights1; -#if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - { - uchar16 charWeightsInt4 = vload16(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - 
} -#endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - } - } -#ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k16 = k << 4; - - COMPUTE_FLOAT16 weights0, weights1; -#if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - { - uchar16 charWeightsInt4 = vload16(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - } -#endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - for (int i = 0; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - } - } -#endif - } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT4)0); - out1 = fmax(out1, (COMPUTE_FLOAT4)0); -#endif - -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - - vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); - vstore4(CONVERT_FLOAT4(out1), 0, output+out_offset+4); -} - -__kernel void gemm_b4_c1_buf(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, -#if (defined USE_LOW_BIT_WEIGHT_INT8) - __global const char *weight, -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - __global const uchar *weight, -#endif - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int blockNum, - __private const int blockDim) { - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - - UNIFORM_BOUNDRY_CHECK(x, y); - - const int out_c_idx = x; - const int out_b_idx = y << 2; - - COMPUTE_FLOAT bias0 = bias[out_c_idx]; - COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0; - - int input_offset = out_b_idx * srcChannelC4 * 4; - int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4; - -#if (defined USE_LOW_BIT_WEIGHT_INT4) - int weight_offset = out_c_idx * 8; - int weight_oc_offset = dstChannelC4 * 32; -#else - int weight_offset = out_c_idx * 16; - int weight_oc_offset = dstChannelC4 * 64; -#endif - - const int loop = (blockDim + 15) / 16; -#ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*16; -#else - const int loop_end = loop; -#endif - - for (int i = 0; i < blockNum; i++){ - int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - int k16 = k << 4; - COMPUTE_FLOAT16 weights; -#if (defined USE_LOW_BIT_WEIGHT_INT8) - weights = 
CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - { - uchar8 charWeightsInt4 = vload8(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; - } -#endif - COMPUTE_FLOAT *weights_ptr = (COMPUTE_FLOAT *)&weights; - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights_ptr[i], out); - } - } -#ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k16 = k << 4; - COMPUTE_FLOAT16 weights; -#if (defined USE_LOW_BIT_WEIGHT_INT8) - weights = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; -#elif (defined USE_LOW_BIT_WEIGHT_INT4) - { - uchar8 charWeightsInt4 = vload8(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; - } -#endif - COMPUTE_FLOAT *weights_ptr = (COMPUTE_FLOAT *)&weights; - for (int i = 0; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights_ptr[i], out); - } - } -#endif - } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT4)0); -#endif - -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); -} - -__kernel void gemm_b4_c4_image(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, - __read_only image2d_t weight, - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int blockNum, - __private const int blockDim) { - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - UNIFORM_BOUNDRY_CHECK(x, y); - - const int out_c_idx = x << 2; - const int out_b_idx = y << 2; - - COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(0, bias + out_c_idx)); - COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; - COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1; - COMPUTE_FLOAT4 out2 = (COMPUTE_FLOAT4)bias0.s2; - COMPUTE_FLOAT4 out3 = (COMPUTE_FLOAT4)bias0.s3; - - int input_offset = out_b_idx * srcChannelC4 * 4; - int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4; - - const int loop = (blockDim + 15) / 16; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*16; - #else - const int loop_end = loop; - #endif - - for (int i = 0; i < blockNum; i++){ - int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(0, dequantScaleOffset + out_c_idx * 2 + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - int k16 = k << 4; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; - COMPUTE_FLOAT16 weights2 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 2, 
k)))) * ScaleOffset.s4 + ScaleOffset.s5; - COMPUTE_FLOAT16 weights3 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 3, k)))) * ScaleOffset.s6 + ScaleOffset.s7; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; - { - uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); - uchar8 charWeightsInt42 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 2, k)))); - uchar8 charWeightsInt43 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 3, k)))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - char16 charWeights2 = 0; - char16 charWeights3 = 0; - UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); - UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); - UCHAR8_TO_CHAR16(charWeights2, charWeightsInt42); - UCHAR8_TO_CHAR16(charWeights3, charWeightsInt43); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights2) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights3) * ScaleOffset.s6 + ScaleOffset.s7; - } - #endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; - COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - out2 = mad(in, weights2_ptr[i], out2); - out3 = mad(in, weights3_ptr[i], out3); - } - } -#ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k16 = k << 4; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; - COMPUTE_FLOAT16 weights2 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 2, k)))) * ScaleOffset.s4 + ScaleOffset.s5; - COMPUTE_FLOAT16 weights3 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 3, k)))) * ScaleOffset.s6 + ScaleOffset.s7; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; - { - uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); - uchar8 charWeightsInt42 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 2, k)))); - uchar8 charWeightsInt43 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 3, k)))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - char16 charWeights2 = 0; - char16 charWeights3 = 0; - UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); - UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); - UCHAR8_TO_CHAR16(charWeights2, charWeightsInt42); - 
UCHAR8_TO_CHAR16(charWeights3, charWeightsInt43); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights2) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights3) * ScaleOffset.s6 + ScaleOffset.s7; - } - #endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - COMPUTE_FLOAT *weights2_ptr = (COMPUTE_FLOAT *)&weights2; - COMPUTE_FLOAT *weights3_ptr = (COMPUTE_FLOAT *)&weights3; - #pragma unroll - for (int i = 0; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - out2 = mad(in, weights2_ptr[i], out2); - out3 = mad(in, weights3_ptr[i], out3); - } - } -#endif - } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT4)0); - out1 = fmax(out1, (COMPUTE_FLOAT4)0); - out2 = fmax(out2, (COMPUTE_FLOAT4)0); - out3 = fmax(out3, (COMPUTE_FLOAT4)0); -#endif -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - vstore4(CONVERT_FLOAT4(out), 0, output + out_offset); - vstore4(CONVERT_FLOAT4(out1), 0, output + out_offset + 4); - vstore4(CONVERT_FLOAT4(out2), 0, output + out_offset + 8); - vstore4(CONVERT_FLOAT4(out3), 0, output + out_offset + 12); -} -__kernel void gemm_b4_c2_image(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, - __read_only image2d_t weight, - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int blockNum, - __private const int blockDim) { - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - UNIFORM_BOUNDRY_CHECK(x, y); - - const int out_c_idx = x << 1; - const int out_b_idx = y << 2; - - COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(0, bias + out_c_idx)); - COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0.s0; - COMPUTE_FLOAT4 out1 = (COMPUTE_FLOAT4)bias0.s1; - - int input_offset = out_b_idx * srcChannelC4 * 4; - int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4; - - const int loop = (blockDim + 15) / 16; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*16; - #else - const int loop_end = loop; - #endif - - for (int i = 0; i < blockNum; i++){ - int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - int k16 = k << 4; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0, weights1; - { - uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - uchar8 charWeightsInt41 = 
as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); - UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - } - #endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - } - } -#ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k16 = k << 4; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0, weights1; - { - uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); - UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - } - #endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - COMPUTE_FLOAT *weights1_ptr = (COMPUTE_FLOAT *)&weights1; - #pragma unroll - for (int i = 0; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - out1 = mad(in, weights1_ptr[i], out1); - } - } -#endif - } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT4)0); - out1 = fmax(out1, (COMPUTE_FLOAT4)0); -#endif -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); - out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - vstore4(CONVERT_FLOAT4(out), 0, output + out_offset); - vstore4(CONVERT_FLOAT4(out1), 0, output + out_offset + 4); -} -__kernel void gemm_b4_c1_image(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, - __read_only image2d_t weight, - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int blockNum, - __private const int blockDim) { - const int x = get_global_id(0); //c - const int y = get_global_id(1); //b - UNIFORM_BOUNDRY_CHECK(x, y); - - const int out_c_idx = x; - const int out_b_idx = y << 2; - - COMPUTE_FLOAT bias0 = bias[out_c_idx]; - COMPUTE_FLOAT4 out = (COMPUTE_FLOAT4)bias0; - - int input_offset = out_b_idx * srcChannelC4 * 4; - int out_offset = (out_b_idx * dstChannelC4 + out_c_idx) * 4; - - const int loop = (blockDim + 15) / 16; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - const int remain = blockDim - loop_end*16; - #else - const int loop_end = loop; - #endif - - for (int i = 0; i < blockNum; ++i){ 
- int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - int k16 = k << 4; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0; - { - uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; - } - #endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - #pragma unroll - for (int i = 0; i < 16; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - } - } -#ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k16 = k << 4; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0; - { - uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; - } - #endif - COMPUTE_FLOAT *weights0_ptr = (COMPUTE_FLOAT *)&weights0; - #pragma unroll - for (int i = 0; i < remain; ++i){ - COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k16 + i) * 4)); - out = mad(in, weights0_ptr[i], out); - } - } -#endif - } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT4)0); -#endif -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); -} - diff --git a/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl b/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl index 82b7b02db..df362ce09 100644 --- a/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl +++ b/source/backend/opencl/execution/cl/gemv_conv1x1_buf.cl @@ -31,21 +31,58 @@ COMPUTE_FLOAT* ptr = (COMPUTE_FLOAT*)&data; \ int remain = k + 15 - channel; \ for(int r = remain; r >= 0; r--){ \ - ptr[15 - remain] = 0; \ + ptr[15 - r] = 0; \ } \ } #else #define PADZEROS(k, channel, data) #endif +#if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) +#define CHANNEL_PACK 32 +#else +#define CHANNEL_PACK 16 +#endif + +#if (defined USE_LOW_BIT_WEIGHT_INT8) +#define WEIGHT_STRIDE 16 +#elif (defined USE_LOW_BIT_WEIGHT_INT4) +#define WEIGHT_STRIDE 8 +#endif + __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef USE_IMAGE +inline COMPUTE_FLOAT16 readWeight(__read_only image2d_t weight, int ix, int iy, COMPUTE_FLOAT scale, COMPUTE_FLOAT offset){ + return CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(ix, iy)))) * scale + offset; +} +#else + +#if (defined USE_LOW_BIT_WEIGHT_INT8) +inline COMPUTE_FLOAT16 readWeight(__global const char *weight, int ix, int iy, COMPUTE_FLOAT scale, COMPUTE_FLOAT offset){ + return CONVERT_COMPUTE_FLOAT16(vload16(0, weight)) * scale + offset; +} +#elif (defined USE_LOW_BIT_WEIGHT_INT4) 
+inline COMPUTE_FLOAT16 readWeight(__global const uchar *weight, int ix, int iy, COMPUTE_FLOAT scale, COMPUTE_FLOAT offset){ + uchar16 charWeightsInt40 = vload16(0, weight); + uchar8 charWeightsInt4 = vload8(0, weight); + char16 charWeights = 0; + UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); + return CONVERT_COMPUTE_FLOAT16(charWeights) * scale + offset; +} +#endif +#endif + -__kernel void gemm_conv_c4_buf(GLOBAL_SIZE_DIM2 +__kernel void gemv_conv_c4_buf(GLOBAL_SIZE_DIM2 __global const FLOAT* input, +#ifdef USE_IMAGE + __read_only image2d_t weight, +#else #if (defined USE_LOW_BIT_WEIGHT_INT8) __global const char *weight, #elif (defined USE_LOW_BIT_WEIGHT_INT4) __global const uchar *weight, +#endif #endif __global const float *dequantScaleOffset, __global const FLOAT *bias, @@ -53,49 +90,28 @@ __kernel void gemm_conv_c4_buf(GLOBAL_SIZE_DIM2 __private const int dstChannelC4, __private const int srcChannelC4, __private const int srcChannel, - __private const int batch, - __private const int height, - __private const int width, + __private const int bhw, __private const int blockNum, __private const int blockDim) { - const int out_c_w_idx = get_global_id(0); //c/4 w - const int out_b_h_idx = get_global_id(1); //b h + const int x = get_global_id(0); //c/4 + const int y = get_global_id(1); //b h w - UNIFORM_BOUNDRY_CHECK(out_c_w_idx, out_b_h_idx); - - const int out_c_idx = out_c_w_idx / width; - const int out_w_idx = out_c_w_idx % width; -#ifdef BACTH_BLOCK4 - const int out_b_idx = (out_b_h_idx / height) << 2; -#else - const int out_b_idx = out_b_h_idx / height; -#endif - const int out_h_idx = out_b_h_idx % height; + UNIFORM_BOUNDRY_CHECK(x, y); - COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias)); - COMPUTE_FLOAT4 out = bias0; -#ifdef BACTH_BLOCK4 - COMPUTE_FLOAT4 out1 = bias0, out2 = bias0, out3 = bias0; - int input_offset1 = (((out_b_idx + 1) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset2 = (((out_b_idx + 2) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset3 = (((out_b_idx + 3) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - bool isValidBatch1 = out_b_idx + 1 < batch; - bool isValidBatch2 = out_b_idx + 2 < batch; - bool isValidBatch3 = out_b_idx + 3 < batch; -#endif + COMPUTE_FLOAT4 bias0 = CONVERT_COMPUTE_FLOAT4(vload4(x, bias)); + COMPUTE_FLOAT4 out0 = bias0; + int idn = x << 2; + int idm = y; - int input_offset = ((out_b_idx * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int out_offset = (((out_b_idx * dstChannelC4 + out_c_idx) * height + out_h_idx) * width + out_w_idx) * 4; - int wh = width * height * 4; -#if (defined USE_LOW_BIT_WEIGHT_INT4) - int weight_offset = out_c_idx * 4 * 8; - int weight_oc_offset = dstChannelC4 * 32; -#else - int weight_offset = out_c_idx * 4 * 16; - int weight_oc_offset = dstChannelC4 * 64; + int input_offset0 = idm * 4; + + int out_offset = (x * bhw + idm) * 4; +#ifndef USE_IMAGE + int weight_offset = x * 4 * WEIGHT_STRIDE; + int weight_oc_offset = dstChannelC4 * 4 * WEIGHT_STRIDE; #endif - const int loop = (blockDim + 15) / 16; + const int loop = (blockDim + CHANNEL_PACK - 1) / CHANNEL_PACK; #ifdef INPUT_CHANNEL_LEAVE const int loop_end = max(loop - 1, 0); #else @@ -104,122 +120,119 @@ __kernel void gemm_conv_c4_buf(GLOBAL_SIZE_DIM2 for (int i = 0; i < blockNum; ++i){ int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT8 ScaleOffset = CONVERT_COMPUTE_FLOAT8(vload8(out_c_idx, dequantScaleOffset + kindex)); + COMPUTE_FLOAT8 ScaleOffset 
= CONVERT_COMPUTE_FLOAT8(vload8(x, dequantScaleOffset + kindex)); for (int j = 0; j < loop_end; ++j) { int k = i * loop + j; - #ifndef WIDTH_HEIGHT_1 - int k4 = k << 2; - #endif - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; - weights2 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 32)) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 48)) * ScaleOffset.s6 + ScaleOffset.s7; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + int k32 = k << 5; + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11, weights20, weights21, weights30, weights31; { - uchar16 charWeightsInt40 = vload16(0, weight + weight_offset + k * weight_oc_offset); - uchar16 charWeightsInt41 = vload16(0, weight + weight_offset + k * weight_oc_offset + 16); - { - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; - } + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn, k))); + uchar16 charWeightsInt41 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 1, k))); + uchar16 charWeightsInt42 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 2, k))); + uchar16 charWeightsInt43 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 3, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); + weights10 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; + weights11 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt42); + weights20 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; + weights21 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s4 + ScaleOffset.s5; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt43); + weights30 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s6 + ScaleOffset.s7; + weights31 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; } + { + COMPUTE_FLOAT16 in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + k32)); + COMPUTE_FLOAT16 in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + k32 + 16)); + DOT16X16(in0, weights00, out0.s0);DOT16X16(in1, weights01, out0.s0); + DOT16X16(in0, weights10, out0.s1);DOT16X16(in1, weights11, out0.s1); + DOT16X16(in0, weights20, out0.s2);DOT16X16(in1, weights21, out0.s2); + DOT16X16(in0, weights30, 
out0.s3);DOT16X16(in1, weights31, out0.s3); + } + #else + COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; + #ifdef USE_IMAGE + weights0 = readWeight(weight, idn, k, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight, idn + 1, k, ScaleOffset.s2, ScaleOffset.s3); + weights2 = readWeight(weight, idn + 2, k, ScaleOffset.s4, ScaleOffset.s5); + weights3 = readWeight(weight, idn + 3, k, ScaleOffset.s6, ScaleOffset.s7); + #else + weights0 = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight + weight_offset + k * weight_oc_offset + WEIGHT_STRIDE, 0, 0, ScaleOffset.s2, ScaleOffset.s3); + weights2 = readWeight(weight + weight_offset + k * weight_oc_offset + 2 * WEIGHT_STRIDE, 0, 0, ScaleOffset.s4, ScaleOffset.s5); + weights3 = readWeight(weight + weight_offset + k * weight_oc_offset + 3 * WEIGHT_STRIDE, 0, 0, ScaleOffset.s6, ScaleOffset.s7); #endif { - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out.s0); - DOT16X16(in, weights1, out.s1); - DOT16X16(in, weights2, out.s2); - DOT16X16(in, weights3, out.s3); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset1)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out1.s0); - DOT16X16(in, weights1, out1.s1); - DOT16X16(in, weights2, out1.s2); - DOT16X16(in, weights3, out1.s3); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset2)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out2.s0); - DOT16X16(in, weights1, out2.s1); - DOT16X16(in, weights2, out2.s2); - DOT16X16(in, weights3, out2.s3); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset3)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out3.s0); - DOT16X16(in, weights1, out3.s1); - DOT16X16(in, weights2, out3.s2); - DOT16X16(in, weights3, out3.s3); + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(k, input)); + 
DOT16X16(in, weights0, out0.s0); + DOT16X16(in, weights1, out0.s1); + DOT16X16(in, weights2, out0.s2); + DOT16X16(in, weights3, out0.s3); } #endif } #ifdef INPUT_CHANNEL_LEAVE { int k = i * loop + loop_end; - int k4 = k << 2; - COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; - weights2 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 32)) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 48)) * ScaleOffset.s6 + ScaleOffset.s7; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + int k8 = k << 3; + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11, weights20, weights21, weights30, weights31; { - uchar16 charWeightsInt40 = vload16(0, weight + weight_offset + k * weight_oc_offset); - uchar16 charWeightsInt41 = vload16(0, weight + weight_offset + k * weight_oc_offset + 16); - { - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); - weights2 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; - weights3 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; - } + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn, k))); + uchar16 charWeightsInt41 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 1, k))); + uchar16 charWeightsInt42 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 2, k))); + uchar16 charWeightsInt43 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 3, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); + weights10 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; + weights11 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt42); + weights20 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s4 + ScaleOffset.s5; + weights21 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s4 + ScaleOffset.s5; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt43); + weights30 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s6 + ScaleOffset.s7; + weights31 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s6 + ScaleOffset.s7; + + PADZEROS(k, srcChannel, weights00);PADZEROS(k + 16, srcChannel, weights01); + PADZEROS(k, srcChannel, weights10);PADZEROS(k + 16, srcChannel, weights11); + PADZEROS(k, srcChannel, weights20);PADZEROS(k + 16, srcChannel, weights21); + PADZEROS(k, srcChannel, weights30);PADZEROS(k + 16, srcChannel, weights31); } + { + COMPUTE_FLOAT16 in0, in1; + in0.s0123 = 
CONVERT_COMPUTE_FLOAT4(vload4(0, input + k8 * 4)); + in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + (k8 + 1) * 4) : (FLOAT4)0); + in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + (k8 + 2) * 4) : (FLOAT4)0); + in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + (k8 + 3) * 4) : (FLOAT4)0); + in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + (k8 + 4) * 4) : (FLOAT4)0); + in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + (k8 + 5) * 4) : (FLOAT4)0); + in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + (k8 + 6) * 4) : (FLOAT4)0); + in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + (k8 + 7) * 4) : (FLOAT4)0); + DOT16X16(in0, weights00, out0.s0);DOT16X16(in1, weights01, out0.s0); + DOT16X16(in0, weights10, out0.s1);DOT16X16(in1, weights11, out0.s1); + DOT16X16(in0, weights20, out0.s2);DOT16X16(in1, weights21, out0.s2); + DOT16X16(in0, weights30, out0.s3);DOT16X16(in1, weights31, out0.s3); + } + #else + int k4 = k << 2; + COMPUTE_FLOAT16 weights0, weights1, weights2, weights3; + #ifdef USE_IMAGE + weights0 = readWeight(weight, idn, k, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight, idn + 1, k, ScaleOffset.s2, ScaleOffset.s3); + weights2 = readWeight(weight, idn + 2, k, ScaleOffset.s4, ScaleOffset.s5); + weights3 = readWeight(weight, idn + 3, k, ScaleOffset.s6, ScaleOffset.s7); + #else + weights0 = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight + weight_offset + k * weight_oc_offset + WEIGHT_STRIDE, 0, 0, ScaleOffset.s2, ScaleOffset.s3); + weights2 = readWeight(weight + weight_offset + k * weight_oc_offset + 2 * WEIGHT_STRIDE, 0, 0, ScaleOffset.s4, ScaleOffset.s5); + weights3 = readWeight(weight + weight_offset + k * weight_oc_offset + 3 * WEIGHT_STRIDE, 0, 0, ScaleOffset.s6, ScaleOffset.s7); #endif PADZEROS(k, srcChannel, weights0); PADZEROS(k, srcChannel, weights1); @@ -227,109 +240,40 @@ __kernel void gemm_conv_c4_buf(GLOBAL_SIZE_DIM2 PADZEROS(k, srcChannel, weights3); { COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out.s0); - DOT16X16(in, weights1, out.s1); - DOT16X16(in, weights2, out.s2); - DOT16X16(in, weights3, out.s3); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? 
vload4(0, input + input_offset1 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out1.s0); - DOT16X16(in, weights1, out1.s1); - DOT16X16(in, weights2, out1.s2); - DOT16X16(in, weights3, out1.s3); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out2.s0); - DOT16X16(in, weights1, out2.s1); - DOT16X16(in, weights2, out2.s2); - DOT16X16(in, weights3, out2.s3); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out3.s0); - DOT16X16(in, weights1, out3.s1); - DOT16X16(in, weights2, out3.s2); - DOT16X16(in, weights3, out3.s3); + in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + k4 * 4)); + in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + (k4 + 1) * 4) : (FLOAT4)0); + in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + (k4 + 2) * 4) : (FLOAT4)0); + in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? 
vload4(0, input + (k4 + 3) * 4) : (FLOAT4)0); + DOT16X16(in, weights0, out0.s0); + DOT16X16(in, weights1, out0.s1); + DOT16X16(in, weights2, out0.s2); + DOT16X16(in, weights3, out0.s3); } #endif } #endif } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT4)0); -#endif - -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - - vstore4(CONVERT_FLOAT4(out), 0, output+out_offset); -#ifdef BACTH_BLOCK4 - if(isValidBatch1){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out1 = fmax(out1, (COMPUTE_FLOAT4)0); -#endif - -#ifdef RELU6 - out1 = clamp(out1, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - - vstore4(CONVERT_FLOAT4(out1), 0, output+out_offset); - } - if(isValidBatch2){ - out_offset += dstChannelC4 * height * width * 4; #ifdef RELU - out2 = fmax(out2, (COMPUTE_FLOAT4)0); + out0 = fmax(out0, (COMPUTE_FLOAT4)0); #endif #ifdef RELU6 - out2 = clamp(out2, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - - vstore4(CONVERT_FLOAT4(out2), 0, output+out_offset); - } - if(isValidBatch3){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out3 = fmax(out3, (COMPUTE_FLOAT4)0); + out0 = clamp(out0, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); #endif -#ifdef RELU6 - out3 = clamp(out3, (COMPUTE_FLOAT4)0, (COMPUTE_FLOAT4)6); -#endif - - vstore4(CONVERT_FLOAT4(out3), 0, output+out_offset); - } -#endif + vstore4(CONVERT_FLOAT4(out0), 0, output+out_offset); } -__kernel void gemm_conv_c2_buf(GLOBAL_SIZE_DIM2 +__kernel void gemv_conv_c2_buf(GLOBAL_SIZE_DIM2 __global const FLOAT* input, +#ifdef USE_IMAGE + __read_only image2d_t weight, +#else #if (defined USE_LOW_BIT_WEIGHT_INT8) __global const char *weight, #elif (defined USE_LOW_BIT_WEIGHT_INT4) __global const uchar *weight, +#endif #endif __global const float *dequantScaleOffset, __global const FLOAT *bias, @@ -337,48 +281,26 @@ __kernel void gemm_conv_c2_buf(GLOBAL_SIZE_DIM2 __private const int dstChannelC4, __private const int srcChannelC4, __private const int srcChannel, - __private const int batch, - __private const int height, - __private const int width, + __private const int bhw, __private const int blockNum, __private const int blockDim) { - const int out_c_w_idx = get_global_id(0); //c/4 w - const int out_b_h_idx = get_global_id(1); //b h - - UNIFORM_BOUNDRY_CHECK(out_c_w_idx, out_b_h_idx); + const int x = get_global_id(0); //c/2 + const int y = get_global_id(1); //b h w - const int out_c_idx = out_c_w_idx / width; - const int out_w_idx = out_c_w_idx % width; -#ifdef BACTH_BLOCK4 - const int out_b_idx = (out_b_h_idx / height) << 2; -#else - const int out_b_idx = out_b_h_idx / height; -#endif - const int out_h_idx = out_b_h_idx % height; - - COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, bias)); - COMPUTE_FLOAT2 out = bias0; -#ifdef BACTH_BLOCK4 - COMPUTE_FLOAT2 out1 = bias0, out2 = bias0, out3 = bias0; - int input_offset1 = (((out_b_idx + 1) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset2 = (((out_b_idx + 2) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset3 = (((out_b_idx + 3) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - bool isValidBatch1 = out_b_idx + 1 < batch; - bool isValidBatch2 = out_b_idx + 2 < batch; - bool isValidBatch3 = out_b_idx + 3 < batch; -#endif - int input_offset = ((out_b_idx * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int out_offset = (((out_b_idx * dstChannelC4 + (out_c_idx * 2) / 4) * height + out_h_idx) * width + out_w_idx) * 4 + ((out_c_idx * 2)%4); 
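+// Layout sketch for the addressing used below (assuming the output keeps the
+// C4 channel packing, i.e. channels grouped 4 per block over the flattened
+// b*h*w dimension): for the channel pair starting at oc = 2 * x,
+//   out_offset = (oc / 4) * bhw * 4 + idm * 4 + (oc % 4)
+// e.g. with bhw = 10 and x = 3 (oc = 6): block 1, lane 2, so
+//   out_offset = (1 * bhw + idm) * 4 + 2.
+// The per-channel dequant parameters follow the same indexing: vload4(x, ...)
+// yields {scale0, offset0, scale1, offset1} for the two output channels,
+// consumed as ScaleOffset.s0/.s1 and ScaleOffset.s2/.s3 respectively.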
- int wh = width * height * 4; -#if (defined USE_LOW_BIT_WEIGHT_INT4) - int weight_offset = out_c_idx * 2 * 8; - int weight_oc_offset = dstChannelC4 * 32; -#else - int weight_offset = out_c_idx * 2 * 16; - int weight_oc_offset = dstChannelC4 * 64; -#endif - - const int loop = (blockDim + 15) / 16; + UNIFORM_BOUNDRY_CHECK(x, y); + + int idn = x << 1; + int idm = y; + COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(x, bias)); + COMPUTE_FLOAT2 out0 = bias0; + int input_offset0 = idm * 4; + int out_offset = ((x * 2) / 4 * bhw + idm) * 4 + ((x * 2) % 4); +#ifndef USE_IMAGE + int weight_offset = x * 2 * WEIGHT_STRIDE; + int weight_oc_offset = dstChannelC4 * 4 * WEIGHT_STRIDE; +#endif + + const int loop = (blockDim + CHANNEL_PACK - 1) / CHANNEL_PACK; #ifdef INPUT_CHANNEL_LEAVE const int loop_end = max(loop - 1, 0); #else @@ -387,137 +309,98 @@ __kernel void gemm_conv_c2_buf(GLOBAL_SIZE_DIM2 for (int i = 0; i < blockNum; ++i){ int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, dequantScaleOffset + kindex)); + COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(x, dequantScaleOffset + kindex)); for (int j = 0; j < loop_end; ++j) { int k = i * loop + j; - #ifndef WIDTH_HEIGHT_1 - int k4 = k << 2; - #endif - COMPUTE_FLOAT16 weights0, weights1; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + int k32 = k << 5; + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11; { - uchar16 charWeightsInt4 = vload16(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn, k))); + uchar16 charWeightsInt41 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 1, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); + weights10 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; + weights11 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; } - #endif { - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out.s0); - DOT16X16(in, weights1, out.s1); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = 
CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset1)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out1.s0); - DOT16X16(in, weights1, out1.s1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset2)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out2.s0); - DOT16X16(in, weights1, out2.s1); + COMPUTE_FLOAT16 in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + k32)); + COMPUTE_FLOAT16 in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + k32 + 16)); + DOT16X16(in0, weights00, out0.s0);DOT16X16(in1, weights01, out0.s0); + DOT16X16(in0, weights10, out0.s1);DOT16X16(in1, weights11, out0.s1); } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset3)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out3.s0); - DOT16X16(in, weights1, out3.s1); + #else + COMPUTE_FLOAT16 weights0, weights1; + #ifdef USE_IMAGE + weights0 = readWeight(weight, idn, k, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight, idn + 1, k, ScaleOffset.s2, ScaleOffset.s3); + #else + weights0 = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight + weight_offset + k * weight_oc_offset + WEIGHT_STRIDE, 0, 0, ScaleOffset.s2, ScaleOffset.s3); + #endif + { + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(k, input)); + DOT16X16(in, weights0, out0.s0); + DOT16X16(in, weights1, out0.s1); } #endif } #ifdef INPUT_CHANNEL_LEAVE { int k = i * loop + loop_end; - int k4 = k << 2; - COMPUTE_FLOAT16 weights0, weights1; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - weights0 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset + 16)) * ScaleOffset.s2 + ScaleOffset.s3; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + int k8 = k << 3; + COMPUTE_FLOAT16 weights00, weights01, weights10, weights11; { - uchar16 charWeightsInt4 = vload16(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + uchar16 charWeightsInt40 = 
as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn, k))); + uchar16 charWeightsInt41 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn + 1, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt41); + weights10 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s2 + ScaleOffset.s3; + weights11 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; + + PADZEROS(k, srcChannel, weights00);PADZEROS(k + 16, srcChannel, weights01); + PADZEROS(k, srcChannel, weights10);PADZEROS(k + 16, srcChannel, weights11); } + { + COMPUTE_FLOAT16 in0, in1; + in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + k8 * 4)); + in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + (k8 + 1) * 4) : (FLOAT4)0); + in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + (k8 + 2) * 4) : (FLOAT4)0); + in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + (k8 + 3) * 4) : (FLOAT4)0); + in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + (k8 + 4) * 4) : (FLOAT4)0); + in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + (k8 + 5) * 4) : (FLOAT4)0); + in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + (k8 + 6) * 4) : (FLOAT4)0); + in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + (k8 + 7) * 4) : (FLOAT4)0); + DOT16X16(in0, weights00, out0.s0);DOT16X16(in1, weights01, out0.s0); + DOT16X16(in0, weights10, out0.s1);DOT16X16(in1, weights11, out0.s1); + } + #else + int k4 = k << 2; + COMPUTE_FLOAT16 weights0, weights1; + #ifdef USE_IMAGE + weights0 = readWeight(weight, idn, k, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight, idn + 1, k, ScaleOffset.s2, ScaleOffset.s3); + #else + weights0 = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); + weights1 = readWeight(weight + weight_offset + k * weight_oc_offset + WEIGHT_STRIDE, 0, 0, ScaleOffset.s2, ScaleOffset.s3); #endif PADZEROS(k, srcChannel, weights0); PADZEROS(k, srcChannel, weights1); { COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out.s0); - DOT16X16(in, weights1, out.s1); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? 
vload4(0, input + input_offset1 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out1.s0); - DOT16X16(in, weights1, out1.s1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out2.s0); - DOT16X16(in, weights1, out2.s1); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out3.s0); - DOT16X16(in, weights1, out3.s1); + in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + k4 * 4)); + in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + (k4 + 1) * 4) : (FLOAT4)0); + in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + (k4 + 2) * 4) : (FLOAT4)0); + in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + (k4 + 3) * 4) : (FLOAT4)0); + DOT16X16(in, weights0, out0.s0); + DOT16X16(in, weights1, out0.s1); } #endif } @@ -525,60 +408,26 @@ __kernel void gemm_conv_c2_buf(GLOBAL_SIZE_DIM2 } #ifdef RELU - out = fmax(out, (COMPUTE_FLOAT2)0); + out0 = fmax(out0, (COMPUTE_FLOAT2)0); #endif #ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); + out0 = clamp(out0, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); #endif - vstore2(CONVERT_FLOAT2(out), 0, output+out_offset); -#ifdef BACTH_BLOCK4 - if(isValidBatch1){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out1 = fmax(out1, (COMPUTE_FLOAT2)0); -#endif - -#ifdef RELU6 - out1 = clamp(out1, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); -#endif - - vstore2(CONVERT_FLOAT2(out1), 0, output+out_offset); - } - if(isValidBatch2){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out2 = fmax(out2, (COMPUTE_FLOAT2)0); -#endif - -#ifdef RELU6 - out2 = clamp(out2, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); -#endif - - vstore2(CONVERT_FLOAT2(out2), 0, output+out_offset); - } - if(isValidBatch3){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out3 = fmax(out3, (COMPUTE_FLOAT2)0); -#endif - -#ifdef RELU6 - out3 = clamp(out3, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); -#endif - - vstore2(CONVERT_FLOAT2(out3), 0, output+out_offset); - } -#endif + vstore2(CONVERT_FLOAT2(out0), 0, output+out_offset); } -__kernel void gemm_conv_c1_buf(GLOBAL_SIZE_DIM2 +__kernel void gemv_conv_c1_buf(GLOBAL_SIZE_DIM2 __global const FLOAT* input, +#ifdef USE_IMAGE + __read_only image2d_t weight, +#else #if (defined USE_LOW_BIT_WEIGHT_INT8) __global const char *weight, #elif (defined USE_LOW_BIT_WEIGHT_INT4) __global const uchar *weight, +#endif #endif __global const float *dequantScaleOffset, __global const FLOAT *bias, @@ -586,50 +435,28 @@ __kernel void gemm_conv_c1_buf(GLOBAL_SIZE_DIM2 __private const int dstChannelC4, __private const int srcChannelC4, 
__private const int srcChannel, - __private const int batch, - __private const int height, - __private const int width, + __private const int bhw, __private const int blockNum, __private const int blockDim) { - const int out_c_w_idx = get_global_id(0); //c/4 w - const int out_b_h_idx = get_global_id(1); //b h + const int x = get_global_id(0); //c + const int y = get_global_id(1); //b h w - UNIFORM_BOUNDRY_CHECK(out_c_w_idx, out_b_h_idx); + UNIFORM_BOUNDRY_CHECK(x, y); + int idn = x; + int idm = y; - const int out_c_idx = out_c_w_idx / width; - const int out_w_idx = out_c_w_idx % width; -#ifdef BACTH_BLOCK4 - const int out_b_idx = (out_b_h_idx / height) << 2; -#else - const int out_b_idx = out_b_h_idx / height; -#endif - const int out_h_idx = out_b_h_idx % height; - - COMPUTE_FLOAT bias0 = bias[out_c_idx]; - COMPUTE_FLOAT out = bias0; + COMPUTE_FLOAT bias0 = bias[x]; + COMPUTE_FLOAT out0 = bias0; -#ifdef BACTH_BLOCK4 - COMPUTE_FLOAT out1 = bias0, out2 = bias0, out3 = bias0; - int input_offset1 = (((out_b_idx + 1) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset2 = (((out_b_idx + 2) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset3 = (((out_b_idx + 3) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - bool isValidBatch1 = out_b_idx + 1 < batch; - bool isValidBatch2 = out_b_idx + 2 < batch; - bool isValidBatch3 = out_b_idx + 3 < batch; -#endif + int input_offset0 = idm * 4; - int input_offset = ((out_b_idx * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int out_offset = (((out_b_idx * dstChannelC4 + out_c_idx/4) * height + out_h_idx) * width + out_w_idx) * 4 + (out_c_idx%4); - int wh = width * height * 4; -#if (defined USE_LOW_BIT_WEIGHT_INT4) - int weight_offset = out_c_idx * 8; - int weight_oc_offset = dstChannelC4 * 32; -#else - int weight_offset = out_c_idx * 16; - int weight_oc_offset = dstChannelC4 * 64; + int out_offset = ((x / 4) * bhw + idm) * 4 + (x % 4); +#ifndef USE_IMAGE + int weight_offset = x * WEIGHT_STRIDE; + int weight_oc_offset = dstChannelC4 * 4 * WEIGHT_STRIDE; #endif - const int loop = (blockDim + 15) / 16; + const int loop = (blockDim + CHANNEL_PACK - 1) / CHANNEL_PACK; #ifdef INPUT_CHANNEL_LEAVE const int loop_end = max(loop - 1, 0); #else @@ -638,633 +465,92 @@ __kernel void gemm_conv_c1_buf(GLOBAL_SIZE_DIM2 for (int i = 0; i < blockNum; ++i){ int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex)); + COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(x, dequantScaleOffset + kindex)); for (int j = 0; j < loop_end; ++j) { int k = i * loop + j; - #ifndef WIDTH_HEIGHT_1 - int k4 = k << 2; - #endif - COMPUTE_FLOAT16 weights; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - weights = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + int k32 = k << 5; + COMPUTE_FLOAT16 weights00, weights01; { - uchar8 charWeightsInt4 = vload8(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + 
weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; } - #endif { - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights, out); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset1)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights, out1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset2)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights, out2); + COMPUTE_FLOAT16 in0 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + k32)); + COMPUTE_FLOAT16 in1 = CONVERT_COMPUTE_FLOAT16(vload16(0, input + k32 + 16)); + DOT16X16(in0, weights00, out0);DOT16X16(in1, weights01, out0); } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(k, input + input_offset3)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights, out3); - } - #endif - } - #ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; - int k4 = k << 2; + #else COMPUTE_FLOAT16 weights; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - weights = CONVERT_COMPUTE_FLOAT16(vload16(0, weight + weight_offset + k * weight_oc_offset)) * ScaleOffset.s0 + ScaleOffset.s1; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - { - uchar8 charWeightsInt4 = vload8(0, weight + weight_offset + k * weight_oc_offset); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; - } - #endif - PADZEROS(k, srcChannel, weights); - { - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? 
vload4(0, input + input_offset + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights, out); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights, out1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights, out2); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights, out3); - } - #endif - } - #endif - } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT)0); -#endif - -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - output[out_offset] = out; -#ifdef BACTH_BLOCK4 - if(isValidBatch1){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out1 = fmax(out1, (COMPUTE_FLOAT)0); -#endif - -#ifdef RELU6 - out1 = clamp(out1, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - - output[out_offset] = out1; - } - if(isValidBatch2){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out2 = fmax(out2, (COMPUTE_FLOAT)0); -#endif - -#ifdef RELU6 - out2 = clamp(out2, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - - output[out_offset] = out2; - } - if(isValidBatch3){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out3 = fmax(out3, (COMPUTE_FLOAT)0); -#endif - -#ifdef RELU6 - out3 = clamp(out3, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - - output[out_offset] = out3; - } -#endif -} -__kernel void gemm_conv_c2_image(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, - __read_only image2d_t weight, - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int srcChannel, - __private const int batch, - __private const int height, - __private const int width, - __private const int blockNum, - __private const int blockDim) { - const int out_c_w_idx = get_global_id(0); //c/4 w - const int out_b_h_idx = get_global_id(1); //b h - UNIFORM_BOUNDRY_CHECK(out_c_w_idx, out_b_h_idx); - - const int out_c_idx = (out_c_w_idx / width) << 1; - const int out_w_idx = out_c_w_idx % width; -#ifdef BACTH_BLOCK4 - const 
int out_b_idx = (out_b_h_idx / height) << 2; -#else - const int out_b_idx = out_b_h_idx / height; -#endif - const int out_h_idx = out_b_h_idx % height; - - COMPUTE_FLOAT2 bias0 = CONVERT_COMPUTE_FLOAT2(vload2(0, bias + out_c_idx)); - COMPUTE_FLOAT2 out = bias0; - -#ifdef BACTH_BLOCK4 - COMPUTE_FLOAT2 out1 = bias0, out2 = bias0, out3 = bias0; - int input_offset1 = (((out_b_idx + 1) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset2 = (((out_b_idx + 2) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset3 = (((out_b_idx + 3) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - bool isValidBatch1 = out_b_idx + 1 < batch; - bool isValidBatch2 = out_b_idx + 2 < batch; - bool isValidBatch3 = out_b_idx + 3 < batch; -#endif - - int input_offset = ((out_b_idx * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int out_offset = (((out_b_idx * dstChannelC4 + out_c_idx/4) * height + out_h_idx) * width + out_w_idx) * 4 + (out_c_idx % 4); - int wh = width * height * 4; - - const int loop = (blockDim + 15) / 16; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - #else - const int loop_end = loop; - #endif - - for (int i = 0; i < blockNum; ++i){ - int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT4 ScaleOffset = CONVERT_COMPUTE_FLOAT4(vload4(0, dequantScaleOffset + out_c_idx * 2 + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - #ifndef WIDTH_HEIGHT_1 - int k4 = k << 2; - #endif - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0, weights1; - { - uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); - UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - } + #ifdef USE_IMAGE + weights = readWeight(weight, idn, k, ScaleOffset.s0, ScaleOffset.s1); + #else + weights = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); #endif { - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out.s0); - DOT16X16(in, weights1, out.s1); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = 
CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out1.s0); - DOT16X16(in, weights1, out1.s1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out2.s0); - DOT16X16(in, weights1, out2.s1); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out3.s0); - DOT16X16(in, weights1, out3.s1); + COMPUTE_FLOAT16 in = CONVERT_COMPUTE_FLOAT16(vload16(k, input)); + DOT16X16(in, weights, out0); } #endif } #ifdef INPUT_CHANNEL_LEAVE { int k = i * loop + loop_end; - int k4 = k << 2; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - COMPUTE_FLOAT16 weights1 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx + 1, k)))) * ScaleOffset.s2 + ScaleOffset.s3; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0, weights1; + #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE) + int k8 = k << 3; + COMPUTE_FLOAT16 weights00, weights01; { - uchar8 charWeightsInt40 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - uchar8 charWeightsInt41 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx + 1, k)))); - char16 charWeights0 = 0; - char16 charWeights1 = 0; - UCHAR8_TO_CHAR16(charWeights0, charWeightsInt40); - UCHAR8_TO_CHAR16(charWeights1, charWeightsInt41); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; - weights1 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s2 + ScaleOffset.s3; - } - #endif - PADZEROS(k, srcChannel, weights0); - PADZEROS(k, srcChannel, weights1); - { - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? 
vload4(0, input + input_offset + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out.s0); - DOT16X16(in, weights1, out.s1); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out1.s0); - DOT16X16(in, weights1, out1.s1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out2.s0); - DOT16X16(in, weights1, out2.s1); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out3.s0); - DOT16X16(in, weights1, out3.s1); - } - #endif - } - #endif - } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT2)0); -#endif -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); -#endif - vstore2(CONVERT_FLOAT2(out), 0, output + out_offset); -#ifdef BACTH_BLOCK4 - if(isValidBatch1){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out1 = fmax(out1, (COMPUTE_FLOAT2)0); -#endif - -#ifdef RELU6 - out1 = clamp(out1, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); -#endif - - vstore2(CONVERT_FLOAT2(out1), 0, output+out_offset); - } - if(isValidBatch2){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out2 = fmax(out2, (COMPUTE_FLOAT2)0); -#endif - -#ifdef RELU6 - out2 = clamp(out2, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); -#endif - - vstore2(CONVERT_FLOAT2(out2), 0, output+out_offset); - } - if(isValidBatch3){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out3 = fmax(out3, (COMPUTE_FLOAT2)0); -#endif - -#ifdef RELU6 - out3 = clamp(out3, (COMPUTE_FLOAT2)0, (COMPUTE_FLOAT2)6); -#endif - - vstore2(CONVERT_FLOAT2(out3), 0, output+out_offset); - } -#endif -} -__kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2 - __global const FLOAT* input, - __read_only image2d_t weight, - __global const float *dequantScaleOffset, - __global const FLOAT *bias, - __global FLOAT* output, - __private const int dstChannelC4, - __private const int srcChannelC4, - __private const int srcChannel, - __private const int batch, - __private const int height, - __private const int width, - __private const int blockNum, - __private const int blockDim) { - const int out_c_w_idx = get_global_id(0); //c/4 w - const int out_b_h_idx = get_global_id(1); //b h - 
UNIFORM_BOUNDRY_CHECK(out_c_w_idx, out_b_h_idx); - - const int out_c_idx = out_c_w_idx / width; - const int out_w_idx = out_c_w_idx % width; -#ifdef BACTH_BLOCK4 - const int out_b_idx = (out_b_h_idx / height) << 2; -#else - const int out_b_idx = out_b_h_idx / height; -#endif - const int out_h_idx = out_b_h_idx % height; - - COMPUTE_FLOAT bias0 = bias[out_c_idx]; - COMPUTE_FLOAT out = bias0; - - int input_offset = ((out_b_idx * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int out_offset = (((out_b_idx * dstChannelC4 + out_c_idx/4)* height + out_h_idx) * width + out_w_idx) * 4 + (out_c_idx%4); - int wh = width * height * 4; -#ifdef BACTH_BLOCK4 - COMPUTE_FLOAT out1 = bias0, out2 = bias0, out3 = bias0; - int input_offset1 = (((out_b_idx + 1) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset2 = (((out_b_idx + 2) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - int input_offset3 = (((out_b_idx + 3) * srcChannelC4 * height + out_h_idx) * width + out_w_idx) * 4; - bool isValidBatch1 = out_b_idx + 1 < batch; - bool isValidBatch2 = out_b_idx + 2 < batch; - bool isValidBatch3 = out_b_idx + 3 < batch; -#endif - - const int loop = (blockDim + 15) / 16; - #ifdef INPUT_CHANNEL_LEAVE - const int loop_end = max(loop - 1, 0); - #else - const int loop_end = loop; - #endif - - for (int i = 0; i < blockNum; ++i){ - int kindex = i * dstChannelC4 * 4 * 2; - COMPUTE_FLOAT2 ScaleOffset = CONVERT_COMPUTE_FLOAT2(vload2(out_c_idx, dequantScaleOffset + kindex)); - for (int j = 0; j < loop_end; j++) { - int k = i * loop + j; - #ifndef WIDTH_HEIGHT_1 - int k4 = k << 2; - #endif - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0; - { - uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; + uchar16 charWeightsInt40 = as_uchar16(read_imagei(weight, SAMPLER, (int2)(idn, k))); + char16 charWeights0, charWeights1; + UCHAR16_TO_2CHAR16(charWeights0, charWeights1, charWeightsInt40); + weights00 = CONVERT_COMPUTE_FLOAT16(charWeights0) * ScaleOffset.s0 + ScaleOffset.s1; + weights01 = CONVERT_COMPUTE_FLOAT16(charWeights1) * ScaleOffset.s0 + ScaleOffset.s1; + + PADZEROS(k, srcChannel, weights00);PADZEROS(k + 16, srcChannel, weights01); } - #endif { - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset1 + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 
+ (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset2 + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out2); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - #ifdef WIDTH_HEIGHT_1 - in = CONVERT_COMPUTE_FLOAT16(vload16(0, input + input_offset3 + k * 16)); - #else - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 1) * wh)); - in.s89ab = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 2) * wh)); - in.scdef = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + (k4 + 3) * wh)); - #endif - DOT16X16(in, weights0, out3); - } - #endif - } - #ifdef INPUT_CHANNEL_LEAVE - { - int k = i * loop + loop_end; + COMPUTE_FLOAT16 in0, in1; + in0.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + k8 * 4)); + in0.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 1 < srcChannelC4 ? vload4(0, input + (k8 + 1) * 4) : (FLOAT4)0); + in0.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 2 < srcChannelC4 ? vload4(0, input + (k8 + 2) * 4) : (FLOAT4)0); + in0.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 3 < srcChannelC4 ? vload4(0, input + (k8 + 3) * 4) : (FLOAT4)0); + in1.s0123 = CONVERT_COMPUTE_FLOAT4(k8 + 4 < srcChannelC4 ? vload4(0, input + (k8 + 4) * 4) : (FLOAT4)0); + in1.s4567 = CONVERT_COMPUTE_FLOAT4(k8 + 5 < srcChannelC4 ? vload4(0, input + (k8 + 5) * 4) : (FLOAT4)0); + in1.s89ab = CONVERT_COMPUTE_FLOAT4(k8 + 6 < srcChannelC4 ? vload4(0, input + (k8 + 6) * 4) : (FLOAT4)0); + in1.scdef = CONVERT_COMPUTE_FLOAT4(k8 + 7 < srcChannelC4 ? vload4(0, input + (k8 + 7) * 4) : (FLOAT4)0); + DOT16X16(in0, weights00, out0);DOT16X16(in1, weights01, out0); + } + #else int k4 = k << 2; - #if (defined USE_LOW_BIT_WEIGHT_INT8) - COMPUTE_FLOAT16 weights0 = CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight, SAMPLER, (int2)(out_c_idx, k)))) * ScaleOffset.s0 + ScaleOffset.s1; - #elif (defined USE_LOW_BIT_WEIGHT_INT4) - COMPUTE_FLOAT16 weights0; - { - uchar8 charWeightsInt4 = as_uchar8(convert_ushort4(read_imageui(weight, SAMPLER, (int2)(out_c_idx, k)))); - char16 charWeights = 0; - UCHAR8_TO_CHAR16(charWeights, charWeightsInt4); - weights0 = CONVERT_COMPUTE_FLOAT16(charWeights) * ScaleOffset.s0 + ScaleOffset.s1; - } + COMPUTE_FLOAT16 weights; + #ifdef USE_IMAGE + weights = readWeight(weight, idn, k, ScaleOffset.s0, ScaleOffset.s1); + #else + weights = readWeight(weight + weight_offset + k * weight_oc_offset, 0, 0, ScaleOffset.s0, ScaleOffset.s1); #endif - PADZEROS(k, srcChannel, weights0); + PADZEROS(k, srcChannel, weights); { - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? 
vload4(0, input + input_offset + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out); - } - #ifdef BACTH_BLOCK4 - if(isValidBatch1){ COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset1 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset1 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out1); - } - if(isValidBatch2){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset2 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset2 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out2); - } - if(isValidBatch3){ - COMPUTE_FLOAT16 in; - in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + input_offset3 + k4 * wh)); - in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 1) * wh) : (FLOAT4)0); - in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 2) * wh) : (FLOAT4)0); - in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? vload4(0, input + input_offset3 + (k4 + 3) * wh) : (FLOAT4)0); - DOT16X16(in, weights0, out3); + in.s0123 = CONVERT_COMPUTE_FLOAT4(vload4(0, input + k4 * 4)); + in.s4567 = CONVERT_COMPUTE_FLOAT4(k4 + 1 < srcChannelC4 ? vload4(0, input + (k4 + 1) * 4) : (FLOAT4)0); + in.s89ab = CONVERT_COMPUTE_FLOAT4(k4 + 2 < srcChannelC4 ? vload4(0, input + (k4 + 2) * 4) : (FLOAT4)0); + in.scdef = CONVERT_COMPUTE_FLOAT4(k4 + 3 < srcChannelC4 ? 
vload4(0, input + (k4 + 3) * 4) : (FLOAT4)0); + DOT16X16(in, weights, out0); } #endif } #endif } - -#ifdef RELU - out = fmax(out, (COMPUTE_FLOAT)0); -#endif -#ifdef RELU6 - out = clamp(out, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - output[out_offset] = out; -#ifdef BACTH_BLOCK4 - if(isValidBatch1){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out1 = fmax(out1, (COMPUTE_FLOAT)0); -#endif - -#ifdef RELU6 - out1 = clamp(out1, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - - output[out_offset] = out1; - } - if(isValidBatch2){ - out_offset += dstChannelC4 * height * width * 4; -#ifdef RELU - out2 = fmax(out2, (COMPUTE_FLOAT)0); -#endif - -#ifdef RELU6 - out1 = clamp(out2, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - - output[out_offset] = out2; - } - if(isValidBatch3){ - out_offset += dstChannelC4 * height * width * 4; + #ifdef RELU - out3 = fmax(out3, (COMPUTE_FLOAT)0); + out0 = fmax(out0, (COMPUTE_FLOAT)0); #endif #ifdef RELU6 - out3 = clamp(out3, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); -#endif - - output[out_offset] = out3; - } + out0 = clamp(out0, (COMPUTE_FLOAT)0, (COMPUTE_FLOAT)6); #endif + output[out_offset] = out0; } - diff --git a/source/backend/opencl/execution/cl/grid_sample_buf.cl b/source/backend/opencl/execution/cl/grid_sample_buf.cl index 391a88163..758cb2295 100644 --- a/source/backend/opencl/execution/cl/grid_sample_buf.cl +++ b/source/backend/opencl/execution/cl/grid_sample_buf.cl @@ -61,7 +61,7 @@ __kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, __private const int input_width, __private const int output_height, __private const int output_width, - __private const int channelBlocks, + __private const int batch, __private const enum BorderMode paddingMode, __private const int alignCorners){ @@ -88,19 +88,13 @@ __kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, (xn,xn,xn,xn) (y5,y6,y7,y8) --------------------------- */ - const int slice = output_height_idx / 4; - const int slice_blocks = (output_height + 3) / 4; // output_width_block_idx means gird y offset, 2 means grid width - const int grid_offset = ((output_batch_idx * slice_blocks + slice) * output_width + output_width_block_idx) * 2; - COMPUTE_FLOAT4 grid_x = CONVERT_COMPUTE_FLOAT4(vload4(grid_offset, grid)); - COMPUTE_FLOAT4 grid_y = CONVERT_COMPUTE_FLOAT4(vload4(grid_offset + 1, grid)); + const int grid_offset = (output_batch_idx * output_height + output_height_idx) * output_width + output_width_block_idx; + COMPUTE_FLOAT2 grid_xy = CONVERT_COMPUTE_FLOAT2(vload2(grid_offset, grid)); - const float arr[8] = {grid_x.x, grid_y.x, grid_x.y, grid_y.y, grid_x.z, grid_y.z, grid_x.w, grid_y.w}; - // get grid x,y - const int arr_offset = output_height_idx % 4; - const float x = arr[2 * arr_offset]; - const float y = arr[2 * arr_offset + 1]; + const float x = (float)grid_xy.x; + const float y = (float)grid_xy.y; // convert grid x,y to input x,y coordinate range float in_grid_x = getPosition(x, input_width, alignCorners); @@ -110,10 +104,10 @@ __kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, int nw = floor(in_grid_x + 0.5f); int nh = floor(in_grid_y + 0.5f); - const int inp_offset_base = (output_batch_idx * channelBlocks + output_channel_block_idx) * input_height; + const int inp_offset_base = (output_batch_idx + output_channel_block_idx * batch) * input_height; COMPUTE_FLOAT4 value = sample(nh, nw, inp_offset_base, input, input_height, input_width, paddingMode); - const int output_offset = ((output_batch_idx * channelBlocks + 
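
Editor's note on the gemv/conv1x1 dequant hunks above: the rewritten kernel unpacks two signed 4-bit weights per byte (UCHAR16_TO_2CHAR16), applies the per-block scale/offset, and feeds the result to the DOT16X16 accumulation. The sketch below is a minimal host-side illustration of that unpack-and-dequantize step only; the helper name dequant_int4_pair, the high-nibble-first packing, and the bias-by-8 signed mapping are assumptions for illustration, not the actual macro definitions.

    #include <stdint.h>
    #include <stdio.h>

    /* Unpack one byte holding two int4 weights (high nibble first, assumed),
     * then dequantize each as w * scale + offset, mirroring the
     * "weights = CONVERT(...) * ScaleOffset.s0 + ScaleOffset.s1" pattern above. */
    static void dequant_int4_pair(uint8_t packed, float scale, float offset, float out[2]) {
        int hi = (int)(packed >> 4);     /* 0..15 */
        int lo = (int)(packed & 0x0F);   /* 0..15 */
        hi -= 8;                         /* assumed signed mapping stored biased by 8 */
        lo -= 8;
        out[0] = (float)hi * scale + offset;
        out[1] = (float)lo * scale + offset;
    }

    int main(void) {
        float w[2];
        dequant_int4_pair(0xA3u, 0.05f, 0.0f, w);   /* hypothetical packed byte */
        printf("%f %f\n", w[0], w[1]);
        return 0;
    }
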
output_channel_block_idx ) * output_height + output_height_idx) * output_width + output_width_block_idx; + const int output_offset = ((output_batch_idx + output_channel_block_idx * batch) * output_height + output_height_idx) * output_width + output_width_block_idx; vstore4(CONVERT_FLOAT4(value), output_offset, output); } @@ -124,7 +118,7 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, __private const int input_width, __private const int output_height, __private const int output_width, - __private const int channelBlocks, + __private const int batch, __private const enum BorderMode paddingMode, __private const int alignCorners){ @@ -137,19 +131,14 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, const int output_batch_idx = output_batch_height_block_idx / output_height; const int output_height_idx = output_batch_height_block_idx % output_height; - const int slice = output_height_idx / 4; - const int slice_blocks = (output_height + 3) / 4; // output_width_block_idx means gird y offset, 2 means grid width - const int grid_offset = ((output_batch_idx * slice_blocks + slice) * output_width + output_width_block_idx) * 2; - COMPUTE_FLOAT4 grid_x = CONVERT_COMPUTE_FLOAT4(vload4(grid_offset, grid)); - COMPUTE_FLOAT4 grid_y = CONVERT_COMPUTE_FLOAT4(vload4(grid_offset + 1, grid)); + const int grid_offset = (output_batch_idx * output_height + output_height_idx) * output_width + output_width_block_idx; + COMPUTE_FLOAT2 grid_xy = CONVERT_COMPUTE_FLOAT2(vload2(grid_offset, grid)); - const float arr[8] = {grid_x.x, grid_y.x, grid_x.y, grid_y.y, grid_x.z, grid_y.z, grid_x.w, grid_y.w}; // get grid x,y - const int arr_offset = output_height_idx % 4; - const float x = arr[2 * arr_offset]; - const float y = arr[2 * arr_offset + 1]; + const float x = (float)grid_xy.x; + const float y = (float)grid_xy.y; // convert grid x,y to input x,y coordinate range float in_grid_x = getPosition(x, input_width, alignCorners); @@ -164,7 +153,7 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, float y_weight = in_h1 - in_grid_y; // bilinear interpolation - const int inp_offset_base = (output_batch_idx * channelBlocks + output_channel_block_idx) * input_height; + const int inp_offset_base = (output_batch_idx + output_channel_block_idx * batch) * input_height; COMPUTE_FLOAT4 i00 = sample(in_h0, in_w0, inp_offset_base, input, input_height, input_width, paddingMode); COMPUTE_FLOAT4 i01 = sample(in_h0, in_w1, inp_offset_base, input, input_height, input_width, paddingMode); COMPUTE_FLOAT4 i10 = sample(in_h1, in_w0, inp_offset_base, input, input_height, input_width, paddingMode); @@ -173,6 +162,6 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, COMPUTE_FLOAT4 value = CONVERT_COMPUTE_FLOAT4(((COMPUTE_FLOAT4)x_weight * CONVERT_COMPUTE_FLOAT4(i00) + (COMPUTE_FLOAT4)(1.0f - x_weight) * CONVERT_COMPUTE_FLOAT4(i01)) * (COMPUTE_FLOAT4)y_weight + ((COMPUTE_FLOAT4)x_weight * CONVERT_COMPUTE_FLOAT4(i10) + (COMPUTE_FLOAT4)(1.0f - x_weight) * CONVERT_COMPUTE_FLOAT4(i11)) * (COMPUTE_FLOAT4)(1.0f- y_weight)); - const int output_offset = ((output_batch_idx * channelBlocks + output_channel_block_idx ) * output_height + output_height_idx) * output_width + output_width_block_idx; + const int output_offset = ((output_batch_idx + output_channel_block_idx * batch) * output_height + output_height_idx) * output_width + output_width_block_idx; vstore4(CONVERT_FLOAT4(value), output_offset, output); } diff --git 
a/source/backend/opencl/execution/cl/input_transe_buf.cl b/source/backend/opencl/execution/cl/input_transe_buf.cl index 352a7b1ab..1b86b6e7d 100644 --- a/source/backend/opencl/execution/cl/input_transe_buf.cl +++ b/source/backend/opencl/execution/cl/input_transe_buf.cl @@ -12,6 +12,7 @@ __kernel void conv_transe_c4_c1( __private const int input_width, __private const int input_height, __private const int input_channel, + __private const int batch, __private const int channel_blocks, __private const int input_pad_left, __private const int input_pad_right) @@ -29,10 +30,10 @@ __kernel void conv_transe_c4_c1( const uint input_x_pitch = 4; const uint input_y_pitch = input_x_pitch * input_width; const uint input_f_pitch = input_y_pitch * input_height; - const uint input_b_pitch = input_f_pitch * channel_blocks; + const uint input_b_pitch = input_f_pitch * batch; - const uint input_offset = b * input_b_pitch + - c * input_f_pitch + + const uint input_offset = b * input_f_pitch + + c * input_b_pitch + h * input_y_pitch + w * input_x_pitch; @@ -63,6 +64,7 @@ __kernel void conv_transe_c4_c16( int input_width, int input_height, int input_channel, + int batch, int channel_blocks, int input_pad_left, int input_pad_right) @@ -80,10 +82,10 @@ __kernel void conv_transe_c4_c16( const uint input_x_pitch = 4; const uint input_y_pitch = input_x_pitch * input_width; const uint input_f_pitch = input_y_pitch * input_height; - const uint input_b_pitch = input_f_pitch * channel_blocks; + const uint input_b_pitch = input_f_pitch * batch; - const uint input_offset = b * input_b_pitch + - c * input_f_pitch + + const uint input_offset = b * input_f_pitch + + c * input_b_pitch + h * input_y_pitch + w * input_x_pitch; @@ -110,4 +112,4 @@ __kernel void conv_transe_c4_c16( vstore4((FLOAT4)0, 0, output + pad_offset + i * output_x_pitch); } } -} \ No newline at end of file +} diff --git a/source/backend/opencl/execution/cl/interp_buf.cl b/source/backend/opencl/execution/cl/interp_buf.cl index 464997c15..99bcea8db 100644 --- a/source/backend/opencl/execution/cl/interp_buf.cl +++ b/source/backend/opencl/execution/cl/interp_buf.cl @@ -20,7 +20,7 @@ __kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, __private const int input_width, __private const int out_height, __private const int out_width, - __private const int channelBlocks) { + __private const int batch) { const int output_channel_block_idx = get_global_id(0); const int output_width_block_idx = get_global_id(1); const int output_batch_height_block_idx = get_global_id(2); @@ -40,10 +40,10 @@ __kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, const int in_w_index = min(max(0, (int)floor(in_w_idx)), input_width-1); #endif - const int inp_offset = ((output_batch_idx * channelBlocks + output_channel_block_idx) * input_height + in_h_index) * input_width + in_w_index; + const int inp_offset = ((output_batch_idx + output_channel_block_idx*batch) * input_height + in_h_index) * input_width + in_w_index; FLOAT4 value = vload4(inp_offset, input); - const int out_offset = ((output_batch_idx * channelBlocks + output_channel_block_idx) * out_height + output_height_idx) * out_width + output_width_block_idx; + const int out_offset = ((output_batch_idx + output_channel_block_idx*batch) * out_height + output_height_idx) * out_width + output_width_block_idx; vstore4(value, out_offset, output); } @@ -57,7 +57,7 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, __private const int input_width, __private const int 
out_height, __private const int out_width, - __private const int channelBlocks) { + __private const int batch) { const int output_channel_block_idx = get_global_id(0); const int output_width_block_idx = get_global_id(1); const int output_batch_height_block_idx = get_global_id(2); @@ -77,7 +77,7 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, float factor_w = (in_w_idx - (int)floor(in_w_idx)); float factor_h = (in_h_idx - (int)floor(in_h_idx)); - const int inp_offset_base = (output_batch_idx * channelBlocks + output_channel_block_idx) * input_height; + const int inp_offset_base = (output_batch_idx + output_channel_block_idx*batch) * input_height; const int inp_offset_00 = (inp_offset_base + in_h0_index) * input_width + in_w0_index; const int inp_offset_01 = (inp_offset_base + in_h0_index) * input_width + in_w1_index; const int inp_offset_10 = (inp_offset_base + in_h1_index) * input_width + in_w0_index; @@ -90,7 +90,7 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, FLOAT4 value = CONVERT_FLOAT4((float4)((1.0-factor_w)*(1.0-factor_h))*convert_float4(value_00) + (float4)(factor_w*(1.0-factor_h))*convert_float4(value_01) + (float4)((1.0-factor_w)*factor_h)*convert_float4(value_10) + (float4)(factor_w*factor_h)*convert_float4(value_11)); - const int out_offset = ((output_batch_idx * channelBlocks + output_channel_block_idx) * out_height + output_height_idx) * out_width + output_width_block_idx; + const int out_offset = ((output_batch_idx + output_channel_block_idx*batch) * out_height + output_height_idx) * out_width + output_width_block_idx; vstore4(value, out_offset, output); } @@ -109,7 +109,7 @@ __kernel void nearest3D_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, __private const int out_depth, __private const int out_height, __private const int out_width, - __private const int channelBlocks) { + __private const int batch) { const int output_channel_block_idx = get_global_id(0); const int output_height_width_block_idx = get_global_id(1); const int output_batch_depth_block_idx = get_global_id(2); @@ -129,11 +129,11 @@ __kernel void nearest3D_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, const int in_h_index = min(max(0, (int)floor(in_h_idx)), input_height-1); const int in_w_index = min(max(0, (int)floor(in_w_idx)), input_width-1); - const int inp_offset = (((output_batch_idx * channelBlocks + output_channel_block_idx) + const int inp_offset = (((output_batch_idx + output_channel_block_idx*batch) * input_depth + in_d_index) * input_height + in_h_index) * input_width + in_w_index; - const int out_offset = (((output_batch_idx * channelBlocks + output_channel_block_idx) + const int out_offset = (((output_batch_idx + output_channel_block_idx*batch) * out_depth + output_depth_idx) * out_height + output_height_idx) * out_width + output_width_idx; FLOAT4 value = vload4(inp_offset, input); vstore4(value, out_offset, output); -} \ No newline at end of file +} diff --git a/source/backend/opencl/execution/cl/layernorm_buf.cl b/source/backend/opencl/execution/cl/layernorm_buf.cl index 3ee18e085..eab7caf8f 100644 --- a/source/backend/opencl/execution/cl/layernorm_buf.cl +++ b/source/backend/opencl/execution/cl/layernorm_buf.cl @@ -2,274 +2,48 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif -__kernel void layernorm_w_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, - __global const FLOAT * input, - __global FLOAT * output, - __private const int width, - __private const int height, - __private 
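
Editor's note on the grid_sample_buf.cl, input_transe_buf.cl, and interp_buf.cl hunks above: the per-element offset changes from the batch-major form (b * channelBlocks + c4) * H * W to (b + c4 * batch) * H * W, i.e. the C4 dimension becomes outermost so each channel-block slice is contiguous across the batch. A minimal sketch of the two offset formulas, with hypothetical helper names:

    #include <stdio.h>

    /* Offset (in float4 units) of element (b, c4, h, w); channelBlocks = (C + 3) / 4. */
    static int offset_batch_major(int b, int c4, int h, int w,
                                  int channelBlocks, int H, int W) {
        return ((b * channelBlocks + c4) * H + h) * W + w;   /* old layout in the hunks */
    }

    static int offset_channelblock_major(int b, int c4, int h, int w,
                                         int batch, int H, int W) {
        return ((b + c4 * batch) * H + h) * W + w;           /* new layout in the hunks */
    }

    int main(void) {
        /* same logical element, two different physical positions */
        printf("%d %d\n",
               offset_batch_major(1, 2, 0, 3, 4, 8, 8),
               offset_channelblock_major(1, 2, 0, 3, 2, 8, 8));
        return 0;
    }
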
const int channel, -#ifdef GAMMA_BETA - __global const FLOAT *gamma, - __global const FLOAT *beta, -#endif - __private float epsilon){ - int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); - float4 local sum[LOCAL_SIZE]; - if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) { - const int h = pos.y % height; - const int c = pos.y / height; - const int b = pos.z; - const int lid = get_local_id(0); - const int channel4 = (channel + 3) / 4; - const int offset = ((b * channel4 + c) * height + h) * width * 4; - - float4 in_sum = 0; -#ifdef RMSNORM - float4 mean = 0; -#else - for(int i = lid; i < width; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset)); - in_sum += in; - } - sum[lid] = in_sum; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = sum[lid] + sum[lid + i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - - float4 mean = sum[0] / (float4)width; -#endif - in_sum = 0; - for(int i = lid; i < width; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset)); - in_sum += (in - mean) * (in - mean); - } - sum[lid] = in_sum; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = sum[lid] + sum[lid + i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - float4 square_sum = sum[0] / (float4)width; - float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon); - for(int i = lid; i < width; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset)); -#ifdef GAMMA_BETA - float4 out = (in - mean) * value * (float4)gamma[i] + (float4)beta[i]; -#else - float4 out = (in - mean) * value; -#endif - vstore4(CONVERT_FLOAT4(out), i, output + offset); - } - } -} - - -__kernel void layernorm_hw_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, - __global const FLOAT * input, - __global FLOAT * output, - __private const int width, - __private const int height, - __private const int channel, -#ifdef GAMMA_BETA - __global const FLOAT *gamma, - __global const FLOAT *beta, -#endif - __private float epsilon){ - int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); - float4 local sum[LOCAL_SIZE]; - if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) { - const int c = pos.y; - const int b = pos.z; - const int height_width = height * width; - const int channel4 = (channel + 3) / 4; - const int lid = get_local_id(0); - const int offset = ((b * channel4 + c) * height) * width * 4; - - float4 in_sum = 0; -#ifdef RMSNORM - float4 mean = 0; -#else - for(int i = lid; i < height_width; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset)); - in_sum += in; - } - sum[lid] = in_sum; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = sum[lid] + sum[lid + i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - - float4 mean = sum[0] / (float4)height_width; -#endif - in_sum = 0; - for(int i = lid; i < height_width; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset)); - in_sum += (in - mean) * (in - mean); - } - sum[lid] = in_sum; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = sum[lid] + sum[lid + i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - float4 square_sum = sum[0] / (float4)height_width; - float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon); - for(int i = lid; i < height_width; i+=LOCAL_SIZE){ - float4 in = 
convert_float4(vload4(i, input + offset)); -#ifdef GAMMA_BETA - float4 out = (in - mean) * value * (float4)gamma[i] + (float4)beta[i]; -#else - float4 out = (in - mean) * value; -#endif - vstore4(CONVERT_FLOAT4(out), i, output + offset); - } - } -} - -__kernel void layernorm_chw_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, - __global const FLOAT * input, - __global FLOAT * output, - __private const int width, - __private const int height, - __private const int channel, -#ifdef GAMMA_BETA - __global const FLOAT *gamma, - __global const FLOAT *beta, -#endif - __private float epsilon){ - int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); - float local sum[LOCAL_SIZE]; - if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) { - const int b = pos.z; - const int sum_size = width * height * channel; - const int reduce_size = width * height; - const int lid = get_local_id(0); - const int channel4 = (channel + 3) / 4; - const int channel_remain = channel - (channel4 - 1) * 4; - const int offset = ((b * channel4) * height) * width * 4; - const int wh_offset = height * width * 4; - - float4 in_sum = 0; - float4 in_sum_left = 0; - float *in_sum_left_ptr = (float*)(&in_sum_left); -#ifdef RMSNORM - float4 mean = 0; -#else - for(int c = 0; c < channel4 - 1; ++c){ - for(int i = lid; i < reduce_size; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset + c * wh_offset)); - in_sum += in; - } - } - for(int i = lid; i < reduce_size; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset + (channel4 - 1) * wh_offset)); - in_sum_left += in; - } - in_sum.x = in_sum.x + in_sum.y + in_sum.z + in_sum.w; - for(int i = 1; i < channel_remain; ++i){ - in_sum_left_ptr[0] += in_sum_left_ptr[i]; - } - sum[lid] = in_sum.x + in_sum_left.x; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = sum[lid] + sum[lid + i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - - float4 mean = sum[0] / (float4)sum_size; -#endif - in_sum = 0; - in_sum_left = 0; - for(int c = 0; c < channel4 - 1; ++c){ - for(int i = lid; i < reduce_size; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset + c * wh_offset)); - in_sum += (in - mean) * (in - mean); - } - } - - for(int i = lid; i < reduce_size; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset + (channel4 - 1) * wh_offset)); - in_sum_left += (in - mean) * (in - mean); - } - - in_sum.x = in_sum.x + in_sum.y + in_sum.z + in_sum.w; - for(int i = 1; i < channel_remain; ++i){ - in_sum_left_ptr[0] += in_sum_left_ptr[i]; - } - - sum[lid] = in_sum.x + in_sum_left.x; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = sum[lid] + sum[lid + i]; - barrier(CLK_LOCAL_MEM_FENCE); - } - float4 square_sum = sum[0] / (float4)sum_size; - float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon); - for(int c = 0; c < channel4; ++c){ - for(int i = lid; i < reduce_size; i+=LOCAL_SIZE){ - float4 in = convert_float4(vload4(i, input + offset + c * wh_offset)); -#ifdef GAMMA_BETA - float4 out = (in - mean) * value * (float4)gamma[c * reduce_size + i] + (float4)beta[c * reduce_size + i]; -#else - float4 out = (in - mean) * value; -#endif - vstore4(CONVERT_FLOAT4(out), i, output + offset + c * wh_offset); - } - } - } -} - - -__kernel void layernorm_plain_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, +__kernel void 
layernorm_buf(__private int global_dim0, __private int global_dim1,
                        __global const FLOAT * input,
                        __global FLOAT * output,
                        __private const int inside,
-                        __private const int outside,
 #ifdef GAMMA_BETA
                        __global const FLOAT *gamma,
                        __global const FLOAT *beta,
 #endif
                        __private float epsilon){
-    int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
-    COMPUTE_FLOAT local sum[LOCAL_SIZE];
-    if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) {
-        const int idx_out = pos.z;
+    int2 pos = (int2)(get_global_id(0), get_global_id(1));
+#if LOCAL_SIZE > 1
+    float local sum[LOCAL_SIZE];
+    if (pos.x < global_dim0 && pos.y < global_dim1) {
         const int lid = get_local_id(0);
-        const int offset = idx_out * inside;
+        const int offset = pos.y * inside;
         const int inside_v4 = (inside + 3) >> 2;
+        #ifdef PACK_LEAVE
+        const int loop = inside_v4 - 1;
         const int inside_remain = inside - ((inside_v4-1) << 2);
-
-        COMPUTE_FLOAT4 in_sum = 0;
+        #else
+        const int loop = inside_v4;
+        #endif
+
+        float4 in_sum = 0;
         int index = lid;
-        for(; index < inside_v4 - 1; index+=LOCAL_SIZE){
-            COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(index, input + offset));
+        #ifdef RMSNORM
+        float4 mean = (float4)0;
+        #else
+        for(; index < loop; index+=LOCAL_SIZE){
+            float4 in = convert_float4(vload4(index, input + offset));
             in_sum += in;
         }
         sum[lid] = in_sum.x + in_sum.y + in_sum.z+ in_sum.w;
-        COMPUTE_FLOAT4 in_left = 0;
+        #ifdef PACK_LEAVE
         if(index == inside_v4 - 1) {
-            in_left = CONVERT_COMPUTE_FLOAT4(vload4(inside_v4 - 1, input + offset));
-            sum[lid] = sum[lid] + in_left.x;
-            if(inside_remain > 1) {
-                sum[lid] = sum[lid] + in_left.y;
-            }
-            if(inside_remain > 2) {
-                sum[lid] = sum[lid] + in_left.z;
-            }
-            if(inside_remain > 3) {
-                sum[lid] = sum[lid] + in_left.w;
+            for(int i = 0; i < inside_remain; ++i){
+                float in = input[offset + index * 4 + i];
+                sum[lid] = sum[lid] + in;
             }
         }
+        #endif
         barrier(CLK_LOCAL_MEM_FENCE);
         for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
@@ -278,47 +52,87 @@ __kernel void layernorm_plain_buf(__private int global_dim0, __private int globa
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-        COMPUTE_FLOAT4 mean = sum[0] / (COMPUTE_FLOAT4)inside;
+        float4 mean = sum[0] / (float4)inside;
+        #endif
         in_sum = 0;
         index = lid;
-        for(; index < inside_v4 - 1; index+=LOCAL_SIZE){
-            COMPUTE_FLOAT4 in = CONVERT_COMPUTE_FLOAT4(vload4(index, input + offset));
+        for(; index < loop; index+=LOCAL_SIZE){
+            float4 in = convert_float4(vload4(index, input + offset));
            in_sum += (in - mean) * (in - mean);
         }
         sum[lid] = in_sum.x + in_sum.y + in_sum.z + in_sum.w;
-
+        #ifdef PACK_LEAVE
         if(index == inside_v4 - 1) {
-            COMPUTE_FLOAT4 in_left = CONVERT_COMPUTE_FLOAT4(vload4(inside_v4 - 1, input + offset));
-            in_sum = (in_left - mean) * (in_left - mean);
-            sum[lid] = sum[lid] + in_sum.x;
-            if(inside_remain > 1) {
-                sum[lid] = sum[lid] + in_sum.y;
-            }
-            if(inside_remain > 2) {
-                sum[lid] = sum[lid] + in_sum.z;
-            }
-            if(inside_remain > 3) {
-                sum[lid] = sum[lid] + in_sum.w;
+            for(int i = 0; i < inside_remain; ++i){
+                float in = input[offset + index * 4 + i];
+                in = (in - mean.x) * (in - mean.x);
+                sum[lid] = sum[lid] + in;
             }
         }
+        #endif
         barrier(CLK_LOCAL_MEM_FENCE);
         for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
             if (lid < i)
                 sum[lid] = sum[lid] + sum[lid + i];
             barrier(CLK_LOCAL_MEM_FENCE);
         }
-        COMPUTE_FLOAT4 square_sum = sum[0] / (COMPUTE_FLOAT4)inside;
-        COMPUTE_FLOAT4 value = (COMPUTE_FLOAT4)1.0f / (COMPUTE_FLOAT4)sqrt(square_sum + (COMPUTE_FLOAT4)epsilon);
-
-        for(int i = lid; i < inside_v4; i+=LOCAL_SIZE){
-            COMPUTE_FLOAT4 in = 
CONVERT_COMPUTE_FLOAT4(vload4(i, input + offset)); -#ifdef GAMMA_BETA - COMPUTE_FLOAT4 out = (in - mean) * value * CONVERT_COMPUTE_FLOAT4(vload4(i, gamma)) + CONVERT_COMPUTE_FLOAT4(vload4(i, beta)); + float4 square_sum = sum[0] / (float4)inside; + float4 value = (float4)1.0f / (float4)sqrt(square_sum + (float4)epsilon); + index = lid; + for(; index < loop; index+=LOCAL_SIZE){ + float4 in = convert_float4(vload4(index, input + offset)); + #ifdef GAMMA_BETA + float4 out = (in - mean) * value * convert_float4(vload4(index, gamma)) + convert_float4(vload4(index, beta)); + #else + float4 out = (in - mean) * value; + #endif + vstore4(CONVERT_FLOAT4(out), index, output + offset); + } + #ifdef PACK_LEAVE + if(index == inside_v4 - 1) { + for(int i = 0; i < inside_remain; ++i){ + float in = input[offset + index * 4 + i]; + #ifdef GAMMA_BETA + float out = (in - mean.x) * value.x * (float)gamma[index * 4 + i] + (float)beta[index * 4 + i]; + #else + float out = (in - mean.x) * value.x; + #endif + output[offset + index * 4 + i] = out; + } + } + #endif + } #else - COMPUTE_FLOAT4 out = (in - mean) * value; -#endif - vstore4(CONVERT_FLOAT4(out), i, output + offset); + if (pos.x < global_dim0 && pos.y < global_dim1) { + const int offset = pos.y * inside; + #ifdef RMSNORM + float mean = 0; + #else + float in_sum = 0; + for(int index = 0; index < inside; index++){ + in_sum += (float)input[offset + index]; + } + float mean = in_sum / inside; + #endif + + in_sum = 0; + for(int index = 0; index < inside; index++){ + float in = (float)input[offset + index]; + in_sum += (in - mean) * (in - mean); + } + float square_sum = in_sum / inside; + float value = 1.0f / sqrt(square_sum + epsilon); + for(int i = 0; i < inside; ++i){ + float in = input[offset + i]; + #ifdef GAMMA_BETA + float out = (in - mean) * value * (float)gamma[i] + (float)beta[i]; + #else + float out = (in - mean) * value; + #endif + output[offset + i] = out; } } + +#endif } diff --git a/source/backend/opencl/execution/cl/loop_buf.cl b/source/backend/opencl/execution/cl/loop_buf.cl index de7cb5725..c1a0521dc 100644 --- a/source/backend/opencl/execution/cl/loop_buf.cl +++ b/source/backend/opencl/execution/cl/loop_buf.cl @@ -21,7 +21,7 @@ #define TSH 8 // thread handle size H dimension #endif -// [N C4 H 1 4] -> [N H C 1] +// [C4 N H 1 4] -> [N H C 1] __kernel void tile_trans_3d_buf(__global INPUT_TYPE* input, __global OUTPUT_TYPE* output, __private const int widthPad, @@ -39,7 +39,6 @@ __kernel void tile_trans_3d_buf(__global INPUT_TYPE* input, // group id const int c = get_group_id(0) * WGSC; const int h = get_group_id(1) * WGSH; - const int channel_4 = (channel + 3) >> 2; int jc = lidc; int ih = lidh; @@ -53,7 +52,7 @@ __kernel void tile_trans_3d_buf(__global INPUT_TYPE* input, int offset_h = i * WGSH / TSH + ih; int offset_c = j * WGSC / TSC + jc ; // [TSH, WGSH / TSH] [TSC / 4, WGSC / TSC, 4] - localData[offset_h][offset_c] = (h + offset_h >= height || c + 4 * offset_c >= channel) ? (INPUT_TYPE4)0 : vload4(0, input + ((b * channel_4 + (c/4+offset_c)) * height + (h+offset_h)) * 4); + localData[offset_h][offset_c] = (h + offset_h >= height || c + 4 * offset_c >= channel) ? 
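
Editor's note on the consolidated layernorm_buf kernel above: each work-group reduces the `inside` elements of one row twice (sum for the mean, then sum of squared deviations) using a strided local-memory tree reduction, and falls back to a plain scalar loop when LOCAL_SIZE is 1. The sketch below replays the same tree-reduction pattern on the host, with a plain array standing in for the `sum[LOCAL_SIZE]` local buffer; LOCAL_SIZE is assumed to be a power of two, as the halving loop in the kernel requires.

    #include <stdio.h>

    #define LOCAL_SIZE 8   /* must be a power of two for the halving loop */

    /* Tree reduction over per-lane partial sums, mirroring the
     * "for(int i = LOCAL_SIZE/2; i > 0; i /= 2)" loop in the kernel
     * (barriers are omitted because this host sketch is single-threaded). */
    static float reduce_sum(float sum[LOCAL_SIZE]) {
        for (int i = LOCAL_SIZE / 2; i > 0; i /= 2) {
            for (int lid = 0; lid < i; ++lid) {
                sum[lid] = sum[lid] + sum[lid + i];
            }
        }
        return sum[0];
    }

    int main(void) {
        float partial[LOCAL_SIZE] = {1, 2, 3, 4, 5, 6, 7, 8};
        printf("%f\n", reduce_sum(partial));   /* 36 */
        return 0;
    }
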
(INPUT_TYPE4)0 : vload4(0, input + ((b + (c/4+offset_c)*batch) * height + (h+offset_h)) * 4); } } @@ -78,7 +77,7 @@ __kernel void tile_trans_3d_buf(__global INPUT_TYPE* input, } } } -// [N C4 H W 4] -> [N C W H] +// [C4 N H W 4] -> [N C W H] __kernel void tile_trans_4d_buf(__global INPUT_TYPE* input, __global OUTPUT_TYPE* output, __private const int widthPad, @@ -99,7 +98,6 @@ __kernel void tile_trans_4d_buf(__global INPUT_TYPE* input, // group id const int w = get_group_id(0) * WGSW; const int h = get_group_id(1) * WGSH; - const int channel_4 = (channel + 3) >> 2; int jw = lidw; int ih = lidh; @@ -112,7 +110,7 @@ __kernel void tile_trans_4d_buf(__global INPUT_TYPE* input, for(int j = 0; j < TSW; j++) { int offset_h = h + ih + i * WGSH/TSH; int offset_w = w + jw + j * WGSW/TSW; - localData[ih + i * WGSH / TSH][jw + j * WGSW/TSW] = (offset_h >= height || offset_w >= width) ? (INPUT_TYPE4)0 : vload4(0, input + (((b * channel_4 + c4) * height + offset_h) * width + offset_w) * 4); + localData[ih + i * WGSH / TSH][jw + j * WGSW/TSW] = (offset_h >= height || offset_w >= width) ? (INPUT_TYPE4)0 : vload4(0, input + (((b + c4 * batch) * height + offset_h) * width + offset_w) * 4); } } @@ -234,8 +232,8 @@ __kernel void tile_buf(__private int global_dim0, __private int global_dim1, __p const int c = c_4 << 2; const int x_src_pitch = 4; const int y_src_pitch = x_src_pitch * width; - const int c_src_pitch = y_src_pitch * height; - const int b_src_pitch = c_src_pitch * ((channel + 3) / 4); + const int b_src_pitch = y_src_pitch * height; + const int c_src_pitch = b_src_pitch * batch; bool outBound = (w >= width || h >= height || c >= channel); #ifdef MNN_NHWC @@ -390,156 +388,32 @@ __kernel void pack_buf(__private int global_dim0, __private int global_dim1, __p } #ifdef LOOP_BINARY_OPERATOR -__kernel void broadcast_binary_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, +__kernel void loop_binary_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, __global OUTPUT_TYPE* output, __global INPUT_TYPE* input0, __global INPUT_TYPE* input1, - __private const int8 src0_size, //(batch, channel, height, width) - __private const int4 src0C4_size, // nc4hw4 - __private const int8 src1_size, - __private const int4 src1C4_size, - __private const int8 dst_size, - __private const int dst_width, - __private const int dst_height, - __private const int dst_channel, - __private const int channel_block) { - int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); - - if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) { - - const int wo = pos.x; - const int ho = pos.y; - const int co = pos.z % channel_block; - const int no = pos.z / channel_block; - const int output_offset = ((((no * channel_block) + co) * dst_height + ho) * dst_width + wo) * 4; - int co4 = co << 2; - int4 covec = (int4)(co4 % dst_channel, (co4 + 1) % dst_channel, (co4 + 2) % dst_channel, (co4 + 3) % dst_channel); - int4 out_offset = ((no * dst_channel + covec) * dst_height + ho) * dst_width + wo; - int4 w = out_offset % (dst_size.s3 * dst_size.s4); out_offset /= (dst_size.s3 * dst_size.s4); - int4 h = out_offset % dst_size.s2; out_offset /= dst_size.s2; - int4 c = out_offset % dst_size.s1; out_offset /= dst_size.s1; - int4 n = out_offset % dst_size.s0; - float4 in0, in1; - -#ifdef BROADCAST_INPUT1 - in0 = convert_float4(vload4(0, input0 + output_offset)); - const int src1_channel_block = (src1C4_size.y + 3) / 4; - float* in1_ptr = (float*)&in1; - { - int4 
w0 = w % (src1_size.s3 * src1_size.s4); - int4 h0 = h % src1_size.s2; - int4 c0 = c % src1_size.s1; - int4 n0 = n % src1_size.s0; - int* w0_ptr = (int*)&w0; - int* h0_ptr = (int*)&h0; - int* c0_ptr = (int*)&c0; - int* n0_ptr = (int*)&n0; - for(int i = 0; i < 4; ++i){ - int c4offset = ((n0_ptr[i] * src1_size.s1 + c0_ptr[i]) * src1_size.s2 + h0_ptr[i]) * src1_size.s3 * src1_size.s4 + w0_ptr[i]; - int wc4 = c4offset % src1C4_size.w; c4offset /= src1C4_size.w; - int hc4 = c4offset % src1C4_size.z; c4offset /= src1C4_size.z; - int cc4 = c4offset % src1C4_size.y; c4offset /= src1C4_size.y; - int nc4 = c4offset % src1C4_size.x; - int cc4_offset = cc4 / 4; - int cc4_remain = cc4 % 4; - in1_ptr[i] = (float)input1[((((nc4 * src1_channel_block) + cc4_offset) * src1C4_size.z + hc4) * src1C4_size.w + wc4) * 4 + cc4_remain]; - } - } -#else - const int src0_channel_block = (src0C4_size.y + 3) / 4; - float* in0_ptr = (float*)&in0; - { - int4 w0 = w % (src0_size.s3 * src0_size.s4); - int4 h0 = h % src0_size.s2; - int4 c0 = c % src0_size.s1; - int4 n0 = n % src0_size.s0; - int* w0_ptr = (int*)&w0; - int* h0_ptr = (int*)&h0; - int* c0_ptr = (int*)&c0; - int* n0_ptr = (int*)&n0; - for(int i = 0; i < 4; ++i){ - int c4offset = ((n0_ptr[i] * src0_size.s1 + c0_ptr[i]) * src0_size.s2 + h0_ptr[i]) * src0_size.s3 * src0_size.s4 + w0_ptr[i]; - int wc4 = c4offset % src0C4_size.w; c4offset /= src0C4_size.w; - int hc4 = c4offset % src0C4_size.z; c4offset /= src0C4_size.z; - int cc4 = c4offset % src0C4_size.y; c4offset /= src0C4_size.y; - int nc4 = c4offset % src0C4_size.x; - int cc4_offset = cc4 / 4; - int cc4_remain = cc4 % 4; - in0_ptr[i] = (float)input0[((((nc4 * src0_channel_block) + cc4_offset) * src0C4_size.z + hc4) * src0C4_size.w + wc4) * 4 + cc4_remain]; - } - } - in1 = convert_float4(vload4(0, input1 + output_offset)); -#endif - float4 out = LOOP_BINARY_OPERATOR; - vstore4(CONVERT_OUTPUT4(out), 0, output + output_offset); - } -} - -__kernel void broadcast_binary_channel_equall_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, - __global OUTPUT_TYPE* output, __global INPUT_TYPE* input0, __global INPUT_TYPE* input1, - __private const int8 src0_size, //(batch, channel, height, width) - __private const int4 src0C4_size, // nc4hw4 - __private const int8 src1_size, - __private const int4 src1C4_size, - __private const int8 dst_size, - __private const int dst_width, - __private const int dst_height, - __private const int dst_channel, - __private const int channel_block) { - int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); - - if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) { - const int wo = pos.x; - const int ho = pos.y; - const int co = pos.z % channel_block; - const int no = pos.z / channel_block; - const int output_offset = ((((no * channel_block) + co) * dst_height + ho) * dst_width + wo) * 4; -#ifdef BROADCAST_INPUT1 - const int src1_channel_block = (src1C4_size.y + 3) / 4; - const int input_offset = (((((no % src1_size.s0) * src1_channel_block) + co) * src1C4_size.z + (ho % src1_size.s2)) * src1C4_size.w + (wo % (src1_size.s3 * src1_size.s4))) * 4; - float4 in0 = convert_float4(vload4(0, input0 + output_offset)); - float4 in1 = convert_float4(vload4(0, input1 + input_offset)); -#else - const int src0_channel_block = (src0C4_size.y + 3) / 4; - const int input_offset = (((((no % src0_size.s0) * src0_channel_block) + co) * src0C4_size.z + (ho % src0_size.s2)) * src0C4_size.w + (wo % (src0_size.s3 * src0_size.s4))) * 4; - float4 in0 
= convert_float4(vload4(0, input0 + input_offset)); - float4 in1 = convert_float4(vload4(0, input1 + output_offset)); -#endif - float4 out = LOOP_BINARY_OPERATOR; - vstore4(CONVERT_OUTPUT4(out), 0, output + output_offset); - } -} - -//channel = 1 and dimmision = 1 -__kernel void broadcast_binary_dimmision1_channel1_buf(__private int global_dim0, __private int global_dim1, __private int global_dim2, - __global OUTPUT_TYPE* output, __global INPUT_TYPE* input0, __global INPUT_TYPE* input1, - __private const int8 src0_size, //(batch, channel, height, width) - __private const int4 src0C4_size, // nc4hw4 - __private const int8 src1_size, - __private const int4 src1C4_size, - __private const int8 dst_size, - __private const int dst_width, - __private const int dst_height, - __private const int dst_channel, - __private const int channel_block) { - int3 pos = (int3)(get_global_id(0), get_global_id(1), get_global_id(2)); + __private const int input0Stride0, + __private const int input0Stride1, + __private const int input0Stride2, + __private const int input1Stride0, + __private const int input1Stride1, + __private const int input1Stride2, + __private const int outputStride0, + __private const int outputStride1, + __private const int outputStride2 + ) { + + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); - if (pos.x < global_dim0 && pos.y < global_dim1 && pos.z < global_dim2) { - const int wo = pos.x; - const int ho = pos.y; - const int co = pos.z % channel_block; - const int no = pos.z / channel_block; + if (x < global_dim0 && y < global_dim1 && z < global_dim2) { - const int output_offset = ((((no * channel_block) + co) * dst_height + ho) * dst_width + wo) * 4; -#ifdef BROADCAST_INPUT1 - const int input_offset = ((no % src1_size.s0) * src1_size.s2 + (ho % src1_size.s2)) * src1_size.s3 * src1_size.s4 + (wo % (src1_size.s3 * src1_size.s4)); - float4 in0 = convert_float4(vload4(0, input0 + output_offset)); - float4 in1 = (float4)(input1[input_offset]); -#else - const int input_offset = ((no % src0_size.s0) * src0_size.s2 + (ho % src0_size.s2)) * src0_size.s3 * src0_size.s4 + (wo % (src0_size.s3 * src0_size.s4)); - float4 in0 = (float4)(input0[input_offset]); - float4 in1 = convert_float4(vload4(0, input1 + output_offset)); -#endif - float4 out = LOOP_BINARY_OPERATOR; - vstore4(CONVERT_OUTPUT4(out), 0, output + output_offset); + int inputIndex0 = z * input0Stride0 + y * input0Stride1 + x * input0Stride2; + int inputIndex1 = z * input1Stride0 + y * input1Stride1 + x * input1Stride2; + int outputIndex = z * outputStride0 + y * outputStride1 + x * outputStride2; + float in0 = (float)input0[inputIndex0]; + float in1 = (float)input1[inputIndex1]; + float out = LOOP_BINARY_OPERATOR; + output[outputIndex] = (OUTPUT_TYPE)out; } } #endif diff --git a/source/backend/opencl/execution/cl/matmul_buf.cl b/source/backend/opencl/execution/cl/matmul_buf.cl index 4d2b65756..c4ddd12d8 100644 --- a/source/backend/opencl/execution/cl/matmul_buf.cl +++ b/source/backend/opencl/execution/cl/matmul_buf.cl @@ -16,426 +16,170 @@ __kernel void matmul_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a, __global const FLOAT* input_c, #endif __global FLOAT* output_c, - __private const int channels, - __private const int channel_blocks, - __private const int width_blocks, - __private const int width) { - const int width_blocks_idx = get_global_id(0);// output W - const int height_idx = get_global_id(1);// output H - - DEAL_NON_UNIFORM_DIM2(width_blocks_idx, height_idx); - 
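
Editor's note on the loop_binary_buf kernel above: the specialized broadcast_binary variants are replaced by a single kernel that addresses both inputs and the output through per-dimension strides, so broadcasting reduces to passing a stride of 0 along the broadcast axis. A host-side sketch of that addressing scheme follows; the element-wise op is fixed to addition here purely for illustration (the kernel substitutes LOOP_BINARY_OPERATOR at compile time).

    #include <stdio.h>

    /* out[z][y][x] = in0[...] + in1[...], all three addressed via explicit strides.
     * A broadcast input simply uses stride 0 on the broadcast dimension. */
    static void loop_binary(const float *in0, const int s0[3],
                            const float *in1, const int s1[3],
                            float *out, const int so[3],
                            int dim2, int dim1, int dim0) {
        for (int z = 0; z < dim2; ++z)
            for (int y = 0; y < dim1; ++y)
                for (int x = 0; x < dim0; ++x)
                    out[z * so[0] + y * so[1] + x * so[2]] =
                        in0[z * s0[0] + y * s0[1] + x * s0[2]] +
                        in1[z * s1[0] + y * s1[1] + x * s1[2]];
    }

    int main(void) {
        float a[6] = {1, 2, 3, 4, 5, 6};   /* shape (1, 2, 3) */
        float b[3] = {10, 20, 30};         /* shape (1, 1, 3), broadcast over y */
        float c[6] = {0};
        int sa[3] = {6, 3, 1}, sb[3] = {0, 0, 1}, sc[3] = {6, 3, 1};
        loop_binary(a, sa, b, sb, c, sc, 1, 2, 3);
        for (int i = 0; i < 6; ++i) printf("%g ", c[i]);   /* 11 22 33 14 25 36 */
        printf("\n");
        return 0;
    }
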
COMPUTE_FLOAT4 a; - COMPUTE_FLOAT4 b0 = 0, b1 = 0, b2 = 0, b3 = 0; - COMPUTE_FLOAT4 v_zero = (COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0); + __private const int M, + __private const int N, + __private const int K) { + int2 pos = (int2)(get_global_id(0), get_global_id(1)); // N M + + DEAL_NON_UNIFORM_DIM2(pos.x, pos.y); + const int idn = pos.x << 2; + const int idm = pos.y << 2; + + COMPUTE_FLOAT4 out[4]; #ifdef BIAS - COMPUTE_FLOAT4 temp = CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx, input_c)); - - COMPUTE_FLOAT result0 = temp.x; - COMPUTE_FLOAT result1 = temp.y; - COMPUTE_FLOAT result2 = temp.z; - COMPUTE_FLOAT result3 = temp.w; + COMPUTE_FLOAT4 bias = CONVERT_COMPUTE_FLOAT4(vload4(0, input_c + idn)); + #pragma unroll + for(int i = 0; i < 4; ++i){ + out[i] = bias; + } #else - COMPUTE_FLOAT result0 = 0; - COMPUTE_FLOAT result1 = 0; - COMPUTE_FLOAT result2 = 0; - COMPUTE_FLOAT result3 = 0; + #pragma unroll + for(int i = 0; i < 4; ++i){ + out[i] = (COMPUTE_FLOAT4)0; + } #endif - const int remain = channel_blocks*4 - channels; - for (short pos = 0; pos < channel_blocks - 1; pos += 1) { - const int inpa_offset = height_idx * channel_blocks + pos; - a = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - - const int inpb_offset = (pos*4) * width_blocks + width_blocks_idx; - - b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - b1 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks, input_b)); - b2 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*2, input_b)); - b3 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*3, input_b)); - - COMPUTE_FLOAT4 btmp0 = (COMPUTE_FLOAT4)(b0.s0, b1.s0, b2.s0, b3.s0); - COMPUTE_FLOAT4 btmp1 = (COMPUTE_FLOAT4)(b0.s1, b1.s1, b2.s1, b3.s1); - COMPUTE_FLOAT4 btmp2 = (COMPUTE_FLOAT4)(b0.s2, b1.s2, b2.s2, b3.s2); - COMPUTE_FLOAT4 btmp3 = (COMPUTE_FLOAT4)(b0.s3, b1.s3, b2.s3, b3.s3); - - result0 += dot(a, btmp0); - result1 += dot(a, btmp1); - result2 += dot(a, btmp2); - result3 += dot(a, btmp3); - } + const int K4 = (K + 3)/4; + #ifdef K_LEAVE + const int loop_end = max(K4 - 1, 0); + const int remain = K - loop_end*4; + #else + const int loop_end = K4; + #endif - { - const int inpa_offset = height_idx * channel_blocks + channel_blocks - 1; - a = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - - const int inpb_offset = ((channel_blocks - 1)*4) * width_blocks + width_blocks_idx; - - b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - b1 = (remain >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks, input_b)); - b2 = (remain >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*2, input_b)); - b3 = (remain >= 1) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*3, input_b)); - if (remain == 3) { - a.y = 0; - a.z = 0; - a.w = 0; - } else if (remain == 2) { - a.z = 0; - a.w = 0; - } else if (remain == 1) { - a.w = 0;; - } - - COMPUTE_FLOAT4 btmp0 = (COMPUTE_FLOAT4)(b0.s0, b1.s0, b2.s0, b3.s0); - COMPUTE_FLOAT4 btmp1 = (COMPUTE_FLOAT4)(b0.s1, b1.s1, b2.s1, b3.s1); - COMPUTE_FLOAT4 btmp2 = (COMPUTE_FLOAT4)(b0.s2, b1.s2, b2.s2, b3.s2); - COMPUTE_FLOAT4 btmp3 = (COMPUTE_FLOAT4)(b0.s3, b1.s3, b2.s3, b3.s3); - - result0 += dot(a, btmp0); - result1 += dot(a, btmp1); - result2 += dot(a, btmp2); - result3 += dot(a, btmp3); - } - - const int out_offset = height_idx * width_blocks + width_blocks_idx; - vstore4(CONVERT_FLOAT4((COMPUTE_FLOAT4)(result0, result1, result2, result3)), out_offset, output_c); -} - -__kernel void matmul_transB_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a, - __global const FLOAT* input_b, - #ifdef BIAS - __global const FLOAT* input_c, - #endif - __global FLOAT* output_c, - __private const int channels, - __private const int channel_blocks, - __private const int width_blocks, - __private const int width) { - const int width_blocks_idx = get_global_id(0); - const int height_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(width_blocks_idx, height_idx); - COMPUTE_FLOAT4 a; - COMPUTE_FLOAT4 b0 = 0, b1 = 0, b2 = 0, b3 = 0; - COMPUTE_FLOAT4 v_zero = (COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0); - - #ifdef BIAS - COMPUTE_FLOAT4 temp = CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx, input_c)); - COMPUTE_FLOAT result0 = temp.x; - COMPUTE_FLOAT result1 = temp.y; - COMPUTE_FLOAT result2 = temp.z; - COMPUTE_FLOAT result3 = temp.w; + #ifdef TRANSPOSE_A + __global const FLOAT* input_a_offset = input_a + idm; // K x M #else - COMPUTE_FLOAT result0 = 0; - COMPUTE_FLOAT result1 = 0; - COMPUTE_FLOAT result2 = 0; - COMPUTE_FLOAT result3 = 0; + __global const FLOAT* input_a_offset = input_a + idm * K; // M x K #endif - - const int remaina = channel_blocks*4 - channels; - const int remainb = (width_blocks_idx+1)*4 - width; - for (short pos = 0; pos < channel_blocks - 1; pos += 1) { - const int inpa_offset = height_idx * channel_blocks + pos; - a = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - - const int inpb_offset = (width_blocks_idx*4) * channel_blocks + pos; - - b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - b1 = (remainb >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks, input_b)); - b2 = (remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*2, input_b)); - b3 = (remainb >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*3, input_b)); - - result0 += dot(a, b0); - result1 += dot(a, b1); - result2 += dot(a, b2); - result3 += dot(a, b3); - } - { - const int inpa_offset = height_idx * channel_blocks + channel_blocks - 1; - a = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - - const int inpb_offset = (width_blocks_idx*4) * channel_blocks + channel_blocks - 1; - - b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - b1 = (remainb >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks, input_b)); - b2 = (remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*2, input_b)); - b3 = (remainb >= 1) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*3, input_b)); - - if (remaina == 3) { - a.y = 0; - a.z = 0; - a.w = 0; - } else if (remaina == 2) { - a.z = 0; - a.w = 0; - } else if (remaina == 1) { - a.w = 0; - } - - result0 += dot(a, b0); - result1 += dot(a, b1); - result2 += dot(a, b2); - result3 += dot(a, b3); - } - const int out_offset = height_idx * width_blocks + width_blocks_idx; - vstore4(CONVERT_FLOAT4((COMPUTE_FLOAT4)(result0, result1, result2, result3)), out_offset, output_c); -} - - -__kernel void matmul_transA_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a, - __global const FLOAT* input_b, - #ifdef BIAS - __global const FLOAT* input_c, - #endif - __global FLOAT* output_c, - __private const int channels, - __private const int channel_blocks, - __private const int height, - __private const int height_blocks, - __private const int width_blocks, - __private const int width) { - const int width_blocks_idx = get_global_id(0); - const int height_blocks_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(width_blocks_idx, height_blocks_idx); - - COMPUTE_FLOAT4 v_zero = (COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0); - #ifdef BIAS - COMPUTE_FLOAT4 result0 = CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx, input_c)); - COMPUTE_FLOAT4 result1 = result0; - COMPUTE_FLOAT4 result2 = result0; - COMPUTE_FLOAT4 result3 = result0; + #ifdef TRANSPOSE_B + __global const FLOAT* input_b_offset = input_b + idn * K; // N x K #else - COMPUTE_FLOAT4 result0 = 0; - COMPUTE_FLOAT4 result1 = 0; - COMPUTE_FLOAT4 result2 = 0; - COMPUTE_FLOAT4 result3 = 0; + __global const FLOAT* input_b_offset = input_b + idn; // K x N #endif - const int remain = channel_blocks*4 - channels; - for (short pos = 0; pos < channel_blocks - 1; pos += 1) { - - const int inpa_offset = (4*pos) * height_blocks + height_blocks_idx; - COMPUTE_FLOAT4 a0 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - COMPUTE_FLOAT4 a1 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks, input_a)); - COMPUTE_FLOAT4 a2 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*2, input_a)); - COMPUTE_FLOAT4 a3 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*3, input_a)); - - const int inpb_offset = (4*pos) * width_blocks + width_blocks_idx; - COMPUTE_FLOAT4 b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - COMPUTE_FLOAT4 b1 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks, input_b)); - COMPUTE_FLOAT4 b2 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*2, input_b)); - COMPUTE_FLOAT4 b3 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*3, input_b)); - - COMPUTE_FLOAT4 a0_trans = (COMPUTE_FLOAT4)(a0.x, a1.x, a2.x, a3.x); - COMPUTE_FLOAT4 a1_trans = (COMPUTE_FLOAT4)(a0.y, a1.y, a2.y, a3.y); - COMPUTE_FLOAT4 a2_trans = (COMPUTE_FLOAT4)(a0.z, a1.z, a2.z, a3.z); - COMPUTE_FLOAT4 a3_trans = (COMPUTE_FLOAT4)(a0.w, a1.w, a2.w, a3.w); - - COMPUTE_FLOAT4 b0_trans = (COMPUTE_FLOAT4)(b0.x, b1.x, b2.x, b3.x); - COMPUTE_FLOAT4 b1_trans = (COMPUTE_FLOAT4)(b0.y, b1.y, b2.y, b3.y); - COMPUTE_FLOAT4 b2_trans = (COMPUTE_FLOAT4)(b0.z, b1.z, b2.z, b3.z); - COMPUTE_FLOAT4 b3_trans = (COMPUTE_FLOAT4)(b0.w, b1.w, b2.w, b3.w); - - //matmul - result0.x += dot(a0_trans, b0_trans); - result0.y += dot(a0_trans, b1_trans); - result0.z += dot(a0_trans, b2_trans); - result0.w += dot(a0_trans, b3_trans); - - result1.x += dot(a1_trans, b0_trans); - result1.y += dot(a1_trans, b1_trans); - result1.z += dot(a1_trans, b2_trans); - result1.w += dot(a1_trans, b3_trans); + for (int k = 0; k < loop_end; ++k) { + 
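+        // Editor's note (descriptive comments only, no behavior change): each
+        // iteration consumes 4 elements of K. A[] holds a 4x4 tile of input_a,
+        // transposed on the fly when TRANSPOSE_A is set, and B[] holds a 4x4
+        // tile of input_b, likewise for TRANSPOSE_B; the mad() loop at the end
+        // accumulates the 4x4 output tile out[0..3] kept in registers.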
int kindex = k << 2; + COMPUTE_FLOAT4 A[4]; // m4 x k4 + COMPUTE_FLOAT4 B[4]; // k4 x n4 + #ifdef TRANSPOSE_A + { + COMPUTE_FLOAT4 tmp0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + kindex * M)); + COMPUTE_FLOAT4 tmp1 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + (kindex + 1) * M)); + COMPUTE_FLOAT4 tmp2 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + (kindex + 2) * M)); + COMPUTE_FLOAT4 tmp3 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + (kindex + 3) * M)); + + A[0] = (COMPUTE_FLOAT4)(tmp0.x, tmp1.x, tmp2.x, tmp3.x); + A[1] = (COMPUTE_FLOAT4)(tmp0.y, tmp1.y, tmp2.y, tmp3.y); + A[2] = (COMPUTE_FLOAT4)(tmp0.z, tmp1.z, tmp2.z, tmp3.z); + A[3] = (COMPUTE_FLOAT4)(tmp0.w, tmp1.w, tmp2.w, tmp3.w); + } + #else + A[0] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + kindex)); + A[1] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + kindex + K)); + A[2] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + kindex + 2 * K)); + A[3] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + kindex + 3 * K)); + #endif - result2.x += dot(a2_trans, b0_trans); - result2.y += dot(a2_trans, b1_trans); - result2.z += dot(a2_trans, b2_trans); - result2.w += dot(a2_trans, b3_trans); + #ifdef TRANSPOSE_B + { + COMPUTE_FLOAT4 tmp0 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + kindex)); + COMPUTE_FLOAT4 tmp1 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + kindex + K)); + COMPUTE_FLOAT4 tmp2 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + kindex + 2 * K)); + COMPUTE_FLOAT4 tmp3 = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + kindex + 3 * K)); + + B[0] = (COMPUTE_FLOAT4)(tmp0.x, tmp1.x, tmp2.x, tmp3.x); + B[1] = (COMPUTE_FLOAT4)(tmp0.y, tmp1.y, tmp2.y, tmp3.y); + B[2] = (COMPUTE_FLOAT4)(tmp0.z, tmp1.z, tmp2.z, tmp3.z); + B[3] = (COMPUTE_FLOAT4)(tmp0.w, tmp1.w, tmp2.w, tmp3.w); + } + #else + B[0] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + kindex * N)); + B[1] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + (kindex + 1) * N)); + B[2] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + (kindex + 2) * N)); + B[3] = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + (kindex + 3) * N)); + #endif - result3.x += dot(a3_trans, b0_trans); - result3.y += dot(a3_trans, b1_trans); - result3.z += dot(a3_trans, b2_trans); - result3.w += dot(a3_trans, b3_trans); + #pragma unroll + for (int vec_m = 0; vec_m < 4; ++vec_m){ + out[vec_m] = mad((COMPUTE_FLOAT4)A[vec_m].x, B[0], out[vec_m]); + out[vec_m] = mad((COMPUTE_FLOAT4)A[vec_m].y, B[1], out[vec_m]); + out[vec_m] = mad((COMPUTE_FLOAT4)A[vec_m].z, B[2], out[vec_m]); + out[vec_m] = mad((COMPUTE_FLOAT4)A[vec_m].w, B[3], out[vec_m]); + } } - - { - const int inpa_offset = (4*(channel_blocks - 1)) * height_blocks + height_blocks_idx; - COMPUTE_FLOAT4 a0 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - COMPUTE_FLOAT4 a1 = ((remain >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks, input_a))); - COMPUTE_FLOAT4 a2 = ((remain >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*2, input_a))); - COMPUTE_FLOAT4 a3 = ((remain >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*3, input_a))); - - const int inpb_offset = (4*(channel_blocks - 1)) * width_blocks + width_blocks_idx; - COMPUTE_FLOAT4 b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - COMPUTE_FLOAT4 b1 = ((remain >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks, input_b))); - COMPUTE_FLOAT4 b2 = ((remain >= 3) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*2, input_b))); - COMPUTE_FLOAT4 b3 = ((remain >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + width_blocks*3, input_b))); - - COMPUTE_FLOAT4 a0_trans = (COMPUTE_FLOAT4)(a0.x, a1.x, a2.x, a3.x); - COMPUTE_FLOAT4 a1_trans = (COMPUTE_FLOAT4)(a0.y, a1.y, a2.y, a3.y); - COMPUTE_FLOAT4 a2_trans = (COMPUTE_FLOAT4)(a0.z, a1.z, a2.z, a3.z); - COMPUTE_FLOAT4 a3_trans = (COMPUTE_FLOAT4)(a0.w, a1.w, a2.w, a3.w); + #ifdef K_LEAVE + for (int k = loop_end << 2; k < K; ++k){ + COMPUTE_FLOAT4 A; // m4 + COMPUTE_FLOAT4 B; // n4 + #ifdef TRANSPOSE_A + A = CONVERT_COMPUTE_FLOAT4(vload4(0, input_a_offset + k * M)); + #else + A.x = (COMPUTE_FLOAT)input_a_offset[k]; + A.y = (COMPUTE_FLOAT)input_a_offset[k + K]; + A.z = (COMPUTE_FLOAT)input_a_offset[k + 2 * K]; + A.w = (COMPUTE_FLOAT)input_a_offset[k + 3 * K]; + #endif - COMPUTE_FLOAT4 b0_trans = (COMPUTE_FLOAT4)(b0.x, b1.x, b2.x, b3.x); - COMPUTE_FLOAT4 b1_trans = (COMPUTE_FLOAT4)(b0.y, b1.y, b2.y, b3.y); - COMPUTE_FLOAT4 b2_trans = (COMPUTE_FLOAT4)(b0.z, b1.z, b2.z, b3.z); - COMPUTE_FLOAT4 b3_trans = (COMPUTE_FLOAT4)(b0.w, b1.w, b2.w, b3.w); - - //matmul - result0.x += dot(a0_trans, b0_trans); - result0.y += dot(a0_trans, b1_trans); - result0.z += dot(a0_trans, b2_trans); - result0.w += dot(a0_trans, b3_trans); - - result1.x += dot(a1_trans, b0_trans); - result1.y += dot(a1_trans, b1_trans); - result1.z += dot(a1_trans, b2_trans); - result1.w += dot(a1_trans, b3_trans); - - result2.x += dot(a2_trans, b0_trans); - result2.y += dot(a2_trans, b1_trans); - result2.z += dot(a2_trans, b2_trans); - result2.w += dot(a2_trans, b3_trans); - - result3.x += dot(a3_trans, b0_trans); - result3.y += dot(a3_trans, b1_trans); - result3.z += dot(a3_trans, b2_trans); - result3.w += dot(a3_trans, b3_trans); + #ifdef TRANSPOSE_B + B.x = (COMPUTE_FLOAT)input_b_offset[k]; + B.y = (COMPUTE_FLOAT)input_b_offset[k + K]; + B.z = (COMPUTE_FLOAT)input_b_offset[k + 2 * K]; + B.w = (COMPUTE_FLOAT)input_b_offset[k + 3 * K]; + #else + B = CONVERT_COMPUTE_FLOAT4(vload4(0, input_b_offset + k * N)); + #endif + out[0] = mad((COMPUTE_FLOAT4)A.x, B, out[0]); + out[1] = mad((COMPUTE_FLOAT4)A.y, B, out[1]); + out[2] = mad((COMPUTE_FLOAT4)A.z, B, out[2]); + out[3] = mad((COMPUTE_FLOAT4)A.w, B, out[3]); } - - const int out_offset = (4*height_blocks_idx) * width_blocks + width_blocks_idx; - - vstore4(CONVERT_FLOAT4(result0), out_offset, output_c); - if(4*height_blocks_idx+1 >= height) return; - vstore4(CONVERT_FLOAT4(result1), out_offset + width_blocks, output_c); - if(4*height_blocks_idx+2 >= height) return; - vstore4(CONVERT_FLOAT4(result2), out_offset + width_blocks*2, output_c); - if(4*height_blocks_idx+3 >= height) return; - vstore4(CONVERT_FLOAT4(result3), out_offset + width_blocks*3, output_c); -} - -__kernel void matmul_transA_transB_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a, - __global const FLOAT* input_b, - #ifdef BIAS - __global const FLOAT* input_c, - #endif - __global FLOAT* output_c, - __private const int channels, - __private const int channel_blocks, - __private const int height, - __private const int height_blocks, - __private const int width_blocks, - __private const int width) { - const int width_blocks_idx = get_global_id(0); - const int height_blocks_idx = get_global_id(1); - - DEAL_NON_UNIFORM_DIM2(width_blocks_idx, height_blocks_idx); - - COMPUTE_FLOAT4 v_zero = (COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0); - #ifdef BIAS - COMPUTE_FLOAT4 result0 = CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx, 
input_c)); - - COMPUTE_FLOAT4 result1 = result0; - COMPUTE_FLOAT4 result2 = result0; - COMPUTE_FLOAT4 result3 = result0; - #else - COMPUTE_FLOAT4 result0 = 0; - COMPUTE_FLOAT4 result1 = 0; - COMPUTE_FLOAT4 result2 = 0; - COMPUTE_FLOAT4 result3 = 0; #endif - const int remaina = channel_blocks * 4 - channels; - const int remainb = (width_blocks_idx + 1) * 4 - width; - for (short pos = 0; pos < channel_blocks - 1; pos += 1) { - const int inpa_offset = (4*pos) * height_blocks + height_blocks_idx; - COMPUTE_FLOAT4 a0 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - COMPUTE_FLOAT4 a1 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks, input_a)); - COMPUTE_FLOAT4 a2 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*2, input_a)); - COMPUTE_FLOAT4 a3 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*3, input_a)); - - const int inpb_offset = (4*width_blocks_idx) * channel_blocks + pos; - COMPUTE_FLOAT4 b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - COMPUTE_FLOAT4 b1 = ((remainb >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks, input_b))); - COMPUTE_FLOAT4 b2 = ((remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*2, input_b))); - COMPUTE_FLOAT4 b3 = ((remainb >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*3, input_b))); - - COMPUTE_FLOAT4 a0_trans = (COMPUTE_FLOAT4)(a0.x, a1.x, a2.x, a3.x); - COMPUTE_FLOAT4 a1_trans = (COMPUTE_FLOAT4)(a0.y, a1.y, a2.y, a3.y); - COMPUTE_FLOAT4 a2_trans = (COMPUTE_FLOAT4)(a0.z, a1.z, a2.z, a3.z); - COMPUTE_FLOAT4 a3_trans = (COMPUTE_FLOAT4)(a0.w, a1.w, a2.w, a3.w); - - //matmul - result0.x += dot(a0_trans, b0); - result0.y += dot(a0_trans, b1); - result0.z += dot(a0_trans, b2); - result0.w += dot(a0_trans, b3); - - result1.x += dot(a1_trans, b0); - result1.y += dot(a1_trans, b1); - result1.z += dot(a1_trans, b2); - result1.w += dot(a1_trans, b3); - - result2.x += dot(a2_trans, b0); - result2.y += dot(a2_trans, b1); - result2.z += dot(a2_trans, b2); - result2.w += dot(a2_trans, b3); - - result3.x += dot(a3_trans, b0); - result3.y += dot(a3_trans, b1); - result3.z += dot(a3_trans, b2); - result3.w += dot(a3_trans, b3); - } - { - const int inpa_offset = (4*(channel_blocks-1)) * height_blocks + height_blocks_idx; - COMPUTE_FLOAT4 a0 = CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset, input_a)); - COMPUTE_FLOAT4 a1 = ((remaina >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks, input_a))); - COMPUTE_FLOAT4 a2 = ((remaina >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*2, input_a))); - COMPUTE_FLOAT4 a3 = ((remaina >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset + height_blocks*3, input_a))); - - const int inpb_offset = (4*width_blocks_idx) * channel_blocks + channel_blocks-1; - COMPUTE_FLOAT4 b0 = CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset, input_b)); - COMPUTE_FLOAT4 b1 = ((remainb >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks, input_b))); - COMPUTE_FLOAT4 b2 = ((remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*2, input_b))); - COMPUTE_FLOAT4 b3 = ((remainb >= 1) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset + channel_blocks*3, input_b))); - - COMPUTE_FLOAT4 a0_trans = (COMPUTE_FLOAT4)(a0.x, a1.x, a2.x, a3.x); - COMPUTE_FLOAT4 a1_trans = (COMPUTE_FLOAT4)(a0.y, a1.y, a2.y, a3.y); - COMPUTE_FLOAT4 a2_trans = (COMPUTE_FLOAT4)(a0.z, a1.z, a2.z, a3.z); - COMPUTE_FLOAT4 a3_trans = (COMPUTE_FLOAT4)(a0.w, a1.w, a2.w, a3.w); - - //matmul - result0.x += dot(a0_trans, b0); - result0.y += dot(a0_trans, b1); - result0.z += dot(a0_trans, b2); - result0.w += dot(a0_trans, b3); - - result1.x += dot(a1_trans, b0); - result1.y += dot(a1_trans, b1); - result1.z += dot(a1_trans, b2); - result1.w += dot(a1_trans, b3); - - result2.x += dot(a2_trans, b0); - result2.y += dot(a2_trans, b1); - result2.z += dot(a2_trans, b2); - result2.w += dot(a2_trans, b3); - - result3.x += dot(a3_trans, b0); - result3.y += dot(a3_trans, b1); - result3.z += dot(a3_trans, b2); - result3.w += dot(a3_trans, b3); + const int out_offset = idm * N + idn; + #ifdef M_LEAVE + if(idm + 3 >= M){ + #ifdef N_LEAVE + if(idn + 3 >= N){ + for (int vec_m = 0; vec_m < M - idm; ++vec_m){ + COMPUTE_FLOAT *out_ptr = (COMPUTE_FLOAT*)&out[vec_m]; + for(int vec_n = 0; vec_n < N - idn; ++vec_n){ + output_c[out_offset + vec_m * N + vec_n] = out_ptr[vec_n]; + } + } + } else { + #endif + for (int vec_m = 0; vec_m < M - idm; ++vec_m){ + vstore4(CONVERT_FLOAT4(out[vec_m]), 0, output_c + out_offset + vec_m * N); + } + + #ifdef N_LEAVE + } + #endif + } else{ + #endif + #ifdef N_LEAVE + if(idn + 3 >= N){ + #pragma unroll + for (int vec_m = 0; vec_m < 4; ++vec_m){ + COMPUTE_FLOAT *out_ptr = (COMPUTE_FLOAT*)&out[vec_m]; + for(int vec_n = 0; vec_n < N - idn; ++vec_n){ + output_c[out_offset + vec_m * N + vec_n] = out_ptr[vec_n]; + } + } + } else { + #endif + #pragma unroll + for (int vec_m = 0; vec_m < 4; ++vec_m){ + vstore4(CONVERT_FLOAT4(out[vec_m]), 0, output_c + out_offset + vec_m * N); + } + #ifdef N_LEAVE + } + #endif + #ifdef M_LEAVE } - - const int out_offset = (4*height_blocks_idx) * width_blocks + width_blocks_idx; - - vstore4(CONVERT_FLOAT4(result0), out_offset, output_c); - if(4*height_blocks_idx+1 >= height) return; - vstore4(CONVERT_FLOAT4(result1), out_offset + width_blocks, output_c); - if(4*height_blocks_idx+2 >= height) return; - vstore4(CONVERT_FLOAT4(result2), out_offset + width_blocks*2, output_c); - if(4*height_blocks_idx+3 >= height) return; - vstore4(CONVERT_FLOAT4(result3), out_offset + width_blocks*3, output_c); + #endif } diff --git a/source/backend/opencl/execution/cl/matmul_params_buf.cl b/source/backend/opencl/execution/cl/matmul_params_buf.cl index c4520fc8e..a96f2caa7 100644 --- a/source/backend/opencl/execution/cl/matmul_params_buf.cl +++ b/source/backend/opencl/execution/cl/matmul_params_buf.cl @@ -83,6 +83,8 @@ // 2 -> with bias (eltwise_add) [M, N] // 3 -> with bias (eltwise_sub) [M, N] // 4 -> with bias (eltwise_sub and get negative) [M, N] +// 5 -> with bias (mask 0 for invalid) [M, N] + #ifndef BIAS_TYPE #define BIAS_TYPE 0 #endif @@ -95,6 +97,8 @@ #define DEAL_BIAS(x, a) x = x - a #elif BIAS_TYPE == 4 #define DEAL_BIAS(x, a) x = a - x +#elif BIAS_TYPE == 5 +#define DEAL_BIAS(x, a) x = (a == 0 ? (FLOAT)(-FLT_MAX) : x) #endif // By default the workgroup size requirement is enabled. 
For Qualcomm devices the workgroup size @@ -103,7 +107,32 @@ #define RELAX_WORKGROUP_SIZE 0 #endif -#define ZERO (FLOAT)0.0f +typedef float real_arg; +#define GetRealArg(x) (FLOAT)x +typedef FLOAT real; + +#ifndef PRECISION_COMPUTE +#define PRECISION_COMPUTE COMPUTE_FLOAT +#define CONVERT_PRECISION_COMPUTE(x) CONVERT_COMPUTE_FLOAT(x) +#endif +#ifndef PRECISION_COMPUTE2 +#define PRECISION_COMPUTE2 COMPUTE_FLOAT2 +#define CONVERT_PRECISION_COMPUTE2(x) CONVERT_COMPUTE_FLOAT2(x) +#endif +#ifndef PRECISION_COMPUTE4 +#define PRECISION_COMPUTE4 COMPUTE_FLOAT4 +#define CONVERT_PRECISION_COMPUTE4(x) CONVERT_COMPUTE_FLOAT4(x) +#endif +#ifndef PRECISION_COMPUTE8 +#define PRECISION_COMPUTE8 COMPUTE_FLOAT8 +#define CONVERT_PRECISION_COMPUTE8(x) CONVERT_COMPUTE_FLOAT8(x) +#endif +#ifndef PRECISION_COMPUTE16 +#define PRECISION_COMPUTE16 COMPUTE_FLOAT16 +#define CONVERT_PRECISION_COMPUTE16(x) CONVERT_COMPUTE_FLOAT16(x) +#endif + +#define ZERO (PRECISION_COMPUTE)0.0f // Sets a variable to zero #define SetToZero(a) a = ZERO #define IsZero(a) (a == ZERO) @@ -129,43 +158,72 @@ INLINE_FUNC int GetGroupID0() { return get_group_id(0); } // ================================================================================================= -// End of the C++11 raw string literal - -typedef float real_arg; -#define GetRealArg(x) (FLOAT)x -typedef FLOAT real; - // Data-widths in dimension M #if VWM == 1 typedef FLOAT realM; + #define COMPUTE_FLOATM PRECISION_COMPUTE + #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE(x) + #define CONVERT_FLOATM(x) CONVERT_FLOAT(x) #elif VWM == 2 typedef FLOAT2 realM; + #define COMPUTE_FLOATM PRECISION_COMPUTE2 + #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE2(x) + #define CONVERT_FLOATM(x) CONVERT_FLOAT2(x) #elif VWM == 4 typedef FLOAT4 realM; + #define COMPUTE_FLOATM PRECISION_COMPUTE4 + #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE4(x) + #define CONVERT_FLOATM(x) CONVERT_FLOAT4(x) #elif VWM == 8 typedef FLOAT8 realM; + #define COMPUTE_FLOATM PRECISION_COMPUTE8 + #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE8(x) + #define CONVERT_FLOATM(x) CONVERT_FLOAT8(x) #elif VWM == 16 typedef FLOAT16 realM; + #define COMPUTE_FLOATM PRECISION_COMPUTE16 + #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE16(x) + #define CONVERT_FLOATM(x) CONVERT_FLOAT16(x) #endif // Data-widths in dimension N #if VWN == 1 typedef FLOAT realN; + typedef int intN; + #define COMPUTE_FLOATN PRECISION_COMPUTE + #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE(x) + #define CONVERT_FLOATN(x) CONVERT_FLOAT(x) #elif VWN == 2 typedef FLOAT2 realN; + typedef int2 intN; + #define COMPUTE_FLOATN PRECISION_COMPUTE2 + #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE2(x) + #define CONVERT_FLOATN(x) CONVERT_FLOAT2(x) #elif VWN == 4 typedef FLOAT4 realN; + typedef int4 intN; + #define COMPUTE_FLOATN PRECISION_COMPUTE4 + #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE4(x) + #define CONVERT_FLOATN(x) CONVERT_FLOAT4(x) #elif VWN == 8 typedef FLOAT8 realN; + typedef int8 intN; + #define COMPUTE_FLOATN PRECISION_COMPUTE8 + #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE8(x) + #define CONVERT_FLOATN(x) CONVERT_FLOAT8(x) #elif VWN == 16 typedef FLOAT16 realN; + typedef int16 intN; + #define COMPUTE_FLOATN PRECISION_COMPUTE16 + #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE16(x) + #define CONVERT_FLOATN(x) CONVERT_FLOAT16(x) #endif // 
================================================================================================= // Initializes the accumulation registers to zero -INLINE_FUNC realM InitAccRegisters() { - realM result; +INLINE_FUNC COMPUTE_FLOATM InitAccRegisters() { + COMPUTE_FLOATM result; #if VWM == 1 SetToZero(result); #elif VWM == 2 @@ -206,8 +264,8 @@ INLINE_FUNC realM InitAccRegisters() { return result; } -INLINE_FUNC realN InitAccRegistersN() { - realN result; +INLINE_FUNC COMPUTE_FLOATN InitAccRegistersN() { + COMPUTE_FLOATN result; #if VWN == 1 SetToZero(result); #elif VWN == 2 @@ -443,10 +501,10 @@ INLINE_FUNC realN LocalToPrivateB(LOCAL_PTR realN* blm, const int _ni, const int #endif // The vectorised multiply-add function -INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) { +INLINE_FUNC COMPUTE_FLOATM MultiplyAddVector(COMPUTE_FLOATM cvec, COMPUTE_FLOATM avec, PRECISION_COMPUTE bval) { #if USE_VECTOR_MAD == 1 #if USE_CL_MAD == 1 - cvec = mad(avec, (realM)bval, cvec); + cvec = mad(avec, (COMPUTE_FLOATM)bval, cvec); #else cvec += avec * bval; #endif @@ -493,10 +551,10 @@ INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bva } // The vectorised multiply-add function -INLINE_FUNC realN MultiplyAddVectorN(realN cvec, const real avec, const realN bval) { +INLINE_FUNC COMPUTE_FLOATN MultiplyAddVectorN(COMPUTE_FLOATN cvec, PRECISION_COMPUTE avec, COMPUTE_FLOATN bval) { #if USE_VECTOR_MAD == 1 #if USE_CL_MAD == 1 - cvec = mad((realN)avec, bval, cvec); + cvec = mad((COMPUTE_FLOATN)avec, bval, cvec); #else cvec += avec * bval; #endif @@ -571,8 +629,8 @@ INLINE_FUNC INT2 StoreIndexM() { } // layout : [N, M] -INLINE_FUNC void StoreResultsM(__global realM* cgm, realM c_value, const INT2 baseOffset, const int _mi, const int _ni, - const int kSizeM, const real alpha, const real beta) { +INLINE_FUNC void StoreResultsM(__global realM* cgm, COMPUTE_FLOATM c_value, const INT2 baseOffset, const int _mi, const int _ni, + const int kSizeM, const PRECISION_COMPUTE alpha, const PRECISION_COMPUTE beta) { #if STRM == 0 int idm = _mi + baseOffset.index[0]; #elif STRM == 1 @@ -586,11 +644,11 @@ INLINE_FUNC void StoreResultsM(__global realM* cgm, realM c_value, const INT2 ba int index = idn*(kSizeM/VWM) + idm; - realM result = c_value; + COMPUTE_FLOATM result = c_value; // The final multiplication with alpha (in case beta == 0) #ifdef ONLY_HAVE_ALPHA - realM xval = c_value; + COMPUTE_FLOATM xval = c_value; #if VWM == 1 Multiply(result, alpha, xval); #elif VWM == 2 @@ -632,8 +690,8 @@ INLINE_FUNC void StoreResultsM(__global realM* cgm, realM c_value, const INT2 ba // The final multiplication with alpha and the addition with beta*C #ifdef HAVE_ALPHA_BETA - realM xval = c_value; - realM yval = cgm[index]; + COMPUTE_FLOATM xval = c_value; + COMPUTE_FLOATM yval = CONVERT_COMPUTE_FLOATM(cgm[index]); #if VWM == 1 AXPBY(result, alpha, xval, beta, yval); #elif VWM == 2 @@ -672,7 +730,7 @@ INLINE_FUNC void StoreResultsM(__global realM* cgm, realM c_value, const INT2 ba AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); #endif #endif - cgm[index] = result; + cgm[index] = CONVERT_FLOATM(result); } INLINE_FUNC INT2 StoreIndexN() { @@ -695,7 +753,7 @@ INLINE_FUNC INT2 StoreIndexN() { return res; } // layout : [M, N] -INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, +INLINE_FUNC void StoreResultsN(__global realN* cgn, COMPUTE_FLOATN c_value, const INT2 baseOffset, #if BIAS_TYPE > 0 #if BIAS_TYPE > 1 @@ -705,7 +763,7 @@ INLINE_FUNC void StoreResultsN(__global 
realN* cgn, realN c_value, #endif #endif const int _mi, const int _ni, - const int cstride/*kSizeN*/, const int dstride/*kSizeN*/, const real alpha, const real beta) { + const int cstride/*kSizeN*/, const int dstride/*kSizeN*/, const PRECISION_COMPUTE alpha, const PRECISION_COMPUTE beta) { #if STRM == 0 int idm = _mi + baseOffset.index[0]; @@ -720,11 +778,11 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, int index = idm * (cstride/VWN) + idn; - realN result = c_value; + COMPUTE_FLOATN result = c_value; // The final multiplication with alpha (in case beta == 0) #ifdef ONLY_HAVE_ALPHA - realN xval = c_value; + COMPUTE_FLOATN xval = c_value; #if VWN == 1 Multiply(result, alpha, xval); #elif VWN == 2 @@ -766,8 +824,8 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, // The final multiplication with alpha and the addition with beta*C #ifdef HAVE_ALPHA_BETA - realN xval = c_value; - realN yval = cgn[index]; + COMPUTE_FLOATN xval = c_value; + COMPUTE_FLOATN yval = CONVERT_COMPUTE_FLOATN(cgn[index]); #if VWN == 1 AXPBY(result, alpha, xval, beta, yval); #elif VWN == 2 @@ -810,29 +868,31 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, #if BIAS_TYPE > 0 #if BIAS_TYPE == 1 - realN eval = epm[_ni]; + COMPUTE_FLOATN eval = CONVERT_COMPUTE_FLOATN(epm[_ni]); + #elif BIAS_TYPE == 5 + int index_bias = idm * (dstride/VWN) + idn; + intN eval = ((__global intN*)egm)[index_bias]; #else - int index_bias = idm * (dstride/VWN) + idn; - realN eval = egm[index_bias]; + COMPUTE_FLOATN eval = CONVERT_COMPUTE_FLOATN(egm[index_bias]); #endif #if VWN == 1 DEAL_BIAS(result, eval); #ifdef RELU - result = fmax(result, (FLOAT)0); + result = fmax(result, (COMPUTE_FLOATN)0); #endif #ifdef RELU6 - result = clamp(result, (FLOAT)0, (FLOAT)6); + result = clamp(result, (COMPUTE_FLOATN)0, (COMPUTE_FLOATN)6); #endif #elif VWN == 2 DEAL_BIAS(result.x, eval.x); DEAL_BIAS(result.y, eval.y); #ifdef RELU - result = fmax(result, (FLOAT2)0); + result = fmax(result, (COMPUTE_FLOATN)0); #endif #ifdef RELU6 - result = clamp(result, (FLOAT2)0, (FLOAT2)6); + result = clamp(result, (COMPUTE_FLOATN)0, (COMPUTE_FLOATN)6); #endif #elif VWN == 4 DEAL_BIAS(result.x, eval.x); @@ -840,10 +900,10 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, DEAL_BIAS(result.z, eval.z); DEAL_BIAS(result.w, eval.w); #ifdef RELU - result = fmax(result, (FLOAT4)0); + result = fmax(result, (COMPUTE_FLOATN)0); #endif #ifdef RELU6 - result = clamp(result, (FLOAT4)0, (FLOAT4)6); + result = clamp(result, (COMPUTE_FLOATN)0, (COMPUTE_FLOATN)6); #endif #elif VWN == 8 DEAL_BIAS(result.s0, eval.s0); @@ -855,10 +915,10 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, DEAL_BIAS(result.s6, eval.s6); DEAL_BIAS(result.s7, eval.s7); #ifdef RELU - result = fmax(result, (FLOAT8)0); + result = fmax(result, (COMPUTE_FLOATN)0); #endif #ifdef RELU6 - result = clamp(result, (FLOAT8)0, (FLOAT8)6); + result = clamp(result, (COMPUTE_FLOATN)0, (COMPUTE_FLOATN)6); #endif #elif VWN == 16 DEAL_BIAS(result.s0, eval.s0); @@ -878,15 +938,15 @@ INLINE_FUNC void StoreResultsN(__global realN* cgn, realN c_value, DEAL_BIAS(result.sE, eval.sE); DEAL_BIAS(result.sF, eval.sF); #ifdef RELU - result = fmax(result, (FLOAT16)0); + result = fmax(result, (COMPUTE_FLOATN)0); #endif #ifdef RELU6 - result = clamp(result, (FLOAT16)0, (FLOAT16)6); + result = clamp(result, (COMPUTE_FLOATN)0, (COMPUTE_FLOATN)6); #endif #endif #endif - cgn[index] = result; + cgn[index] = CONVERT_FLOATN(result); } @@ -896,7 +956,7 @@ 
INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #if BIAS_TYPE > 0 __global realN* restrict egm, #endif - __global realM* cgm, const real alpha, const real beta + __global realM* cgm, const real_arg alpha, const real_arg beta #if SA == 1 && SB == 1 , LOCAL_PTR realM* alm, LOCAL_PTR realN* blm #elif SA == 1 @@ -907,10 +967,10 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, ) { #ifdef OUTPUTMN #pragma promote_to_registers - realN cpn[MWI*(NWI/VWN)]; // MWI * NWI + COMPUTE_FLOATN cpn[MWI*(NWI/VWN)]; // MWI * NWI #else #pragma promote_to_registers - realM cpm[NWI*(MWI/VWM)]; // NWI * MWI + COMPUTE_FLOATM cpm[NWI*(MWI/VWM)]; // NWI * MWI #endif // Combined thread identifier (volatile to disable caching) @@ -941,9 +1001,9 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #if SA == 1 || SB == 1 // Allocates workitem-private memory (registers) #pragma promote_to_registers - realM apm[MWI/VWM]; // MWI * 1 + COMPUTE_FLOATM apm[MWI/VWM]; // MWI * 1 #pragma promote_to_registers - realN bpm[NWI/VWN]; // 1 * NWI + COMPUTE_FLOATN bpm[NWI/VWN]; // 1 * NWI for (int kwg = 0; kwg < kSizeK; kwg += KWG) { // Loads data: off-chip --> local (matrix A) @@ -970,10 +1030,10 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { // Loads data: local --> private (matrix A) #if SA == 1 - apm[_mi] = LocalToPrivateA(alm, _mi, kg); + apm[_mi] = CONVERT_COMPUTE_FLOATM(LocalToPrivateA(alm, _mi, kg)); // Loads data: off-chip --> private (matrix A) #elif SA == 0 - apm[_mi] = GlobalToPrivateA(agm, _mi, kSizeM, idk); + apm[_mi] = CONVERT_COMPUTE_FLOATM(GlobalToPrivateA(agm, _mi, kSizeM, idk)); #endif } @@ -983,10 +1043,10 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { // Loads data: local --> private (matrix B) #if SB == 1 - bpm[_ni] = LocalToPrivateB(blm, _ni, kg); + bpm[_ni] = CONVERT_COMPUTE_FLOATN(LocalToPrivateB(blm, _ni, kg)); // Loads data: off-chip --> private (matrix B) #else - bpm[_ni] = GlobalToPrivateB(bgm, _ni, kSizeN, idk); + bpm[_ni] = CONVERT_COMPUTE_FLOATN(GlobalToPrivateB(bgm, _ni, kSizeN, idk)); #endif } @@ -997,7 +1057,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { - const realM aval = apm[_mi]; + const COMPUTE_FLOATM aval = apm[_mi]; #if VWM == 1 // [MWI/VWM, VWM, NWI/VWN, VWN] cpn[(_mi*VWM + 0)*(NWI/VWN) + _ni] = MultiplyAddVectorN(cpn[(_mi*VWM + 0)*(NWI/VWN) + _ni], aval, bpm[_ni]); @@ -1043,7 +1103,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - const realM aval = apm[_mi]; + const COMPUTE_FLOATM aval = apm[_mi]; #if VWN == 1 cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bpm[_ni]); #elif VWN == 2 @@ -1098,7 +1158,7 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, for (int _kj = 0; _kj < kSizeK; _kj += 4) { #ifdef OUTPUTMN #pragma promote_to_registers - realN bpm[NWI/VWN]; // 1 * NWI + COMPUTE_FLOATN bpm[NWI/VWN]; // 1 * NWI #pragma unroll for(int _ki = 0; _ki < 4; _ki += 1) { @@ -1106,12 +1166,12 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, 
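+        // SA == 0 && SB == 0 path: the A and B tiles are streamed from global memory into private
+        // registers on every k-step and promoted via CONVERT_COMPUTE_FLOATM / CONVERT_COMPUTE_FLOATN,
+        // so the mad() accumulation runs in the (possibly wider) COMPUTE precision rather than the
+        // FLOAT storage type.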
#pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { // Loads data: off-chip --> private (matrix B) - bpm[_ni] = GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk); + bpm[_ni] = CONVERT_COMPUTE_FLOATN(GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk)); } #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - const realM aval = GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk); + const COMPUTE_FLOATM aval = CONVERT_COMPUTE_FLOATM(GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk)); #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { #if VWM == 1 @@ -1158,22 +1218,22 @@ INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #else #pragma promote_to_registers - realM apm[MWI/VWM]; // MWI * 1 + COMPUTE_FLOATM apm[MWI/VWM]; // MWI * 1 #pragma unroll for(int _ki = 0; _ki < 4; _ki += 1) { int idk = _kj + _ki; #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { // Loads data: off-chip --> private (matrix B) - apm[_mi] = GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk); + apm[_mi] = CONVERT_COMPUTE_FLOATM(GlobalToPrivateOptA(agm, baseIndexA, _mi, stride.s0/*kSizeM*/, idk)); } #pragma unroll for (int _ni = 0; _ni < NWI/VWN; _ni += 1) { - const realN bval = GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk); + const COMPUTE_FLOATN bval = CONVERT_COMPUTE_FLOATN(GlobalToPrivateOptB(bgm, baseIndexB, _ni, stride.s1/*kSizeN*/, idk)); #pragma unroll for (int _mi = 0; _mi < MWI/VWM; _mi += 1) { - const realM aval = apm[_mi]; + const COMPUTE_FLOATM aval = apm[_mi]; #if VWN == 1 cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi] = MultiplyAddVector(cpm[(_ni*VWN + 0)*(MWI/VWM) + _mi], aval, bval); #elif VWN == 2 @@ -1288,8 +1348,6 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, __private const int4 offset, __private const int4 stride ) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); // Adds the offsets (in case of use of a single temporary buffer for A, B, and C) agm = (const __global realM*)((const __global real*)agm + offset.s0); @@ -1313,25 +1371,25 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, #if BIAS_TYPE > 0 egm, #endif - cgm, alpha, beta, alm, blm); + cgm, arg_alpha, arg_beta, alm, blm); #elif SA == 1 XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm, #if BIAS_TYPE > 0 egm, #endif - cgm, alpha, beta, alm); + cgm, arg_alpha, arg_beta, alm); #elif SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm, #if BIAS_TYPE > 0 egm, #endif - cgm, alpha, beta, blm); + cgm, arg_alpha, arg_beta, blm); #else XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm, bgm, #if BIAS_TYPE > 0 egm, #endif - cgm, alpha, beta); + cgm, arg_alpha, arg_beta); #endif } @@ -1346,29 +1404,32 @@ void XgemmBatched(const int kSizeM, const real_arg arg_alpha, const real_arg arg_beta, const __global realM* restrict agm, - const int batch_offset_a, const __global realN* restrict bgm, - const int batch_offset_b, #if BIAS_TYPE > 0 __global realN* restrict egm, - const int batch_offset_e, #endif __global realM* cgm, - const int batch_offset_c) { + const int4 batch_offset, // [batch_offset_a, batch_offset_b, batch_offset_c, batch_offset_e] + const int4 stride, // [stride_a, stride_b, stride_c, stride_e] + /* + total_batch -> [loop_y, loop_x] + with group batch -> [loop_y, loop_x/group_num] + group_size == loop_x/group_num + */ + const int4 group // [group_num_a, group_num_b, group_num_e, loop_x] +) { const int batch = 
get_group_id(2); - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); // Sets the offsets - const int a_offset = batch * batch_offset_a; - const int b_offset = batch * batch_offset_b; - const int c_offset = batch * batch_offset_c; + const int a_offset = ((batch / group.w) * group.x + (batch % group.w) / group.x) * batch_offset.x; + const int b_offset = ((batch / group.w) * group.y + (batch % group.w) / group.y) * batch_offset.y; + const int c_offset = batch * batch_offset.z; const __global realM* restrict agm_ = &agm[a_offset / VWM]; const __global realN* restrict bgm_ = &bgm[b_offset / VWN]; __global realM* restrict cgm_ = &cgm[c_offset / VWM]; #if BIAS_TYPE > 0 - const int e_offset = batch * batch_offset_e; + const int e_offset = ((batch / group.w) * group.z + (batch % group.w) / group.z) * batch_offset.w; __global realN* restrict egm_ = &egm[e_offset / VWN]; #endif @@ -1379,40 +1440,32 @@ void XgemmBatched(const int kSizeM, #if SB == 1 __local realN blm[KWG * NWG/VWN]; #endif - int4 stride; - stride.s0 = kSizeM; - stride.s1 = kSizeN; - #ifdef OUTPUTMN - stride.s2 = kSizeN; - #else - stride.s2 = kSizeM; - #endif - stride.s3 = kSizeN; + // Computes the matrix-multiplication and stores the result in global memory #if SA == 1 && SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, #if BIAS_TYPE > 0 egm_, #endif - cgm_, alpha, beta, alm, blm); + cgm_, arg_alpha, arg_beta, alm, blm); #elif SA == 1 XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, #if BIAS_TYPE > 0 egm_, #endif - cgm_, alpha, beta, alm); + cgm_, arg_alpha, arg_beta, alm); #elif SB == 1 XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, #if BIAS_TYPE > 0 egm_, #endif - cgm_, alpha, beta, blm); + cgm_, arg_alpha, arg_beta, blm); #else XgemmBody(kSizeM, kSizeN, kSizeK, stride, agm_, bgm_, #if BIAS_TYPE > 0 egm_, #endif - cgm_, alpha, beta); + cgm_, arg_alpha, arg_beta); #endif } diff --git a/source/backend/opencl/execution/cl/opencl_codegen.py b/source/backend/opencl/execution/cl/opencl_codegen.py index 56d497aee..b18dd59dc 100644 --- a/source/backend/opencl/execution/cl/opencl_codegen.py +++ b/source/backend/opencl/execution/cl/opencl_codegen.py @@ -41,7 +41,7 @@ def opencl_codegen(): for file_name_all in os.listdir(cl_kernel_dir): file_path = os.path.join(cl_kernel_dir, file_name_all) if file_path[-3:] == ".cl": - with open(file_path, "r") as f: + with open(file_path, "r", encoding = 'utf-8') as f: file_name = file_name_all[:-3] if file_name[-4:] == "_buf": opencl_source_map += "#ifndef MNN_OPENCL_BUFFER_CLOSED\n" diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc index 7eebdd550..a4d2cb4f4 100644 --- a/source/backend/opencl/execution/cl/opencl_program.cc +++ b/source/backend/opencl/execution/cl/opencl_program.cc @@ -1346,7 +1346,7 @@ const char* deconv_2d = " #ifdef BIAS\n" " __global FLOAT* bias,\n" " #endif\n" -" __global FLOAT* output,\n" +" __global FLOAT* output,__private const int batch,\n" " #else\n" " __read_only image2d_t input,\n" " __read_only image2d_t weights,\n" @@ -1406,7 +1406,7 @@ const char* deconv_2d = " weights2=vload4(kernel_x_2*(out_channel_blocks*kernel_shape.x*kernel_shape.y)+kernel_y,weights);\n" " weights3=vload4(kernel_x_3*(out_channel_blocks*kernel_shape.x*kernel_shape.y)+kernel_y,weights);\n" " bool outBoundry=(idx_h<0 || idx_h >= input_shape.x || kernel_start_x<0 || in_width0 >= input_shape.y);\n" -" int 
inp_offset=(((out_b_idx*in_channel_blocks+ic)*input_shape.x+idx_h)*input_shape.y+in_width0)*4;\n" +" int inp_offset=(((out_b_idx+ic*batch)*input_shape.x+idx_h)*input_shape.y+in_width0)*4;\n" " in0=outBoundry ? (FLOAT4)0 : vload4(0,input+inp_offset);\n" " out0=mad(in0.x,weights0,out0);\n" " out0=mad(in0.y,weights1,out0);\n" @@ -1443,7 +1443,7 @@ const char* deconv_2d = " out0=clamp(out0,(FLOAT4)0,(FLOAT4)6);\n" "#endif\n" "#ifdef USE_BUFFER\n" -" const int out_offset=(((out_b_idx*out_channel_blocks+out_channel_blocks_idx)*output_shape.x+out_h_idx)*output_shape.y+out_w_idx)*4;\n" +" const int out_offset=(((out_b_idx+out_channel_blocks_idx*batch)*output_shape.x+out_h_idx)*output_shape.y+out_w_idx)*4;\n" " vstore4(out0,0,output+out_offset);\n" "#else\n" " int out_image_width_idx=mad24(out_channel_blocks_idx,output_shape.y,out_w_idx);\n" @@ -1542,7 +1542,7 @@ const char* grid_sample_buf = " __private const int input_width,\n" " __private const int output_height,\n" " __private const int output_width,\n" -" __private const int channelBlocks,\n" +" __private const int batch,\n" " __private const enum BorderMode paddingMode,\n" " __private const int alignCorners){\n" " \n" @@ -1567,27 +1567,21 @@ const char* grid_sample_buf = " (xn,xn,xn,xn) (y5,y6,y7,y8)\n" " ---------------------------\n" " */\n" -" const int slice=output_height_idx/4;\n" -" const int slice_blocks=(output_height+3)/4;\n" " // output_width_block_idx means gird y offset,2 means grid width\n" -" const int grid_offset=((output_batch_idx*slice_blocks+slice)*output_width+output_width_block_idx)*2;\n" -" COMPUTE_FLOAT4 grid_x=CONVERT_COMPUTE_FLOAT4(vload4(grid_offset,grid));\n" -" COMPUTE_FLOAT4 grid_y=CONVERT_COMPUTE_FLOAT4(vload4(grid_offset+1,grid));\n" -" const float arr[8]={grid_x.x,grid_y.x,grid_x.y,grid_y.y,grid_x.z,grid_y.z,grid_x.w,grid_y.w};\n" -" \n" +" const int grid_offset=(output_batch_idx*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" COMPUTE_FLOAT2 grid_xy=CONVERT_COMPUTE_FLOAT2(vload2(grid_offset,grid));\n" " // get grid x,y\n" -" const int arr_offset=output_height_idx % 4;\n" -" const float x=arr[2*arr_offset];\n" -" const float y=arr[2*arr_offset+1];\n" +" const float x=(float)grid_xy.x;\n" +" const float y=(float)grid_xy.y;\n" " // convert grid x,y to input x,y coordinate range\n" " float in_grid_x=getPosition(x,input_width,alignCorners);\n" " float in_grid_y=getPosition(y,input_height,alignCorners);\n" " // get nearest point\n" " int nw=floor(in_grid_x+0.5f);\n" " int nh=floor(in_grid_y+0.5f);\n" -" const int inp_offset_base=(output_batch_idx*channelBlocks+output_channel_block_idx)*input_height;\n" +" const int inp_offset_base=(output_batch_idx+output_channel_block_idx*batch)*input_height;\n" " COMPUTE_FLOAT4 value=sample(nh,nw,inp_offset_base,input,input_height,input_width,paddingMode);\n" -" const int output_offset=((output_batch_idx*channelBlocks+output_channel_block_idx )*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" const int output_offset=((output_batch_idx+output_channel_block_idx*batch)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" " vstore4(CONVERT_FLOAT4(value),output_offset,output);\n" "}\n" "__kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input,\n" @@ -1597,7 +1591,7 @@ const char* grid_sample_buf = " __private const int input_width,\n" " __private const int output_height,\n" " __private const int output_width,\n" -" __private const int channelBlocks,\n" +" __private const int batch,\n" " __private 
const enum BorderMode paddingMode,\n" " __private const int alignCorners){\n" " const int output_channel_block_idx=get_global_id(0);\n" @@ -1606,18 +1600,13 @@ const char* grid_sample_buf = " DEAL_NON_UNIFORM_DIM3(output_channel_block_idx,output_width_block_idx,output_batch_height_block_idx);\n" " const int output_batch_idx=output_batch_height_block_idx/output_height;\n" " const int output_height_idx=output_batch_height_block_idx % output_height;\n" -" const int slice=output_height_idx/4;\n" -" const int slice_blocks=(output_height+3)/4;\n" " // output_width_block_idx means gird y offset,2 means grid width\n" -" const int grid_offset=((output_batch_idx*slice_blocks+slice)*output_width+output_width_block_idx)*2;\n" -" COMPUTE_FLOAT4 grid_x=CONVERT_COMPUTE_FLOAT4(vload4(grid_offset,grid));\n" -" COMPUTE_FLOAT4 grid_y=CONVERT_COMPUTE_FLOAT4(vload4(grid_offset+1,grid));\n" -" const float arr[8]={grid_x.x,grid_y.x,grid_x.y,grid_y.y,grid_x.z,grid_y.z,grid_x.w,grid_y.w};\n" +" const int grid_offset=(output_batch_idx*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" COMPUTE_FLOAT2 grid_xy=CONVERT_COMPUTE_FLOAT2(vload2(grid_offset,grid));\n" " \n" " // get grid x,y\n" -" const int arr_offset=output_height_idx % 4;\n" -" const float x=arr[2*arr_offset];\n" -" const float y=arr[2*arr_offset+1];\n" +" const float x=(float)grid_xy.x;\n" +" const float y=(float)grid_xy.y;\n" " // convert grid x,y to input x,y coordinate range\n" " float in_grid_x=getPosition(x,input_width,alignCorners);\n" " float in_grid_y=getPosition(y,input_height,alignCorners);\n" @@ -1628,7 +1617,7 @@ const char* grid_sample_buf = " float x_weight=in_w1-in_grid_x;\n" " float y_weight=in_h1-in_grid_y;\n" " // bilinear interpolation\n" -" const int inp_offset_base=(output_batch_idx*channelBlocks+output_channel_block_idx)*input_height;\n" +" const int inp_offset_base=(output_batch_idx+output_channel_block_idx*batch)*input_height;\n" " COMPUTE_FLOAT4 i00=sample(in_h0,in_w0,inp_offset_base,input,input_height,input_width,paddingMode);\n" " COMPUTE_FLOAT4 i01=sample(in_h0,in_w1,inp_offset_base,input,input_height,input_width,paddingMode);\n" " COMPUTE_FLOAT4 i10=sample(in_h1,in_w0,inp_offset_base,input,input_height,input_width,paddingMode);\n" @@ -1636,7 +1625,7 @@ const char* grid_sample_buf = " COMPUTE_FLOAT4 value=CONVERT_COMPUTE_FLOAT4(((COMPUTE_FLOAT4)x_weight*CONVERT_COMPUTE_FLOAT4(i00)+(COMPUTE_FLOAT4)(1.0f-x_weight)*CONVERT_COMPUTE_FLOAT4(i01))*(COMPUTE_FLOAT4)y_weight +\n" " ((COMPUTE_FLOAT4)x_weight*CONVERT_COMPUTE_FLOAT4(i10)+(COMPUTE_FLOAT4)(1.0f-x_weight)*CONVERT_COMPUTE_FLOAT4(i11))*(COMPUTE_FLOAT4)(1.0f- y_weight));\n" " \n" -" const int output_offset=((output_batch_idx*channelBlocks+output_channel_block_idx )*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" const int output_offset=((output_batch_idx+output_channel_block_idx*batch)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" " vstore4(CONVERT_FLOAT4(value),output_offset,output);\n" "}\n" ; @@ -1730,34 +1719,35 @@ const char* range_buf = "#ifdef MNN_SUPPORT_FP16\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" -"#define GLOBAL_SIZE_3_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n" -"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" -"__kernel void range_buf(GLOBAL_SIZE_3_DIMS\n" +"#define 
GLOBAL_SIZE_2_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,\n" +"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" +"__kernel void range_buf(GLOBAL_SIZE_2_DIMS\n" " __global const INPUT_TYPE* input0,\n" " __global const INPUT_TYPE* input2,\n" " __global OUTPUT_TYPE* output,\n" -" __private const int width,\n" -" __private const int height,\n" -" __private const int channel,\n" -" __private const int channelBlock\n" +" __private const int size\n" " ) {\n" -" const int width_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int batch_channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_channel_idx);\n" -" \n" -" const int batch_idx=batch_channel_idx/channelBlock;\n" -" const int channel_idx=batch_channel_idx % channelBlock;\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1);\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" " \n" -" const int offset=((((batch_idx*channelBlock)+channel_idx)*height+height_idx)*width+width_idx)*4;\n" -" const int channel4=channel_idx << 2;\n" -" int index=(((batch_idx*channel)+channel4)*height+height_idx)*width+width_idx;\n" -" int size=height*width;\n" -" int4 index4=(int4)(index,index+size,index+size*2,index+size*3);\n" +" int index=x << 2;\n" +" int4 index4=(int4)(index,index+1,index+2,index+3);\n" " INPUT_TYPE start=input0[0];\n" " INPUT_TYPE step=input2[0];\n" " OUTPUT_TYPE4 value=(OUTPUT_TYPE4)start+CONVERT_OUTPUT4(index4)*(OUTPUT_TYPE4)step;\n" -" vstore4(value,0,output+offset);\n" +"#ifdef PACK_LEAVE\n" +" if(index+3 >= size){\n" +" OUTPUT_TYPE* value_ptr=(OUTPUT_TYPE*)&value;\n" +" for(int i=0; iinside_len){\n" +" for(int i=lid+inside_len; i [N Y X]\n" -"__kernel void trans_3d_buf(__global const FLOAT* input,\n" +"__kernel void trans_3d_buf(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT* input,\n" " __global FLOAT* output,\n" " __private const int batch,\n" " __private const int width,\n" " __private const int height\n" ") {\n" " int b=get_global_id(2);\n" -" \n" -" const int w=get_global_id(0) << 3;\n" -" const int h=get_global_id(1) << 3;\n" +" int w=get_global_id(0);\n" +" int h=get_global_id(1);\n" +" DEAL_NON_UNIFORM_DIM3(w,h,b);\n" +" w=w << 3;\n" +" h=h << 3;\n" " \n" " const int inp_offset=(b*width+w)*height+h;\n" " const int out_offset=(b*height+h)*width+w;\n" @@ -2005,6 +2007,7 @@ const char* self_attention_buf = " __private const int seq_len_piece,\n" " __private const int head_num,\n" " __private const int head_dim,\n" +" __private const int batch,\n" " __private const int seq_index\n" ") {\n" " \n" @@ -2026,8 +2029,7 @@ const char* self_attention_buf = " \n" " const int offset_inp=((b*head_num+hn)*head_dim_pack+4*hd)*seq_len_pack+4*sl;\n" " \n" -" const int offset_out=(((b*seq_len_4+seq_index*seq_len_piece/4+sl)*head_num+hn)*head_dim+4*hd)*4;\n" -" \n" +" const int offset_out=((((seq_index*seq_len_piece/4+sl)*batch+b)*head_num+hn)*head_dim+4*hd)*4;\n" " // Q\n" " FLOAT4 temp_0=vload4(0,input+offset_inp);\n" " FLOAT4 temp_1=vload4(0,input+offset_inp+seq_len_pack);\n" @@ -2308,63 +2310,75 @@ const char* gemv_conv1x1_buf = "#define UCHAR8_TO_CHAR16(a, c) "" a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; "" a.s8=(c.s4 >> 4)-8; a.s9=(c.s4 & 15)-8; a.sa=(c.s5 >> 4)-8; a.sb=(c.s5 & 15)-8; a.sc=(c.s6 >> 4)-8; a.sd=(c.s6 & 
15)-8; a.se=(c.s7 >> 4)-8; a.sf=(c.s7 & 15)-8;\n" "#define DOT16X16(a, b, c) "" c += dot(a.s0123, b.s0123); "" c += dot(a.s4567, b.s4567); "" c += dot(a.s89ab, b.s89ab); "" c += dot(a.scdef,b.scdef);\n" "#ifdef INPUT_CHANNEL_LEAVE\n" -" #define PADZEROS(k, channel, data) {"" COMPUTE_FLOAT* ptr = (COMPUTE_FLOAT*)&data; "" int remain = k + 15 - channel; "" for(int r = remain; r >= 0; r--){ "" ptr[15 - remain] = 0; "" } "" }\n" +" #define PADZEROS(k, channel, data) {"" COMPUTE_FLOAT* ptr = (COMPUTE_FLOAT*)&data; "" int remain = k + 15 - channel; "" for(int r = remain; r >= 0; r--){ "" ptr[15 - r] = 0; "" } "" }\n" "#else\n" " #define PADZEROS(k,channel,data)\n" "#endif\n" +"#if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE)\n" +"#define CHANNEL_PACK 32\n" +"#else\n" +"#define CHANNEL_PACK 16\n" +"#endif\n" +"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" +"#define WEIGHT_STRIDE 16\n" +"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" +"#define WEIGHT_STRIDE 8\n" +"#endif\n" "__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" -"__kernel void gemm_conv_c4_buf(GLOBAL_SIZE_DIM2\n" +"#ifdef USE_IMAGE\n" +"inline COMPUTE_FLOAT16 readWeight(__read_only image2d_t weight,int ix,int iy,COMPUTE_FLOAT scale,COMPUTE_FLOAT offset){\n" +" return CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(ix,iy))))*scale+offset;\n" +"}\n" +"#else\n" +"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" +"inline COMPUTE_FLOAT16 readWeight(__global const char *weight,int ix,int iy,COMPUTE_FLOAT scale,COMPUTE_FLOAT offset){\n" +" return CONVERT_COMPUTE_FLOAT16(vload16(0,weight))*scale+offset;\n" +"}\n" +"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" +"inline COMPUTE_FLOAT16 readWeight(__global const uchar *weight,int ix,int iy,COMPUTE_FLOAT scale,COMPUTE_FLOAT offset){\n" +" uchar16 charWeightsInt40=vload16(0,weight);\n" +" uchar8 charWeightsInt4=vload8(0,weight);\n" +" char16 charWeights=0;\n" +" UCHAR8_TO_CHAR16(charWeights,charWeightsInt4);\n" +" return CONVERT_COMPUTE_FLOAT16(charWeights)*scale+offset;\n" +"}\n" +"#endif\n" +"#endif\n" +"__kernel void gemv_conv_c4_buf(GLOBAL_SIZE_DIM2\n" " __global const FLOAT* input,\n" +"#ifdef USE_IMAGE\n" +" __read_only image2d_t weight,\n" +"#else\n" "#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" " __global const char *weight,\n" "#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" " __global const uchar *weight,\n" "#endif\n" +"#endif\n" " __global const float *dequantScaleOffset,\n" " __global const FLOAT *bias,\n" " __global FLOAT* output,\n" " __private const int dstChannelC4,\n" " __private const int srcChannelC4,\n" " __private const int srcChannel,\n" -" __private const int batch,\n" -" __private const int height,\n" -" __private const int width,\n" +" __private const int bhw,\n" " __private const int blockNum,\n" " __private const int blockDim) {\n" -" const int out_c_w_idx=get_global_id(0); //c/4 w\n" -" const int out_b_h_idx=get_global_id(1); //b h\n" -" UNIFORM_BOUNDRY_CHECK(out_c_w_idx,out_b_h_idx);\n" -" const int out_c_idx=out_c_w_idx/width;\n" -" const int out_w_idx=out_c_w_idx % width;\n" -"#ifdef BACTH_BLOCK4\n" -" const int out_b_idx=(out_b_h_idx/height) << 2;\n" -"#else\n" -" const int out_b_idx=out_b_h_idx/height;\n" -"#endif\n" -" const int out_h_idx=out_b_h_idx % height;\n" -" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n" -" COMPUTE_FLOAT4 out=bias0;\n" -"#ifdef BACTH_BLOCK4\n" -" COMPUTE_FLOAT4 out1=bias0,out2=bias0,out3=bias0;\n" -" int 
input_offset1=(((out_b_idx+1)*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" -" int input_offset2=(((out_b_idx+2)*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" -" int input_offset3=(((out_b_idx+3)*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" -" bool isValidBatch1=out_b_idx+1= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" +"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" +"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n" +"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" +"__kernel void buffer_set_zero(\n" +" GLOBAL_SIZE_2_DIMS\n" +" __global OUTPUT_TYPE *output\n" +" ) {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1);\n" " \n" -" vstore2(CONVERT_FLOAT2(out1),0,output+out_offset);\n" -" }\n" -" if(isValidBatch2){\n" -" out_offset += dstChannelC4*height*width*4;\n" -"#ifdef RELU\n" -" out2=fmax(out2,(COMPUTE_FLOAT2)0);\n" -"#endif\n" -"#ifdef RELU6\n" -" out2=clamp(out2,(COMPUTE_FLOAT2)0,(COMPUTE_FLOAT2)6);\n" -"#endif\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" " \n" -" vstore2(CONVERT_FLOAT2(out2),0,output+out_offset);\n" -" }\n" -" if(isValidBatch3){\n" -" out_offset += dstChannelC4*height*width*4;\n" -"#ifdef RELU\n" -" out3=fmax(out3,(COMPUTE_FLOAT2)0);\n" -"#endif\n" -"#ifdef RELU6\n" -" out3=clamp(out3,(COMPUTE_FLOAT2)0,(COMPUTE_FLOAT2)6);\n" -"#endif\n" +" output[y*global_size_dim0+x]=(OUTPUT_TYPE)(0);\n" +"}\n" +"__kernel void image_set_zero(\n" +" GLOBAL_SIZE_2_DIMS\n" +" __write_only image2d_t output\n" +" ) {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1);\n" " \n" -" vstore2(CONVERT_FLOAT2(out3),0,output+out_offset);\n" -" }\n" -"#endif\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" +" WI_DATA(output,(int2)(x,y),(OUTPUT_TYPE_I4)(0));\n" "}\n" -"__kernel void gemm_conv_c1_image(GLOBAL_SIZE_DIM2\n" -" __global const FLOAT* input,\n" -" __read_only image2d_t weight,\n" -" __global const float *dequantScaleOffset,\n" -" __global const FLOAT *bias,\n" -" __global FLOAT* output,\n" -" __private const int dstChannelC4,\n" -" __private const int srcChannelC4,\n" -" __private const int srcChannel,\n" -" __private const int batch,\n" -" __private const int height,\n" -" __private const int width,\n" -" __private const int blockNum,\n" -" __private const int blockDim) {\n" -" const int out_c_w_idx=get_global_id(0); //c/4 w\n" -" const int out_b_h_idx=get_global_id(1); //b h\n" -" UNIFORM_BOUNDRY_CHECK(out_c_w_idx,out_b_h_idx);\n" -" const int out_c_idx=out_c_w_idx/width;\n" -" const int out_w_idx=out_c_w_idx % width;\n" -"#ifdef BACTH_BLOCK4\n" -" const int out_b_idx=(out_b_h_idx/height) << 2;\n" -"#else\n" -" const int out_b_idx=out_b_h_idx/height;\n" -"#endif\n" -" const int out_h_idx=out_b_h_idx % height;\n" +"__kernel void raster_buffer_direct(\n" +" GLOBAL_SIZE_3_DIMS\n" +" __read_only image2d_t input,\n" +" __private const int inputOffset,\n" +" __private const int combineSrcOffset,\n" +" __private const int inputStride0,\n" +" __private const int inputStride1,\n" +" __private const int inputStride2,\n" +" __private const int src_width,\n" +" __private const int src_height,\n" +" __private const int src_channel,\n" +" __global OUTPUT_TYPE *output,\n" +" __private const int outputOffset,\n" +" __private const int combineDstOffset,\n" +" 
__private const int outputStride0,\n" +" __private const int outputStride1,\n" +" __private const int outputStride2,\n" +" __private const int global_size0\n" +" ) {\n" +" const int idx=get_global_id(0);\n" +" const int y=get_global_id(1);\n" +" const int z=get_global_id(2);\n" " \n" -" COMPUTE_FLOAT bias0=bias[out_c_idx];\n" -" COMPUTE_FLOAT out=bias0;\n" -" \n" -" int input_offset=((out_b_idx*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" -" int out_offset=(((out_b_idx*dstChannelC4+out_c_idx/4)* height+out_h_idx)*width+out_w_idx)*4+(out_c_idx%4);\n" -" int wh=width*height*4;\n" -"#ifdef BACTH_BLOCK4\n" -" COMPUTE_FLOAT out1=bias0,out2=bias0,out3=bias0;\n" -" int input_offset1=(((out_b_idx+1)*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" -" int input_offset2=(((out_b_idx+2)*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" -" int input_offset3=(((out_b_idx+3)*srcChannelC4*height+out_h_idx)*width+out_w_idx)*4;\n" -" bool isValidBatch1=out_b_idx+1= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" -"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" -"#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n" -"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" -"__kernel void buffer_set_zero(\n" -" GLOBAL_SIZE_2_DIMS\n" -" __global OUTPUT_TYPE *output\n" -" ) {\n" -" const int x=get_global_id(0);\n" -" const int y=get_global_id(1);\n" -" \n" -" DEAL_NON_UNIFORM_DIM2(x,y);\n" -" \n" -" output[y*global_size_dim0+x]=(OUTPUT_TYPE)(0);\n" -"}\n" -"__kernel void image_set_zero(\n" -" GLOBAL_SIZE_2_DIMS\n" -" __write_only image2d_t output\n" -" ) {\n" -" const int x=get_global_id(0);\n" -" const int y=get_global_id(1);\n" -" \n" -" DEAL_NON_UNIFORM_DIM2(x,y);\n" -" WI_DATA(output,(int2)(x,y),(OUTPUT_TYPE_I4)(0));\n" -"}\n" -"__kernel void raster_buffer_direct(\n" -" GLOBAL_SIZE_3_DIMS\n" -" __read_only image2d_t input,\n" -" __private const int inputOffset,\n" -" __private const int combineSrcOffset,\n" -" __private const int inputStride0,\n" -" __private const int inputStride1,\n" -" __private const int inputStride2,\n" -" __private const int src_width,\n" -" __private const int src_height,\n" -" __private const int src_channel,\n" -" __global OUTPUT_TYPE *output,\n" -" __private const int outputOffset,\n" -" __private const int combineDstOffset,\n" -" __private const int outputStride0,\n" -" __private const int outputStride1,\n" -" __private const int outputStride2,\n" -" __private const int global_size0\n" -" ) {\n" -" const int idx=get_global_id(0);\n" -" const int y=get_global_id(1);\n" -" const int z=get_global_id(2);\n" -" \n" -" DEAL_NON_UNIFORM_DIM3(idx,y,z);\n" -" const int x=idx % global_size0;\n" -" const int id=idx/global_size0;\n" +" DEAL_NON_UNIFORM_DIM3(idx,y,z);\n" +" const int x=idx % global_size0;\n" +" const int id=idx/global_size0;\n" " \n" " int inputIndex=inputOffset+id*combineSrcOffset+z*inputStride0+y*inputStride1+x*inputStride2;\n" " int outputIndex=outputOffset+id*combineDstOffset+z*outputStride0+y*outputStride1+x*outputStride2;\n" @@ -3673,6 +2983,7 @@ const char* conv_2d_c1_subgroup_buf = " __private const int output_width,\n" " __private const int output_height,\n" " __private const int output_channel,\n" +" __private const int batch,\n" " __private const int x_blocks,\n" " 
__private const int input_pad_left,\n" " __private const int input_pad_right,\n" @@ -3699,11 +3010,11 @@ const char* conv_2d_c1_subgroup_buf = " const uint output_x_pitch=4;\n" " const uint output_y_pitch=output_x_pitch*output_width;\n" " const uint output_fs_pitch=output_y_pitch*output_height;\n" -" const uint output_b_pitch=output_fs_pitch*output_pack;\n" +" const uint output_b_pitch=output_fs_pitch*batch;\n" " \n" " \n" -" const uint output_offset=b*output_b_pitch +\n" -" f_block*4*output_fs_pitch +\n" +" const uint output_offset=b*output_fs_pitch +\n" +" f_block*4*output_b_pitch +\n" " y*output_y_pitch +\n" " x*output_x_pitch;\n" " const uint filter_isv_pitch=16;\n" @@ -3771,13 +3082,13 @@ const char* conv_2d_c1_subgroup_buf = " if ((f_block+1)*16 >= output_channel) {\n" " for (int i=0; i<2 && (x+i)= out_hw.y) return;\n" @@ -5086,6 +4404,7 @@ const char* conv_2d_int_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -5127,7 +4446,7 @@ const char* conv_2d_int_buf = " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n" " for(int iy=in_h_idx_start; iy= 4) {\n" @@ -5245,6 +4564,7 @@ const char* conv_2d_int_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -5286,7 +4606,7 @@ const char* conv_2d_int_buf = " COMPUTE_FLOAT4 offset=(COMPUTE_FLOAT4)(ScaleOffset.s1,ScaleOffset.s3,ScaleOffset.s5,ScaleOffset.s7);\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" -" const int inp_offset_base=(out_b_idx*in_c_blocks+in_c_idx)*in_hw.x*in_hw.y*4;\n" +" const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n" " for(int iy=0; iy= 4){\n" @@ -5414,6 +4734,7 @@ const char* conv_2d_int_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -5464,7 +4785,7 @@ const char* conv_2d_int_buf = " COMPUTE_FLOAT4 offset1=(COMPUTE_FLOAT4)(ScaleOffset1.s1,ScaleOffset1.s3,ScaleOffset1.s5,ScaleOffset1.s7);\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" -" const int inp_offset_base=(out_b_idx*in_c_blocks+in_c_idx)*in_hw.x*in_hw.y*4;\n" +" const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n" " for(int iy=0; iy= 4){\n" @@ -5642,7 +4963,7 @@ const char* conv_2d_int_buf = " return;\n" " }\n" "#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 4){\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" @@ -5668,7 +4989,7 @@ const char* conv_2d_int_buf = " return;\n" " }\n" "#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" 
out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n" @@ -5689,6 +5010,7 @@ const char* conv_2d_int_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -5733,7 +5055,7 @@ const char* conv_2d_int_buf = " COMPUTE_FLOAT4 offset1=(COMPUTE_FLOAT4)(ScaleOffset1.s1,ScaleOffset1.s3,ScaleOffset1.s5,ScaleOffset1.s7);\n" " //weights NC4HW4 [1,4*icC4,ocC4*kh*kw,1] xic4\n" " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" -" const int inp_offset_base=(out_b_idx*in_c_blocks+in_c_idx)*in_hw.x*in_hw.y*4;\n" +" const int inp_offset_base=(out_b_idx+in_c_idx*batch)*in_hw.x*in_hw.y*4;\n" " for(int iy=0; iy= 2){\n" @@ -5871,7 +5193,7 @@ const char* conv_2d_int_buf = " return;\n" " }\n" "#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 2){\n" " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n" @@ -5886,7 +5208,7 @@ const char* conv_2d_int_buf = " return;\n" " }\n" "#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n" "#endif\n" @@ -5905,6 +5227,7 @@ const char* conv_2d_int_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -5956,7 +5279,7 @@ const char* conv_2d_int_buf = " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n" " for(int iy=in_h_idx_start; iy= 4){\n" @@ -6127,7 +5450,7 @@ const char* conv_2d_int_buf = "#ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks)return;\n" "#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 4){\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n" " }else if(remain == 3){\n" @@ -6143,7 +5466,7 @@ const char* conv_2d_int_buf = "#ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks)return;\n" "#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n" "#endif\n" "}\n" @@ -6166,7 +5489,7 @@ const char* interp_buf = " __private const int input_width,\n" " __private const int out_height,\n" " __private const int out_width,\n" -" __private const int channelBlocks) {\n" +" __private 
const int batch) {\n" " const int output_channel_block_idx=get_global_id(0);\n" " const int output_width_block_idx=get_global_id(1);\n" " const int output_batch_height_block_idx=get_global_id(2);\n" @@ -6182,9 +5505,9 @@ const char* interp_buf = " const int in_h_index=min(max(0,(int)floor(in_h_idx)),input_height-1);\n" " const int in_w_index=min(max(0,(int)floor(in_w_idx)),input_width-1);\n" "#endif\n" -" const int inp_offset=((output_batch_idx*channelBlocks+output_channel_block_idx)*input_height+in_h_index)*input_width+in_w_index;\n" +" const int inp_offset=((output_batch_idx+output_channel_block_idx*batch)*input_height+in_h_index)*input_width+in_w_index;\n" " FLOAT4 value=vload4(inp_offset,input);\n" -" const int out_offset=((output_batch_idx*channelBlocks+output_channel_block_idx)*out_height+output_height_idx)*out_width+output_width_block_idx;\n" +" const int out_offset=((output_batch_idx+output_channel_block_idx*batch)*out_height+output_height_idx)*out_width+output_width_block_idx;\n" " vstore4(value,out_offset,output);\n" "}\n" "__kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input,\n" @@ -6197,7 +5520,7 @@ const char* interp_buf = " __private const int input_width,\n" " __private const int out_height,\n" " __private const int out_width,\n" -" __private const int channelBlocks) {\n" +" __private const int batch) {\n" " const int output_channel_block_idx=get_global_id(0);\n" " const int output_width_block_idx=get_global_id(1);\n" " const int output_batch_height_block_idx=get_global_id(2);\n" @@ -6215,7 +5538,7 @@ const char* interp_buf = " float factor_w=(in_w_idx-(int)floor(in_w_idx));\n" " float factor_h=(in_h_idx-(int)floor(in_h_idx));\n" " \n" -" const int inp_offset_base=(output_batch_idx*channelBlocks+output_channel_block_idx)*input_height;\n" +" const int inp_offset_base=(output_batch_idx+output_channel_block_idx*batch)*input_height;\n" " const int inp_offset_00=(inp_offset_base+in_h0_index)*input_width+in_w0_index;\n" " const int inp_offset_01=(inp_offset_base+in_h0_index)*input_width+in_w1_index;\n" " const int inp_offset_10=(inp_offset_base+in_h1_index)*input_width+in_w0_index;\n" @@ -6226,7 +5549,7 @@ const char* interp_buf = " FLOAT4 value_11=vload4(inp_offset_11,input);\n" " FLOAT4 value=CONVERT_FLOAT4((float4)((1.0-factor_w)*(1.0-factor_h))*convert_float4(value_00)+(float4)(factor_w*(1.0-factor_h))*convert_float4(value_01)+(float4)((1.0-factor_w)*factor_h)*convert_float4(value_10)+(float4)(factor_w*factor_h)*convert_float4(value_11));\n" " \n" -" const int out_offset=((output_batch_idx*channelBlocks+output_channel_block_idx)*out_height+output_height_idx)*out_width+output_width_block_idx;\n" +" const int out_offset=((output_batch_idx+output_channel_block_idx*batch)*out_height+output_height_idx)*out_width+output_width_block_idx;\n" " \n" " vstore4(value,out_offset,output);\n" "}\n" @@ -6244,7 +5567,7 @@ const char* interp_buf = " __private const int out_depth,\n" " __private const int out_height,\n" " __private const int out_width,\n" -" __private const int channelBlocks) {\n" +" __private const int batch) {\n" " const int output_channel_block_idx=get_global_id(0);\n" " const int output_height_width_block_idx=get_global_id(1);\n" " const int output_batch_depth_block_idx=get_global_id(2);\n" @@ -6259,9 +5582,9 @@ const char* interp_buf = " const int in_d_index=min(max(0,(int)floor(in_d_idx)),input_depth-1);\n" " const int in_h_index=min(max(0,(int)floor(in_h_idx)),input_height-1);\n" " const int 
in_w_index=min(max(0,(int)floor(in_w_idx)),input_width-1);\n" -" const int inp_offset=(((output_batch_idx*channelBlocks+output_channel_block_idx)\n" +" const int inp_offset=(((output_batch_idx+output_channel_block_idx*batch)\n" "*input_depth+in_d_index)*input_height+in_h_index)*input_width+in_w_index;\n" -" const int out_offset=(((output_batch_idx*channelBlocks+output_channel_block_idx)\n" +" const int out_offset=(((output_batch_idx+output_channel_block_idx*batch)\n" "*out_depth+output_depth_idx)*out_height+output_height_idx)*out_width+output_width_idx;\n" " FLOAT4 value=vload4(inp_offset,input);\n" " vstore4(value,out_offset,output);\n" @@ -6554,862 +5877,97 @@ const char* softmax = " \n" " /*Compute Result */\n" " for (int i=0; i= global_size_dim0 || index1 >= global_size_dim1) { "" return; "" }\n" -"#define GLOBAL_SIZE_DIM3 "" __private int global_size_dim0,__private int global_size_dim1,__private int global_size_dim2,\n" -"#define UNIFORM_BOUNDRY_CHECK3(index0, index1, index2) "" if(index0 >= global_size_dim0 || index1 >= global_size_dim1 || index2 >= global_size_dim2) { "" return; "" }\n" -"#define UCHAR16_TO_2CHAR16(a, b, c) "" a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; "" a.s8 = (c.s4 >> 4) - 8; a.s9 = (c.s4 & 15) - 8; a.sa = (c.s5 >> 4) - 8; a.sb = (c.s5 & 15) - 8; a.sc = (c.s6 >> 4) - 8; a.sd = (c.s6 & 15) - 8; a.se = (c.s7 >> 4) - 8; a.sf = (c.s7 & 15) - 8; "" b.s0 = (c.s8 >> 4) - 8; b.s1 = (c.s8 & 15) - 8; b.s2 = (c.s9 >> 4) - 8; b.s3 = (c.s9 & 15) - 8; b.s4 = (c.sa >> 4) - 8; b.s5 = (c.sa & 15) - 8; b.s6 = (c.sb >> 4) - 8; b.s7 = (c.sb & 15) - 8; "" b.s8=(c.sc >> 4)-8; b.s9=(c.sc & 15)-8; b.sa=(c.sd >> 4)-8; b.sb=(c.sd & 15)-8; b.sc=(c.se >> 4)-8; b.sd=(c.se & 15)-8; b.se=(c.sf >> 4)-8; b.sf=(c.sf & 15)-8;\n" -"#define UCHAR8_TO_CHAR16(a, c) "" a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; "" a.s8=(c.s4 >> 4)-8; a.s9=(c.s4 & 15)-8; a.sa=(c.s5 >> 4)-8; a.sb=(c.s5 & 15)-8; a.sc=(c.s6 >> 4)-8; a.sd=(c.s6 & 15)-8; a.se=(c.s7 >> 4)-8; a.sf=(c.s7 & 15)-8;\n" -"#define DOT16X16(a, b, c) "" c += dot(a.s0123, b.s0123); "" c += dot(a.s4567, b.s4567); "" c += dot(a.s89ab, b.s89ab); "" c += dot(a.scdef,b.scdef);\n" -"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" -"__kernel void reshape_nchw4_nhwc4(GLOBAL_SIZE_DIM3\n" -"__global const FLOAT* input,\n" -"__global FLOAT* output,\n" -"__private const int width_height,\n" -"__private const int batch,\n" -"__private const int channel,\n" -"__private const int channelC4){\n" -" const int x=get_global_id(0); //c\n" -" const int y=get_global_id(1); //b\n" -" const int wh=get_global_id(2); // w*h\n" -" UNIFORM_BOUNDRY_CHECK3(x,y,wh);\n" -" \n" -" const int x4=x << 2;\n" -" const int y4=y << 2;\n" -" const int channel4=channelC4*4;\n" -" const int stride=channel4*width_height;\n" -" const int input_offset=(y4*channel4+x4)*width_height+wh*4;\n" -" const int output_offset=((y*width_height+wh)*channel4+x4)*4;\n" -" FLOAT4 in0=vload4(0,input+input_offset);\n" -" FLOAT4 in1=(y4+1= channel){\n" -" FLOAT *in0_ptr=(FLOAT*)&in0;\n" -" FLOAT *in1_ptr=(FLOAT*)&in1;\n" -" FLOAT *in2_ptr=(FLOAT*)&in2;\n" -" FLOAT *in3_ptr=(FLOAT*)&in3;\n" -" int remain=x4+3-channel;\n" -" for(int i=remain; i >= 0; i--){\n" -" 
in0_ptr[3-remain]=0;\n" -" in1_ptr[3-remain]=0;\n" -" in2_ptr[3-remain]=0;\n" -" in3_ptr[3-remain]=0;\n" -" }\n" -" }\n" -"#endif\n" -" \n" -" FLOAT16 out=(FLOAT16)(in0.s0,in1.s0,in2.s0,in3.s0,in0.s1,in1.s1,in2.s1,in3.s1,in0.s2,in1.s2,in2.s2,in3.s2,in0.s3,in1.s3,in2.s3,in3.s3);\n" -" \n" -" vstore16(out,0,output+output_offset);\n" -"}\n" -"__kernel void reshape_nhwc4_nchw4(GLOBAL_SIZE_DIM3\n" -"__global const FLOAT* input,\n" -"__global FLOAT* output,\n" -"__private const int width_height,\n" -"__private const int batch,\n" -"__private const int channelC4){\n" -" const int x=get_global_id(0); //c\n" -" const int y=get_global_id(1); //b\n" -" const int wh=get_global_id(2); //w*h\n" -" UNIFORM_BOUNDRY_CHECK3(x,y,wh);\n" -" \n" -" const int x4=x << 2;\n" -" const int y4=y << 2;\n" -" const int channel4=channelC4*4;\n" -" const int stride=channel4*width_height;\n" -" const int input_offset=((y*width_height+wh)*channel4+x4)*4;\n" -" const int output_offset=(y4*channel4+x4)*width_height+wh*4;\n" -" FLOAT16 in=vload16(0,input+input_offset);\n" -" \n" -" FLOAT4 out0=(FLOAT4)(in.s0,in.s4,in.s8,in.sc);\n" -" FLOAT4 out1=(FLOAT4)(in.s1,in.s5,in.s9,in.sd);\n" -" FLOAT4 out2=(FLOAT4)(in.s2,in.s6,in.sa,in.se);\n" -" FLOAT4 out3=(FLOAT4)(in.s3,in.s7,in.sb,in.sf);\n" -" \n" -" vstore4(out0,0,output+output_offset);\n" -" if(y4+1 >= batch) return;\n" -" vstore4(out1,0,output+output_offset+stride);\n" -" if(y4+2 >= batch) return;\n" -" vstore4(out2,0,output+output_offset+2*stride);\n" -" if(y4+3 >= batch) return;\n" -" vstore4(out3,0,output+output_offset+3*stride);\n" -"}\n" -"__kernel void gemm_b4_c4_buf(GLOBAL_SIZE_DIM2\n" -" __global const FLOAT* input,\n" -"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" -" __global const char *weight,\n" -"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" -" __global const uchar *weight,\n" -"#endif\n" -" __global const float *dequantScaleOffset,\n" -" __global const FLOAT *bias,\n" -" __global FLOAT* output,\n" -" __private const int dstChannelC4,\n" -" __private const int srcChannelC4,\n" -" __private const int blockNum,\n" -" __private const int blockDim) {\n" -" const int x=get_global_id(0); //c\n" -" const int y=get_global_id(1); //b\n" -" UNIFORM_BOUNDRY_CHECK(x,y);\n" -" const int out_c_idx=x;\n" -" const int out_b_idx=y << 2;\n" -" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n" -" COMPUTE_FLOAT4 out=(COMPUTE_FLOAT4)bias0.s0;\n" -" COMPUTE_FLOAT4 out1=(COMPUTE_FLOAT4)bias0.s1,out2=(COMPUTE_FLOAT4)bias0.s2,out3=(COMPUTE_FLOAT4)bias0.s3;\n" -" \n" -" int input_offset=out_b_idx*srcChannelC4*4;\n" -" int out_offset=(out_b_idx*dstChannelC4+out_c_idx*4)*4;\n" -"#if (defined USE_LOW_BIT_WEIGHT_INT4)\n" -" int weight_offset=out_c_idx*4*8;\n" -" int weight_oc_offset=dstChannelC4*32;\n" -"#else\n" -" int weight_offset=out_c_idx*4*16;\n" -" int weight_oc_offset=dstChannelC4*64;\n" -"#endif\n" -" const int loop=(blockDim+15)/16;\n" -"#ifdef INPUT_CHANNEL_LEAVE\n" -" const int loop_end=max(loop-1,0);\n" -" const int remain=blockDim-loop_end*16;\n" -"#else\n" -" const int loop_end=loop;\n" -"#endif\n" -" \n" -" for (int i=0; i= size){\n" +" int remain=size-offset;\n" +" float4 in0,in1;\n" +" float* in0_ptr=(float*)&in0;\n" +" float* in1_ptr=(float*)&in1;\n" " \n" -" const int loop=(blockDim+15)/16;\n" -" #ifdef INPUT_CHANNEL_LEAVE\n" -" const int loop_end=max(loop-1,0);\n" -" const int remain=blockDim-loop_end*16;\n" +" for(int i=0; i> 2;\n" -" const int offset=(((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int 
offset=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " const int dst_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n" " \n" " float4 in0=convert_float4(vload4(0,input0+offset*isFull.x));\n" @@ -7657,7 +6189,7 @@ const char* binary_subgroup_buf = " const int channel_idx=get_global_id(1);\n" " const int src_width=shape.z+input1_pad_left+input1_pad_right;\n" " const int channe_out_idx=channel_idx >> 2;\n" -" const int offset0=(((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " const int offset1=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input1_pad_left)*16+(channel_idx % 4)*4;\n" " float4 in0=convert_float4(vload4(0,input0+offset0*isFull.x));\n" " float4 in1=convert_float4(vload4(0,input1+offset1*isFull.y));\n" @@ -7691,7 +6223,7 @@ const char* binary_subgroup_buf = " const int channel_idx=get_global_id(1);\n" " const int src_width=shape.z+input0_pad_left+input0_pad_right;\n" " const int channe_out_idx=channel_idx >> 2;\n" -" const int offset1=(((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int offset1=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " const int offset0=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left)*16+(channel_idx % 4)*4;\n" " \n" " float4 in0=convert_float4(vload4(0,input0+offset0*isFull.x));\n" @@ -7728,7 +6260,7 @@ const char* binary_subgroup_buf = " const int src_width=shape.z+input1_pad_left+input1_pad_right;\n" " const int dst_width=shape.z+output_pad_left+output_pad_right;\n" " const int channe_out_idx=channel_idx >> 2;\n" -" const int offset0=(((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " const int offset1=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input1_pad_left)*16+(channel_idx % 4)*4;\n" " const int dst_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n" " \n" @@ -7776,7 +6308,7 @@ const char* binary_subgroup_buf = " const int src_width=shape.z+input0_pad_left+input0_pad_right;\n" " const int dst_width=shape.z+output_pad_left+output_pad_right;\n" " const int channe_out_idx=channel_idx >> 2;\n" -" const int offset1=(((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int offset1=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " const int offset0=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left)*16+(channel_idx % 4)*4;\n" " const int dst_offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n" " \n" @@ -7819,7 +6351,7 @@ const char* binary_subgroup_buf = " const int batch_idx=get_global_id(2);\n" " const int channel_idx=get_global_id(1);\n" " \n" -" const int offset0=(((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " const int offset1=channel_idx*4;\n" " \n" " float4 in0=convert_float4(vload4(0,input0+offset0));\n" @@ -7844,7 +6376,7 @@ const char* binary_subgroup_buf = " const int dst_width=shape.z+output_pad_left+output_pad_right;\n" " const int channe_out_idx=channel_idx >> 
2;\n" " \n" -" const int offset0=(((batch_idx*channel4+channel_idx)*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int offset0=(((batch_idx+channel_idx*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " const int offset1=channel_idx*4;\n" " const int offset=(((batch_idx*channel16+channe_out_idx)*shape.y+h_idx)*dst_width+w_idx+output_pad_left)*16+(channel_idx % 4)*4;\n" " float4 in0=convert_float4(vload4(0,input0+offset0));\n" @@ -7920,10 +6452,10 @@ const char* binary_subgroup_buf = " const int channel_idx=get_group_id(1);\n" " const int sglid=get_sub_group_local_id();\n" " const int src_width=shape.z+input0_pad_left+input0_pad_right;\n" -" const int width_height=shape.z*shape.y*4;\n" +" const int batch_width_height=shape.x*shape.z*shape.y*4;\n" " const int offset0=(((batch_idx*channel16+channel_idx)*shape.y+h_idx)*src_width+w_idx+input0_pad_left)*16;\n" " const int offset1=channel_idx*16;\n" -" const int offset=(((batch_idx*channel4+(channel_idx<<2))*shape.y+h_idx)*shape.z+w_idx)*4;\n" +" const int offset=(((batch_idx+(channel_idx<<2)*shape.x)*shape.y+h_idx)*shape.z+w_idx)*4;\n" " float4 in0=convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input0+offset0))));\n" " float4 in1=(float4)(AS_INPUT_DATA(INTEL_SUB_GROUP_READ((__global INTEL_DATA*)(input1+offset1))));\n" " \n" @@ -7932,7 +6464,7 @@ const char* binary_subgroup_buf = " const int lid_y=sglid/4;\n" " int block_size=w_idx+4>shape.z ? (shape.z % 4) : 4;\n" " for (int i=0; ishape.z ? (shape.z % 4) : 4;\n" " for (int i=0; i= input_shape.x) {\n" @@ -8376,10 +6911,10 @@ const char* pooling_subgroup_buf = " }\n" " #endif\n" " \n" -" const int out_offset=(((b_idx*in_channel_block+c_idx)*output_shape.x+oh_idx)* output_shape.y+ow_idx+output_pad_left)*4;\n" +" const int out_offset=(((b_idx+c_idx*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx+output_pad_left)*4;\n" " vstore4(CONVERT_FLOAT4(result),0,output+out_offset);\n" " #if RETURN_REDICE\n" -" vstore4(CONVERT_FLOAT4(redice),0,rediceOutput+(((b_idx*in_channel_block+c_idx)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4);\n" +" vstore4(CONVERT_FLOAT4(redice),0,rediceOutput+(((b_idx+c_idx*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4);\n" " #endif\n" "}\n" "__kernel void pooling_c4_c16(GLOBAL_SIZE_3_DIMS __global const FLOAT *input,\n" @@ -8389,6 +6924,7 @@ const char* pooling_subgroup_buf = " __global FLOAT *output,\n" " __global FLOAT *rediceOutput,\n" " __private const int channel,\n" +" __private const int batch,\n" " __private const int in_channel_block,\n" " __private const int out_channel_block,\n" " __private const int input_pad_left,\n" @@ -8409,7 +6945,7 @@ const char* pooling_subgroup_buf = " \n" " #ifdef POOL_AVG\n" " COMPUTE_FLOAT4 result=(COMPUTE_FLOAT4)(0);\n" -" const int inp_offset=(((b_idx*in_channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n" +" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n" " #ifdef COUNT_INCLUDE_PADDING\n" " int total_count=(min(ih_start+KERNEL_Y,input_shape.x+pad_shape.x)-ih_start)*(min(iw_start+KERNEL_X,input_shape.y+pad_shape.y)-iw_start);\n" "#else\n" @@ -8438,7 +6974,7 @@ const char* pooling_subgroup_buf = " #if RETURN_REDICE\n" " int4 redice=(int4)0;\n" " #endif\n" -" const int inp_offset=(((b_idx*in_channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n" +" const int inp_offset=(((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4;\n" " for(int 
kh=0; kh= input_shape.x) {\n" @@ -8482,6 +7018,7 @@ const char* pooling_subgroup_buf = " __global FLOAT *output,\n" " __global FLOAT *rediceOutput,\n" " __private const int channel,\n" +" __private const int batch,\n" " __private const int in_channel_block,\n" " __private const int out_channel_block,\n" " __private const int input_pad_left,\n" @@ -8624,6 +7161,7 @@ const char* pooling_subgroup_buf = " __global FLOAT *output,\n" " __global FLOAT *rediceOutput,\n" " __private const int channel,\n" +" __private const int batch,\n" " __private const int in_channel_block,\n" " __private const int out_channel_block,\n" " __private const int input_pad_left,\n" @@ -8704,18 +7242,18 @@ const char* pooling_subgroup_buf = " const uint lid_x=sglid % 4;\n" " const uint lid_y=sglid/4;\n" " \n" -" const int out_offset=(((b_idx*out_channel_block+c_idx*4)*output_shape.x+oh_idx)* output_shape.y+ow_idx+output_pad_left)*4;\n" -" const int width_height=output_shape.y*output_shape.x*4;\n" +" const int out_offset=(((b_idx+c_idx*4*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx+output_pad_left)*4;\n" +" const int batch_width_height=batch*output_shape.y*output_shape.x*4;\n" "#if RETURN_REDICE\n" -" const int redice_offset=(((b_idx*out_channel_block+c_idx*4)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4;\n" +" const int redice_offset=(((b_idx+c_idx*4*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4;\n" "#endif\n" "#if OUTPUT_LEFTOVERS\n" " if ((c_idx+1)*16 >= channel) {\n" " for (int i=0; i<8; i++) {\n" " if ((c_idx*16+lid_y*4+lid_x= input_shape.x) {\n" @@ -8811,7 +7349,7 @@ const char* pooling_buf = " }\n" " #endif\n" " \n" -" const int out_offset=(((b_idx*channel_block+c_idx)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4;\n" +" const int out_offset=(((b_idx+c_idx*batch)*output_shape.x+oh_idx)* output_shape.y+ow_idx)*4;\n" " vstore4(CONVERT_FLOAT4(result),0,output+out_offset);\n" " #if RETURN_REDICE\n" " vstore4(CONVERT_FLOAT4(redice),0,rediceOutput+out_offset);\n" @@ -8826,7 +7364,7 @@ const char* pooling_buf = " __private const int2 kernel_shape,\n" " __global FLOAT *output,\n" " __global FLOAT *rediceOutput,\n" -" __private const int channel_block) {\n" +" __private const int batch) {\n" " const int local_id=get_local_id(0);\n" " const int output_channel_idx=get_global_id(1);\n" " const int output_batch_idx=get_global_id(2);\n" @@ -8840,7 +7378,7 @@ const char* pooling_buf = "#endif\n" "#endif\n" " COMPUTE_FLOAT4 local sum[LOCAL_SIZE];\n" -" const int inp_offset=((output_batch_idx*channel_block+output_channel_idx)*input_shape.x)*input_shape.y*4;\n" +" const int inp_offset=((output_batch_idx+output_channel_idx*batch)*input_shape.x)*input_shape.y*4;\n" " const int size=input_shape.x*input_shape.y;\n" " for(int i=local_id; i= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" +"#define GLOBAL_SIZE_2_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,\n" +"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" "inline float4 gelu(float4 in){\n" " float4 value=0.79788458f*(0.044715f*in*in*in+in);\n" " float4 x2=value*value;\n" @@ -9323,20 +7861,35 @@ const char* unary_buf = " (value*(135135.0f+x2*(17325.0f+x2*(378.0f+x2))))/(135135.0f+x2*(62370.0f+x2*(3150.0f+x2*28.0f))));\n" " return (1.0f+dst)*in*0.5f;\n" "}\n" -"__kernel void unary_buf(GLOBAL_SIZE_3_DIMS\n" +"__kernel void unary_buf(GLOBAL_SIZE_2_DIMS\n" " __global const INPUT_TYPE *input,\n" 
" __global OUTPUT_TYPE *output,\n" -" __private const int height) {\n" -" const int channel_block_idx=get_global_id(0);\n" -" const int w=get_global_id(1);\n" -" const int hb=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(channel_block_idx,w,hb);\n" -" const int batch_idx=hb/height;\n" -" const int height_idx=hb % height;\n" -" const int offset=(((batch_idx*global_size_dim0+channel_block_idx)*height+height_idx)*global_size_dim1+w)*4;\n" +" __private const int size) {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1);\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" +" const int offset=x << 2;\n" +"#ifdef PACK_LEAVE\n" +" if(offset+3 >= size){\n" +" int remain=size-offset;\n" +" float4 in;\n" +" float* in_ptr=(float*)∈\n" +" for(int i=0; i= in_hw.x) continue;\n" " \n" -" int inp_offset=(((b_idx*c_blocks+c_idx)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset=(((b_idx+c_idx*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" " for (int kw=0; kw= 4) {\n" " vstore4(CONVERT_FLOAT4(outValue0),0,output+out_offset);\n" @@ -9438,7 +7991,7 @@ const char* depthwise_conv2d_buf = " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" -" __private const int channel,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 pad_hw,\n" @@ -9465,7 +8018,7 @@ const char* depthwise_conv2d_buf = " const int in_h_cur=in_h_start+kh*dilate_hw.x;\n" " if(in_h_cur<0 || in_h_cur >= in_hw.x) continue;\n" " \n" -" int inp_offset=(((b_idx*c_blocks+c_idx)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset=(((b_idx+c_idx*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" " for (int kw=0; kw= 2) {\n" " vstore4(CONVERT_FLOAT4(outValue0),0,output+out_offset);\n" @@ -9502,7 +8055,7 @@ const char* depthwise_conv2d_buf = " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" -" __private const int channel,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 pad_hw,\n" @@ -9526,7 +8079,7 @@ const char* depthwise_conv2d_buf = " const int in_h_cur=in_h_start+kh*dilate_hw.x;\n" " if(in_h_cur<0 || in_h_cur >= in_hw.x) continue;\n" " \n" -" int inp_offset=(((b_idx*c_blocks+c_idx)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset=(((b_idx+c_idx*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" " for (int kw=0; kw= in_hw.x) continue;\n" " \n" -" int inp_offset_c0=(((b_idx*c_blocks+c_idx+0)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" -" int inp_offset_c1=(((b_idx*c_blocks+c_idx+1)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset_c0=(((b_idx+c_idx*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset_c1=(((b_idx+(c_idx+1)*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" " for (int kw=0; kw= in_hw.y) ? 
(COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset_c0));\n" @@ -9636,7 +8189,7 @@ const char* depthwise_conv2d_buf = " outValue6=clamp(outValue6,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " outValue7=clamp(outValue7,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" -" int out_offset=(((b_idx*c_blocks+c_idx)*out_hw.x+out_h_idx)*out_hw.y+out_w4_idx)*4;\n" +" int out_offset=(((b_idx+c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w4_idx)*4;\n" " const int remain=out_hw.y-out_w4_idx;\n" " if (remain >= 4) {\n" " vstore4(CONVERT_FLOAT4(outValue0),0,output+out_offset);\n" @@ -9656,7 +8209,7 @@ const char* depthwise_conv2d_buf = " \n" " if(c_idx+1 >= c_blocks) return;\n" " \n" -" out_offset += out_hw.x*out_hw.y*4;\n" +" out_offset += batch*out_hw.x*out_hw.y*4;\n" " if (remain >= 4) {\n" " vstore4(CONVERT_FLOAT4(outValue4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(outValue5),1,output+out_offset);\n" @@ -9679,7 +8232,7 @@ const char* depthwise_conv2d_buf = " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" -" __private const int channel,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 pad_hw,\n" @@ -9707,8 +8260,8 @@ const char* depthwise_conv2d_buf = " const int in_h_cur=in_h_start+kh;\n" " if(in_h_cur<0 || in_h_cur >= in_hw.x) continue;\n" " \n" -" int inp_offset_c0=(((b_idx*c_blocks+c_idx+0)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" -" int inp_offset_c1=(((b_idx*c_blocks+c_idx+1)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset_c0=(((b_idx+c_idx*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset_c1=(((b_idx+(c_idx+1)*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" " for (int kw=0; kw= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset_c0));\n" @@ -9740,7 +8293,7 @@ const char* depthwise_conv2d_buf = " outValue4=clamp(outValue4,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " outValue5=clamp(outValue5,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" -" int out_offset=(((b_idx*c_blocks+c_idx)*out_hw.x+out_h_idx)*out_hw.y+out_w2_idx)*4;\n" +" int out_offset=(((b_idx+c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w2_idx)*4;\n" " const int remain=out_hw.y-out_w2_idx;\n" " if (remain >= 2) {\n" " vstore4(CONVERT_FLOAT4(outValue0),0,output+out_offset);\n" @@ -9751,7 +8304,7 @@ const char* depthwise_conv2d_buf = " \n" " if(c_idx+1 >= c_blocks) return;\n" " \n" -" out_offset += out_hw.x*out_hw.y*4;\n" +" out_offset += batch*out_hw.x*out_hw.y*4;\n" " if (remain >= 2) {\n" " vstore4(CONVERT_FLOAT4(outValue4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(outValue5),1,output+out_offset);\n" @@ -9765,7 +8318,7 @@ const char* depthwise_conv2d_buf = " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" -" __private const int channel,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 pad_hw,\n" @@ -9796,7 +8349,7 @@ const char* depthwise_conv2d_buf = " const int in_h_cur=in_h_start+kh;\n" " if(in_h_cur<0 || in_h_cur >= in_hw.x) continue;\n" " \n" -" int inp_offset=(((b_idx*c_blocks+c_idx)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" +" int inp_offset=(((b_idx+c_idx*batch)*in_hw.x+in_h_cur)* in_hw.y+in_w_start_0)*4;\n" " for (int kw=0; kw= in_hw.y) ? 
(COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(kw+0,input+inp_offset));\n" @@ -9824,7 +8377,7 @@ const char* depthwise_conv2d_buf = " outValue2=clamp(outValue2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " outValue3=clamp(outValue3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" -" const int out_offset=(((b_idx*c_blocks+c_idx)*out_hw.x+out_h_idx)*out_hw.y+out_w4_idx)*4;\n" +" const int out_offset=(((b_idx+c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w4_idx)*4;\n" " const int remain=out_hw.y-out_w4_idx;\n" " if (remain >= 4) {\n" " vstore4(CONVERT_FLOAT4(outValue0),0,output+out_offset);\n" @@ -9848,7 +8401,7 @@ const char* depthwise_conv2d_buf = " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" -" __private const int channel,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 pad_hw,\n" @@ -9870,7 +8423,7 @@ const char* depthwise_conv2d_buf = " const int in_h_start=out_h_idx-pad_hw.x;\n" " COMPUTE_FLOAT4 inValue0,inValue1,inValue2,inValue3;\n" " //first line\n" -" const int inp_offset=(((b_idx*c_blocks+c_idx)*in_hw.x+in_h_start)* in_hw.y+in_w_start_0)*4;\n" +" const int inp_offset=(((b_idx+c_idx*batch)*in_hw.x+in_h_start)* in_hw.y+in_w_start_0)*4;\n" " inValue0=(in_h_start<0 || in_w_start_0<0 ) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " inValue1=(in_h_start<0 || in_w_start_0+1 >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n" " inValue2=(in_h_start<0 || in_w_start_0+2 >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(2,input+inp_offset));\n" @@ -9935,7 +8488,7 @@ const char* depthwise_conv2d_buf = " outValue0=clamp(outValue0,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " outValue1=clamp(outValue1,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" -" const int out_offset=(((b_idx*c_blocks+c_idx)*out_hw.x+out_h_idx)*out_hw.y+out_w2_idx)*4;\n" +" const int out_offset=(((b_idx+c_idx*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w2_idx)*4;\n" " const int remain=out_hw.y-out_w2_idx;\n" " if (remain >= 2) {\n" " vstore4(CONVERT_FLOAT4(outValue0),0,output+out_offset);\n" @@ -9950,7 +8503,7 @@ const char* depthwise_conv2d_buf = " __global const FLOAT *bias,\n" " __global FLOAT *output,\n" " __private const int2 in_hw,\n" -" __private const int channel,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 pad_hw,\n" @@ -9976,7 +8529,7 @@ const char* depthwise_conv2d_buf = " const int in_h_start=out_h2_idx-pad_hw.x;\n" " COMPUTE_FLOAT4 inValue0,inValue1,inValue2,inValue3;\n" " //first line\n" -" const int inp_offset=(((b_idx*c_blocks+c_idx)*in_hw.x+in_h_start)* in_hw.y+in_w_start)*4;\n" +" const int inp_offset=(((b_idx+c_idx*batch)*in_hw.x+in_h_start)* in_hw.y+in_w_start)*4;\n" " inValue0=(in_h_start<0 || in_w_start<0 ) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(0,input+inp_offset));\n" " inValue1=(in_h_start<0 || in_w_start+1 >= in_hw.y) ? (COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(1,input+inp_offset));\n" " inValue2=(in_h_start<0 || in_w_start+2 >= in_hw.y) ? 
(COMPUTE_FLOAT4)0 : CONVERT_COMPUTE_FLOAT4(vload4(2,input+inp_offset));\n" @@ -10059,7 +8612,7 @@ const char* depthwise_conv2d_buf = " outValue2=clamp(outValue2,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" " outValue3=clamp(outValue3,(COMPUTE_FLOAT4)0,(COMPUTE_FLOAT4)6);\n" "#endif\n" -" const int out_offset=(((b_idx*c_blocks+c_idx)*out_hw.x+out_h2_idx)*out_hw.y+out_w2_idx)*4;\n" +" const int out_offset=(((b_idx+c_idx*batch)*out_hw.x+out_h2_idx)*out_hw.y+out_w2_idx)*4;\n" " const int remain_w=out_hw.y-out_w2_idx;\n" " const int remain_h=out_hw.x-out_h2_idx;\n" " if(remain_w >= 2 && remain_h >= 2) {\n" @@ -10171,6 +8724,7 @@ const char* winogradTransform_buf = " __private const int srcWidth,// 6\n" " __private const int srcHeight,__private const int srcChannelC4,\n" " __private const int dstHeightPad,__private const int srcChannelPad,\n" +" __private const int batch,\n" " __private const int batchOffset) {\n" " int2 pos=(int2)(get_global_id(0),get_global_id(1)); \n" " UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n" @@ -10206,7 +8760,7 @@ const char* winogradTransform_buf = " FLOAT4 S23;\n" " FLOAT4 S33;\n" " \n" -" int inp_offset=(((batchIndex*srcChannelC4+srcZ)*srcHeight+syStart)*srcWidth+sxStart)*4;\n" +" int inp_offset=(((batchIndex+srcZ*batch)*srcHeight+syStart)*srcWidth+sxStart)*4;\n" " {\n" " int sx=0+sxStart;\n" " int sy=0+syStart;\n" @@ -10451,6 +9005,7 @@ const char* winogradTransform_buf = " __private const int dstChannelC4,\n" " __private const int srcWidthPad,\n" " __private const int dstChannelPad,\n" +" __private const int batch,\n" " __private const int batchOffset) {\n" " int2 pos=(int2)(get_global_id(0),get_global_id(1));\n" " UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n" @@ -10498,7 +9053,7 @@ const char* winogradTransform_buf = " \n" " //NC4HW4 [batch,dstChannelC4,dstHeight,dstWidth]\n" " //index: [batchIndex,oz,oyStart,oxStart]\n" -" int out_offset=(((batchIndex*dstChannelC4+ oz)*dstHeight+oyStart)*dstWidth+oxStart)*4;\n" +" int out_offset=(((batchIndex+oz*batch)*dstHeight+oyStart)*dstWidth+oxStart)*4;\n" " {\n" " int ox=oxStart+0;\n" " int oy=oyStart+0;\n" @@ -10578,6 +9133,7 @@ const char* winogradTransform_subgroup_buf = " __private const int srcWidth,// 6\n" " __private const int srcHeight,__private const int srcChannelC4,__private const int srcChannelC16,__private const int dstHeight,\n" " __private const int batchOffset,\n" +" __private const int batch,\n" " __private const int input_pad_left,__private const int input_pad_right) {\n" " int2 pos=(int2)(get_global_id(0),get_global_id(1)); \n" " UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n" @@ -10657,6 +9213,7 @@ const char* winogradTransform_subgroup_buf = " __private const int dstHeight,\n" " __private const int dstChannelC4,__private const int dstChannelC16,__private const int srcWidth,\n" " __private const int batchOffset,\n" +" __private const int batch,\n" " __private const int output_pad_left,__private const int output_pad_right) {\n" " int2 pos=(int2)(get_global_id(0),get_global_id(1));\n" " UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n" @@ -10773,6 +9330,7 @@ const char* winogradTransform_subgroup_buf = " __private const int srcWidth,// 6\n" " __private const int srcHeight,__private const int srcChannelC4,__private const int srcChannelC16,__private const int dstHeight,\n" " __private const int batchOffset,\n" +" __private const int batch,\n" " __private const int input_pad_left,__private const int input_pad_right) {\n" " int2 pos=(int2)(get_global_id(0),get_global_id(1)); \n" " UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n" @@ -10800,7 +9358,7 @@ 
const char* winogradTransform_subgroup_buf = " FLOAT4 S23;\n" " FLOAT4 S33;\n" " \n" -" int inp_offset=(((batchOffset*srcChannelC4+pos.y)*srcHeight+syStart)*srcWidth+sxStart)*4;\n" +" int inp_offset=(((batchOffset+pos.y*batch)*srcHeight+syStart)*srcWidth+sxStart)*4;\n" " {\n" " int sx=0+sxStart;\n" " int sy=0+syStart;\n" @@ -10949,6 +9507,7 @@ const char* winogradTransform_subgroup_buf = " __private const int dstHeight,\n" " __private const int dstChannelC4,__private const int dstChannelC16,__private const int srcWidth,\n" " __private const int batchOffset,\n" +" __private const int batch,\n" " __private const int output_pad_left,__private const int output_pad_right) {\n" " int2 pos=(int2)(get_global_id(0),get_global_id(1));\n" " UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n" @@ -10992,7 +9551,7 @@ const char* winogradTransform_subgroup_buf = " \n" " //NC4HW4 [batch,dstChannelC4,dstHeight,dstWidth]\n" " //index: [batchOffset,pos.y,oyStart,oxStart]\n" -" int out_offset=(((batchOffset*dstChannelC4+ pos.y)*dstHeight+oyStart)*dstWidth+oxStart)*4;\n" +" int out_offset=(((batchOffset+ pos.y*batch)*dstHeight+oyStart)*dstWidth+oxStart)*4;\n" " {\n" " int ox=oxStart+0;\n" " int oy=oyStart+0;\n" @@ -11126,7 +9685,7 @@ const char* splitgelu_buf = "#ifdef MNN_SUPPORT_FP16\n" "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "#endif\n" -"__kernel void splitgelu_buf(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n" +"__kernel void splitgelu_buf(__private int global_dim0,__private int global_dim1,\n" " __global const FLOAT*input,\n" " #ifdef DOUBLE_INPUTS\n" " __global const FLOAT*input1,\n" @@ -11134,46 +9693,55 @@ const char* splitgelu_buf = " __global FLOAT*output,\n" " __private const int4 shape\n" "){\n" -" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n" -" if (pos.x> 2;\n" -" const int area_4=(shape.z+3) >> 2;\n" -" const int in_offset=((b*channel_4+c_4)*area_4*2+hw_4)*16;\n" -" const int out_offset=((b*channel_4+c_4)*area_4+hw_4)*16;\n" +" int2 pos=(int2)(get_global_id(0),get_global_id(1));\n" +" if (pos.x> 2;\n" -" const int in_offset=((b*channel_4+c_4)*shape.z*2+hw)*4;\n" -" const int out_offset=((b*channel_4+c_4)*shape.z+hw)*4;\n" -" \n" +"// The product of W and H is a multiple of 4\n" +"#elif defined (WH_4)\n" +" const int in_offset=bc*shape.z*2+h*4;\n" +" const int out_offset=bc*shape.z+h*4;\n" " float4 valueL=convert_float4(vload4(0,input+in_offset));\n" -" float4 valueR=convert_float4(vload4(shape.z,input+in_offset));\n" +" float4 valueR=convert_float4(vload4(0,input+in_offset+shape.z));\n" " #ifdef DOUBLE_INPUTS\n" -" float valueConstL=input1[hw];\n" -" float valueConstR=input1[shape.z+hw];\n" -" valueL += (float4)valueConstL;\n" -" valueR += (float4)valueConstR;\n" +" float4 valueConstL=convert_float4(vload4(h,input1));\n" +" float4 valueConstR=convert_float4(vload4(h,input1+shape.z));\n" +" valueL += valueConstL;\n" +" valueR += valueConstR;\n" " #endif\n" " float4 out=(erf(valueR*(float4)0.7071067932881648)+(float4)1.0)*valueR*(float4)0.5;\n" " out *= valueL;\n" " vstore4(CONVERT_FLOAT4(out),0,output+out_offset);\n" +"#else\n" +" const int in_offset=bc*shape.z*2+h;\n" +" const int out_offset=bc*shape.z+h;\n" +" \n" +" float valueL=(float)input[in_offset];\n" +" float valueR=(float)input[in_offset+shape.z];\n" +" #ifdef DOUBLE_INPUTS\n" +" float valueConstL=input1[h];\n" +" float valueConstR=input1[shape.z+h];\n" +" valueL += valueConstL;\n" +" valueR += valueConstR;\n" +" #endif\n" +" float 
out=(erf(valueR*0.7071067932881648)+1.0)*valueR*0.5;\n" +" out *= valueL;\n" +" output[out_offset]=out;\n" "#endif\n" " }\n" "}\n" @@ -11492,27 +10060,28 @@ const char* buffer_convert_quant = " __write_only image2d_t output,\n" " __private const int input_channel,\n" " __private const int output_channel) {\n" -" int x=get_global_id(0); // ic/16\n" +" int x=get_global_id(0); // ic/32\n" " int y=get_global_id(1); // oc\n" " DEAL_NON_UNIFORM_DIM2(x,y);\n" -" const int xin=x << 4;\n" "#ifdef USE_LOW_BIT_WEIGHT_INT4\n" +" const int xin=x << 5;\n" "#ifdef CHANNEL_LEAVE\n" -" uchar8 out=0;\n" +" uchar16 out=0;\n" " uchar *out_ptr=(uchar*)&out;\n" -" for(int i=0; i<8; ++i){\n" +" for(int i=0; i<16; ++i){\n" " int index0=y*input_channel+xin+i*2;\n" " int index1=y*input_channel+xin+i*2+1;\n" " uchar s0=input_ptr[index0/2];\n" " uchar s1=input_ptr[index1/2];\n" " out_ptr[i]=((index0 % 2) == 0 ? (s0 & 0xf0) : (s0 << 4)) | ((index1 % 2) == 0 ? (s1 >> 4) : (s1 & 0x0f));\n" " }\n" -" write_imageui(output,(int2)(y,x),convert_uint4(as_ushort4(out)));\n" +" write_imagei(output,(int2)(y,x),as_int4(out));\n" "#else\n" " const int inputOffset=(y*input_channel+xin)/2;\n" -" write_imageui(output,(int2)(y,x),convert_uint4(as_ushort4(vload8(0,input_ptr+inputOffset))));\n" +" write_imagei(output,(int2)(y,x),as_int4(vload16(0,input_ptr+inputOffset)));\n" "#endif\n" "#else\n" +" const int xin=x << 4;\n" " const int inputOffset=y*input_channel+xin;\n" " write_imagei(output,(int2)(y,x),as_int4(vload16(0,input_ptr+inputOffset)));\n" "#endif\n" @@ -11539,7 +10108,6 @@ const char* buffer_convert_quant = "#ifdef USE_LOW_BIT_WEIGHT_INT4\n" " const int inputOffset=(yin*input_channel+xin)/2;\n" " const int outputOffset=((x*outputChannelC4+y)*icPack*ocPack)/2;\n" -"#ifdef CHANNEL_LEAVE\n" " for(int i=0; i> 4) : (s1 & 0x0f);\n" -" output_ptr[outputOffset+i*(ocPack/2)+j]=s0 | s1;\n" -" }\n" -" }\n" -"#else\n" -" for(int i=0; i> 4);\n" -" char d1=((s0 & 0x0f) << 4) | (s1 & 0x0f);\n" -" output_ptr[outputOffset+(i*2)*(ocPack/2)+j]=d0;\n" -" output_ptr[outputOffset+(i*2+1)*(ocPack/2)+j]=d1;\n" +" output_ptr[outputOffset+i*(ocPack/2)+j]=s0 | s1;\n" " }\n" " }\n" -"#endif\n" "#else\n" " const int inputOffset=yin*input_channel+xin;\n" " const int outputOffset=(x*outputChannelC4+y)*icPack*ocPack;\n" @@ -11581,103 +10137,7 @@ const char* gemm_buf = "#endif\n" "#define GLOBAL_SIZE_DIM2 "" __private int global_size_dim0,__private int global_size_dim1,\n" "#define UNIFORM_BOUNDRY_CHECK(index0, index1) "" if(index0 >= global_size_dim0 || index1 >= global_size_dim1) { "" return; "" }\n" -"__kernel void gemm_buf(GLOBAL_SIZE_DIM2\n" -" __global const FLOAT* input0,\n" -" __global const FLOAT* input1,\n" -" __global FLOAT* output,\n" -" __private const int width,//UP_DIV(wUnit*hUnit,4)\n" -" __private const int height,//dstChannelC4\n" -" __private const int srcChannelC4,\n" -" __private const int alpha2) {\n" -" int2 pos=(int2)(get_global_id(0),get_global_id(1));\n" -" UNIFORM_BOUNDRY_CHECK(pos.x,pos.y);\n" -" const int pos_x=pos.x % width;\n" -" const int pos_y=pos.x/width;\n" -" const int pos_z=pos.y;\n" -" COMPUTE_FLOAT16 o=(COMPUTE_FLOAT16)0;\n" -" \n" -" int kenerlY=mad24(pos_z,height,pos_y);\n" -" for (int k=0; k> 1;\n" -" const int pos_x=(pos.x % width_block) << 1;\n" -" const int pos_y=pos.x/width_block;\n" -" const int pos_z=pos.y;\n" -" COMPUTE_FLOAT16 o0=(COMPUTE_FLOAT16)0;\n" -" COMPUTE_FLOAT16 o1=(COMPUTE_FLOAT16)0;\n" -" const int kenerlY=mad24(pos_z,height,pos_y);\n" -" const int kernel_base=mul24(kenerlY,srcChannelC4);\n" -" 
const int inp_base=(pos_z*srcChannelC4+0)*width+pos_x;\n" -" \n" -" for (int k=0; k= width) return;\n" -" vstore4(CONVERT_FLOAT4(o1.s0123),1,output+out_offset);\n" -" vstore4(CONVERT_FLOAT4(o1.s4567),1,output+out_offset+4*width);\n" -" vstore4(CONVERT_FLOAT4(o1.s89ab),1,output+out_offset+8*width);\n" -" vstore4(CONVERT_FLOAT4(o1.scdef),1,output+out_offset+12*width);\n" -"}\n" -"// [B,K/4,area,4] -> [alignK,alignM] (M=B*area)\n" +"// [K/4,M,4] -> [alignK,alignM]\n" "__kernel void transpose_pad(GLOBAL_SIZE_DIM2\n" " const int alignM,\n" " const int alignK,\n" @@ -11687,71 +10147,29 @@ const char* gemm_buf = " __global const FLOAT* input,\n" " __global FLOAT* output\n" " ) {\n" -"#ifdef AREA_EQUAL_1\n" " const int idx_m4=get_global_id(0); // idx M\n" " const int idx_k4=get_global_id(1); // idx K\n" " UNIFORM_BOUNDRY_CHECK(idx_m4,idx_k4);\n" " const int idx_m=idx_m4 << 2;\n" " const int idx_k=idx_k4 << 2;\n" " const int K_4=(K+3) >> 2;\n" -" const int in_offset_base=(idx_m*K_4+idx_k4)*4;\n" +" const int in_offset_base=(idx_k4*M+idx_m)*4;\n" " const int out_offset_base=idx_k*alignM+idx_m;\n" " \n" " FLOAT4 m0k4=(idx_k4 >= K_4 || idx_m+0 >= M) ? (FLOAT4)0 : vload4(0,input+in_offset_base);\n" -" FLOAT4 m1k4=(idx_k4 >= K_4 || idx_m+1 >= M) ? (FLOAT4)0 : vload4(0,input+in_offset_base+(K_4 << 2));\n" -" FLOAT4 m2k4=(idx_k4 >= K_4 || idx_m+2 >= M) ? (FLOAT4)0 : vload4(0,input+in_offset_base+(K_4 << 2)*2);\n" -" FLOAT4 m3k4=(idx_k4 >= K_4 || idx_m+3 >= M) ? (FLOAT4)0 : vload4(0,input+in_offset_base+(K_4 << 2)*3);\n" -" \n" -" vstore4((FLOAT4)(m0k4.x,m1k4.x,m2k4.x,m3k4.x),0,output+out_offset_base);\n" -" vstore4((FLOAT4)(m0k4.y,m1k4.y,m2k4.y,m3k4.y),0,output+out_offset_base+alignM);\n" -" vstore4((FLOAT4)(m0k4.z,m1k4.z,m2k4.z,m3k4.z),0,output+out_offset_base+alignM+alignM);\n" -" vstore4((FLOAT4)(m0k4.w,m1k4.w,m2k4.w,m3k4.w),0,output+out_offset_base+alignM+alignM+alignM);\n" -"#elif defined BATCH_EQUAL_1\n" -" const int idx_m4=get_global_id(0); // idx M\n" -" const int idx_k4=get_global_id(1); // idx K\n" -" UNIFORM_BOUNDRY_CHECK(idx_m4,idx_k4);\n" -" const int idx_m=idx_m4 << 2;\n" -" const int idx_k=idx_k4 << 2;\n" -" const int K_4=(K+3) >> 2;\n" -" const int in_offset_base=(idx_k4*area+idx_m)*4;\n" -" const int out_offset_base=idx_k*alignM+idx_m;\n" -" FLOAT4 m0k4=(idx_k4 >= K_4 || idx_m+0 >= M) ? (FLOAT4)0 : vload4(0,input+in_offset_base);\n" " FLOAT4 m1k4=(idx_k4 >= K_4 || idx_m+1 >= M) ? (FLOAT4)0 : vload4(0,input+in_offset_base+4);\n" " FLOAT4 m2k4=(idx_k4 >= K_4 || idx_m+2 >= M) ? (FLOAT4)0 : vload4(0,input+in_offset_base+8);\n" " FLOAT4 m3k4=(idx_k4 >= K_4 || idx_m+3 >= M) ? 
(FLOAT4)0 : vload4(0,input+in_offset_base+12);\n" +" \n" " vstore4((FLOAT4)(m0k4.x,m1k4.x,m2k4.x,m3k4.x),0,output+out_offset_base);\n" " vstore4((FLOAT4)(m0k4.y,m1k4.y,m2k4.y,m3k4.y),0,output+out_offset_base+alignM);\n" " vstore4((FLOAT4)(m0k4.z,m1k4.z,m2k4.z,m3k4.z),0,output+out_offset_base+alignM+alignM);\n" " vstore4((FLOAT4)(m0k4.w,m1k4.w,m2k4.w,m3k4.w),0,output+out_offset_base+alignM+alignM+alignM);\n" -"#else\n" -" const int idx_m=get_global_id(0); // idx M\n" -" const int idx_k4=get_global_id(1); // idx K\n" -" UNIFORM_BOUNDRY_CHECK(idx_m,idx_k4);\n" -" \n" -" const int K_4=(K+3) >> 2;\n" -" const int idx_k=idx_k4 << 2;\n" -" const int out_offset_base=idx_k*alignM+idx_m;\n" -" \n" -" if(idx_k4 >= K_4 || idx_m >= M) {\n" -" output[out_offset_base]=(FLOAT)0;\n" -" output[out_offset_base+alignM]=(FLOAT)0;\n" -" output[out_offset_base+alignM+alignM]=(FLOAT)0;\n" -" output[out_offset_base+alignM+alignM+alignM]=(FLOAT)0;\n" -" return;\n" -" }\n" -" const int idx_b=idx_m/area;\n" -" const int idx_area=idx_m % area;\n" -" \n" -" const int in_offset_base=((idx_b*K_4+idx_k4)*area+idx_area)*4;\n" -" FLOAT4 data=vload4(0,input+in_offset_base);\n" -" \n" -" output[out_offset_base]=data.x;\n" -" output[out_offset_base+alignM]=data.y;\n" -" output[out_offset_base+alignM+alignM]=data.z;\n" -" output[out_offset_base+alignM+alignM+alignM]=data.w;\n" -"#endif\n" "}\n" -"// [alignM,alignN] -> [B,N/4,area,4] (M=B*area)\n" +"#ifndef M_VEC\n" +"#define M_VEC 1\n" +"#endif\n" +"// [alignM,alignN] -> [N/4,B,area,N4] (M=B*area)\n" "__kernel void transpose_bias(GLOBAL_SIZE_DIM2\n" " const int alignM,\n" " const int alignN,\n" @@ -11762,28 +10180,15 @@ const char* gemm_buf = " __global const FLOAT* input1,\n" " __global FLOAT* output\n" " ) {\n" -"#ifdef AREA_EQUAL_1\n" -" const int idx_m=get_global_id(0); // idx M\n" -" const int idx_n_16=get_global_id(1); // idx N\n" -" UNIFORM_BOUNDRY_CHECK(idx_m,idx_n_16);\n" -" const int N_4=(N+3) >> 2;\n" -" const int N_16=(N+15) >> 4;\n" -" const int N_left=N & 15;\n" -" bool canVec16=(N_left == 0 || (N_left != 0 && idx_n_16= N_4) return;\n" -" res0=vload4(0,input0+idx_m*alignN+(idx_n_16 << 4)+4);\n" -" res1=vload4(0,input1+(idx_n_16 << 4)+4);\n" -" res=res0+res1;\n" -" #ifdef RELU\n" -" res=fmax(res,(FLOAT4)0);\n" -" #endif\n" -" #ifdef RELU6\n" -" res=clamp(res,(FLOAT4)0,(FLOAT4)6);\n" -" #endif\n" -" vstore4(res,0,output+((idx_m*N_4+(idx_n_16 << 2)) << 2)+4);\n" -" \n" -" if(idx_n_16*4+2 >= N_4) return;\n" -" res0=vload4(0,input0+idx_m*alignN+(idx_n_16 << 4)+8);\n" -" res1=vload4(0,input1+(idx_n_16 << 4)+8);\n" -" res=res0+res1;\n" -" #ifdef RELU\n" -" res=fmax(res,(FLOAT4)0);\n" -" #endif\n" -" #ifdef RELU6\n" -" res=clamp(res,(FLOAT4)0,(FLOAT4)6);\n" -" #endif\n" -" vstore4(res,0,output+((idx_m*N_4+(idx_n_16 << 2)) << 2)+8);\n" -" \n" -" if(idx_n_16*4+3 >= N_4) return;\n" -" res0=vload4(0,input0+idx_m*alignN+(idx_n_16 << 4)+12);\n" -" res1=vload4(0,input1+(idx_n_16 << 4)+12);\n" -" res=res0+res1;\n" -" #ifdef RELU\n" -" res=fmax(res,(FLOAT4)0);\n" -" #endif\n" -" #ifdef RELU6\n" -" res=clamp(res,(FLOAT4)0,(FLOAT4)6);\n" -" #endif\n" -" vstore4(res,0,output+((idx_m*N_4+(idx_n_16 << 2)) << 2)+12);\n" +" vstore4(res,0,output+((idx_n4*M+idx_m+i) << 2));\n" " }\n" -"#else\n" -" const int idx_m=get_global_id(0); // idx M\n" -" const int idx_n_16=get_global_id(1); // idx N\n" -" UNIFORM_BOUNDRY_CHECK(idx_m,idx_n_16);\n" -" \n" -" const int N_4=(N+3) >> 2;\n" -" const int idx_b=idx_m/area;\n" -" const int idx_area=idx_m % area;\n" -" \n" -" const int 
inp_base_offset=idx_m*alignN+(idx_n_16 << 4);\n" -" const int out_base_offset=((idx_b*N_4+idx_n_16*4)*area+idx_area)*4;\n" -" \n" -" FLOAT4 res0=vload4(0,input0+inp_base_offset);\n" -" FLOAT4 res1=vload4(0,input1+(idx_n_16 << 4));\n" -" FLOAT4 res=res0+res1;\n" -" #ifdef RELU\n" -" res=fmax(res,(FLOAT4)0);\n" -" #endif\n" -" #ifdef RELU6\n" -" res=clamp(res,(FLOAT4)0,(FLOAT4)6);\n" -" #endif\n" -" vstore4(res,0,output+out_base_offset);\n" -" \n" -" if(idx_n_16*4+1 >= N_4) return;\n" -" res0=vload4(0,input0+inp_base_offset+4);\n" -" res1=vload4(0,input1+(idx_n_16 << 4)+4);\n" -" res=res0+res1;\n" -" #ifdef RELU\n" -" res=fmax(res,(FLOAT4)0);\n" -" #endif\n" -" #ifdef RELU6\n" -" res=clamp(res,(FLOAT4)0,(FLOAT4)6);\n" -" #endif\n" -" vstore4(res,0,output+out_base_offset+area*4);\n" -" \n" -" if(idx_n_16*4+2 >= N_4) return;\n" -" res0=vload4(0,input0+inp_base_offset+8);\n" -" res1=vload4(0,input1+(idx_n_16 << 4)+8);\n" -" res=res0+res1;\n" -" #ifdef RELU\n" -" res=fmax(res,(FLOAT4)0);\n" -" #endif\n" -" #ifdef RELU6\n" -" res=clamp(res,(FLOAT4)0,(FLOAT4)6);\n" -" #endif\n" -" vstore4(res,0,output+out_base_offset+area*8);\n" -" \n" -" if(idx_n_16*4+3 >= N_4) return;\n" -" res0=vload4(0,input0+inp_base_offset+12);\n" -" res1=vload4(0,input1+(idx_n_16 << 4)+12);\n" -" res=res0+res1;\n" -" #ifdef RELU\n" -" res=fmax(res,(FLOAT4)0);\n" -" #endif\n" -" #ifdef RELU6\n" -" res=clamp(res,(FLOAT4)0,(FLOAT4)6);\n" -" #endif\n" -" vstore4(res,0,output+out_base_offset+area*12);\n" -"#endif\n" "}\n" ; #endif @@ -12287,235 +10597,58 @@ const char* loop = " int nc4=c4offset % src1C4_size.w;\n" " int cc4_offset=cc4/4;\n" " int cc4_remain=cc4 % 4;\n" -" float4 tmp=convert_float4(RI_DATA(input1,SAMPLER,(int2)(cc4_offset*src1C4_size.x+wc4,nc4*src1C4_size.y+hc4)));\n" -" float *tmp_ptr=(float*)&tmp;\n" -" in1_ptr[i]=tmp_ptr[cc4_remain];\n" -" }\n" -" }\n" -" \n" -" float4 out=LOOP_BINARY_OPERATOR;\n" -" WI_DATA(output,(int2)(co*dst_width+wo,no*dst_height+ho),CONVERT_OUTPUT_I4(out));\n" -" }\n" -"}\n" -"#endif\n" -; -#ifndef MNN_OPENCL_BUFFER_CLOSED -const char* argmax_buf = -"#ifdef MNN_SUPPORT_FP16\n" -"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" -"#endif\n" -"#define GLOBAL_SIZE_3_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n" -"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" -"#define ARGMAX_SELECT(A, B, C, D) "" if(A.x < B.x){ A.x = B.x; C.x = D; } "" if(A.y < B.y){ A.y = B.y; C.y = D; } "" if(A.z < B.z){ A.z = B.z; C.z = D; } "" if(A.w B.x){ A.x = B.x; C.x = D; } "" if(A.y > B.y){ A.y = B.y; C.y = D; } "" if(A.z > B.z){ A.z = B.z; C.z = D; } "" if(A.w>B.w){ A.w=B.w; C.w=D; } \n" -"__kernel void argmax_width_buf(GLOBAL_SIZE_3_DIMS\n" -" __global const FLOAT* input,\n" -" __global int* output,\n" -" __private const int inputWidth,\n" -" __private const int inputHeight,\n" -" __private const int inputChannel,\n" -" __private const int inputBatch,\n" -" __private const int inputChannelBlock,\n" -" __private const int oututWidth,\n" -" __private const int outputHeight,\n" -" __private const int outputChannel,\n" -" __private const int outputChannelBlock\n" -" ) {\n" -" const int x=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int batch_channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(x,height_idx,batch_channel_idx);\n" -" \n" -" const int 
batch_idx=batch_channel_idx/outputChannelBlock;\n" -" const int channel_idx=batch_channel_idx % outputChannelBlock;\n" -" \n" -" const int offset=((((batch_idx*inputChannelBlock)+channel_idx)*inputHeight+height_idx)*inputWidth+0)*4;\n" -" const int outputOffset=((((batch_idx*outputChannelBlock)+channel_idx)*outputHeight+height_idx)*oututWidth+0)*4;\n" -" int4 index=0;\n" -"#ifdef ARGMAX\n" -" FLOAT4 maxValue=(FLOAT4)-FLT_MAX;\n" -"#else\n" -" FLOAT4 maxValue=(FLOAT4)FLT_MAX;\n" -"#endif\n" -"#if ARGMAX_LOCAL_SIZE >= 4\n" -" int lid=get_local_id(0);\n" -" FLOAT4 local reduce[ARGMAX_LOCAL_SIZE];\n" -" int4 local index_reduce[ARGMAX_LOCAL_SIZE];\n" -" \n" -" for (int i=lid; i0; i /= 2){\n" -" if (lidreduce[lid+i].x){reduce[lid].x=reduce[lid+i].x; index_reduce[lid].x=index_reduce[lid+i].x;}\n" -" if(reduce[lid].y>reduce[lid+i].y){reduce[lid].y=reduce[lid+i].y; index_reduce[lid].y=index_reduce[lid+i].y;}\n" -" if(reduce[lid].z>reduce[lid+i].z){reduce[lid].z=reduce[lid+i].z; index_reduce[lid].z=index_reduce[lid+i].z;}\n" -" if(reduce[lid].w>reduce[lid+i].w){reduce[lid].w=reduce[lid+i].w; index_reduce[lid].w=index_reduce[lid+i].w;}\n" -"#endif\n" -" }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" -" }\n" -" if(lid == 0){\n" -" vstore4(index_reduce[0],0,output+outputOffset);\n" -" }\n" -"#else\n" -" for(int i=0; i= 4\n" -" int lid=get_local_id(0);\n" -" FLOAT4 local reduce[ARGMAX_LOCAL_SIZE];\n" -" int4 local index_reduce[ARGMAX_LOCAL_SIZE];\n" -" \n" -" for (int i=lid; i0; i /= 2){\n" -" if (lidreduce[lid+i].x){reduce[lid].x=reduce[lid+i].x; index_reduce[lid].x=index_reduce[lid+i].x;}\n" -" if(reduce[lid].y>reduce[lid+i].y){reduce[lid].y=reduce[lid+i].y; index_reduce[lid].y=index_reduce[lid+i].y;}\n" -" if(reduce[lid].z>reduce[lid+i].z){reduce[lid].z=reduce[lid+i].z; index_reduce[lid].z=index_reduce[lid+i].z;}\n" -" if(reduce[lid].w>reduce[lid+i].w){reduce[lid].w=reduce[lid+i].w; index_reduce[lid].w=index_reduce[lid+i].w;}\n" -"#endif\n" +" float4 tmp=convert_float4(RI_DATA(input1,SAMPLER,(int2)(cc4_offset*src1C4_size.x+wc4,nc4*src1C4_size.y+hc4)));\n" +" float *tmp_ptr=(float*)&tmp;\n" +" in1_ptr[i]=tmp_ptr[cc4_remain];\n" " }\n" -" barrier(CLK_LOCAL_MEM_FENCE);\n" " }\n" -" if(lid == 0){\n" -" vstore4(index_reduce[0],0,output+outputOffset);\n" +" \n" +" float4 out=LOOP_BINARY_OPERATOR;\n" +" WI_DATA(output,(int2)(co*dst_width+wo,no*dst_height+ho),CONVERT_OUTPUT_I4(out));\n" " }\n" -"#else\n" -" for(int i=0; i= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" +"#define ARGMAX_SELECT(A, B, C, D) "" if(A.x < B.x){ A.x = B.x; C.x = D; } "" if(A.y < B.y){ A.y = B.y; C.y = D; } "" if(A.z < B.z){ A.z = B.z; C.z = D; } "" if(A.w B.x){ A.x = B.x; C.x = D; } "" if(A.y > B.y){ A.y = B.y; C.y = D; } "" if(A.z > B.z){ A.z = B.z; C.z = D; } "" if(A.w>B.w){ A.w=B.w; C.w=D; } \n" +"__kernel void argmax_buf(GLOBAL_SIZE_3_DIMS\n" " __global const FLOAT* input,\n" " __global int* output,\n" -" __private const int inputWidth,\n" -" __private const int inputHeight,\n" -" __private const int inputChannel,\n" -" __private const int inputBatch,\n" -" __private const int inputChannelBlock,\n" -" __private const int oututWidth,\n" -" __private const int outputHeight,\n" -" __private const int outputChannel,\n" -" __private const int outputChannelBlock\n" -" ) {\n" +" __private const int inside,\n" +" __private const int outside,\n" +" __private const int dim){\n" " const int x=get_global_id(0);\n" -" const int wh=get_global_id(1);\n" -" const int batch_idx=get_global_id(2);\n" 
-" DEAL_NON_UNIFORM_DIM3(x,wh,batch_idx);\n" +" const int y=get_global_id(1); // inside\n" +" const int z=get_global_id(2); // outside\n" " \n" -" const int width_idx=wh % oututWidth;\n" -" const int height_idx=wh/oututWidth;\n" -" const int offset=((((batch_idx*inputChannelBlock)+0)*inputHeight+height_idx)*inputWidth+width_idx)*4;\n" -"#ifdef ARGMAX_CHANNEL_DIM1\n" -" const int outputOffset=((batch_idx*outputHeight+height_idx)*oututWidth+width_idx);\n" -"#else\n" -" const int outputOffset=((((batch_idx*outputChannelBlock)+0)*outputHeight+height_idx)*oututWidth+width_idx)*4;\n" -"#endif\n" -" int remain=inputChannel-(inputChannelBlock-1)*4;\n" +" DEAL_NON_UNIFORM_DIM3(x,y,z);\n" +" int index=0;\n" "#ifdef ARGMAX\n" " FLOAT maxValue=(FLOAT)-FLT_MAX;\n" "#else\n" -" FLOAT maxValue=(FLOAT)FLT_MAX;\n" +"FLOAT maxValue=(FLOAT)FLT_MAX;\n" "#endif\n" -" int index=0;\n" -" FLOAT4 value;\n" -" FLOAT *valuePtr=(FLOAT*)&value;\n" +" const int offset=z*dim*inside+y;\n" "#if ARGMAX_LOCAL_SIZE >= 4\n" " int lid=get_local_id(0);\n" " FLOAT local reduce[ARGMAX_LOCAL_SIZE];\n" " int local index_reduce[ARGMAX_LOCAL_SIZE];\n" " \n" -" for (int i=lid; ivaluePtr[j]){\n" -" index=i*4+j;\n" -" maxValue=valuePtr[j];\n" -" }\n" +" if(maxValue>value){ maxValue=value; index=i; }\n" "#endif\n" " }\n" -" }\n" " reduce[lid]=maxValue;\n" " index_reduce[lid]=index;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" @@ -12530,94 +10663,45 @@ const char* argmax_buf = " barrier(CLK_LOCAL_MEM_FENCE);\n" " }\n" " if(lid == 0){\n" -" maxValue=reduce[lid];\n" -" index=index_reduce[lid];\n" -" value=vload4((inputChannelBlock-1)*inputWidth*inputHeight,input+offset);\n" -" for(int j=0; jvaluePtr[j]){\n" -" index=(inputChannelBlock-1)*4+j;\n" -" maxValue=valuePtr[j];\n" -" }\n" -"#endif\n" -" }\n" -" output[outputOffset]=index;\n" +" output[z*inside+y]=index_reduce[0];\n" " }\n" "#else\n" -" for(int i=0; ivaluePtr[j]){\n" -" index=i*4+j;\n" -" maxValue=valuePtr[j];\n" -" }\n" -"#endif\n" -" }\n" -" }\n" -" value=vload4((inputChannelBlock-1)*inputWidth*inputHeight,input+offset);\n" -" for(int j=0; jvaluePtr[j]){\n" -" index=(inputChannelBlock-1)*4+j;\n" -" maxValue=valuePtr[j];\n" -" }\n" +" if(maxValue>value){ maxValue=value; index=i; }\n" "#endif\n" " }\n" -" output[outputOffset]=index;\n" +" output[z*inside+y]=index;\n" "#endif\n" "}\n" -"__kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS\n" +"__kernel void argmax_v4_buf(GLOBAL_SIZE_3_DIMS\n" " __global const FLOAT* input,\n" " __global int* output,\n" -" __private const int inputWidth,\n" -" __private const int inputHeight,\n" -" __private const int inputChannel,\n" -" __private const int inputBatch,\n" -" __private const int inputChannelBlock,\n" -" __private const int oututWidth,\n" -" __private const int outputHeight,\n" -" __private const int outputChannel,\n" -" __private const int outputChannelBlock\n" -" ) {\n" +" __private const int inside,\n" +" __private const int outside,\n" +" __private const int dim){\n" " const int x=get_global_id(0);\n" -" const int wh=get_global_id(1);\n" -" const int channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(x,wh,channel_idx);\n" +" const int y=get_global_id(1) << 2; // inside\n" +" const int z=get_global_id(2); // outside\n" " \n" -" const int width_idx=wh % oututWidth;\n" -" const int height_idx=wh/oututWidth;\n" -" const int offset=((((0*inputChannelBlock)+channel_idx)*inputHeight+height_idx)*inputWidth+width_idx)*4;\n" -" const int outputOffset=((((0*outputChannelBlock)+channel_idx)*outputHeight+height_idx)*oututWidth+width_idx)*4;\n" +" 
DEAL_NON_UNIFORM_DIM3(x,y,z);\n" " int4 index=0;\n" -" int batchOffset=inputChannelBlock*inputHeight*inputWidth;\n" "#ifdef ARGMAX\n" " FLOAT4 maxValue=(FLOAT4)-FLT_MAX;\n" "#else\n" " FLOAT4 maxValue=(FLOAT4)FLT_MAX;\n" "#endif\n" +" const int offset=z*dim*inside+y;\n" "#if ARGMAX_LOCAL_SIZE >= 4\n" " int lid=get_local_id(0);\n" " FLOAT4 local reduce[ARGMAX_LOCAL_SIZE];\n" " int4 local index_reduce[ARGMAX_LOCAL_SIZE];\n" " \n" -" for (int i=lid; i= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" +"#define DEAL_OUTER_SEQLEN_NOT_ALIGN(length) "" if(4 * sl + 3 >= length) {"" temp_3 = (FLOAT4)0;"" }"" if(4 * sl + 2 >= length) {"" temp_2 = (FLOAT4)0;"" }"" if(4 * sl + 1 >= length) {"" temp_1 = (FLOAT4)0;"" }\n" +"#define DEAL_INNER_HEADDIM_NOT_ALIGN(length) "" if(hd * 4 + 3 >= length) {"" temp_0.w = (FLOAT)0;"" temp_1.w = (FLOAT)0;"" temp_2.w = (FLOAT)0;"" temp_3.w = (FLOAT)0;"" }"" if(hd * 4 + 2 >= length) {"" temp_0.z = (FLOAT)0;"" temp_1.z = (FLOAT)0;"" temp_2.z = (FLOAT)0;"" temp_3.z = (FLOAT)0;"" }"" if(hd * 4 + 1 >= length) {"" temp_0.y = (FLOAT)0;"" temp_1.y = (FLOAT)0;"" temp_2.y = (FLOAT)0;"" temp_3.y = (FLOAT)0;"" }\n" +"__kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT *input_q,//[batch,seqLenQ/4,headNum,headDim,seqLenQ_4]\n" +" __global const FLOAT *input_k,// [batch,seqLenKV/4,headNum/group,headDim,seqLenKV_4]\n" +" __global const FLOAT *input_v,// [batch,seqLenKV/4,headNum/group,headDim,seqLenKV_4]\n" +" __global FLOAT *output_q,// [batch*headNum,ROUND_UP(headDim,mTileHDK),ROUND_UP(seqLenQ,mTileQ)]\n" +" __global FLOAT *output_k,// [batch*headNum/group,ROUND_UP(headDim,mTileHDK),ROUND_UP(seqLenKV,mTileKV)]\n" +" __global FLOAT *output_v,// [batch*headNum/group,ROUND_UP(seqLenKV,mTileKV),ROUND_UP(headDim,mTileHDN)]\n" +" __global FLOAT *past_k,// [batch,seqLenKV/4,headNum/group,headDim,seqLenKV_4]\n" +" __global FLOAT *past_v,// [batch,seqLenKV/4,headNum/group,headDim,seqLenKV_4]\n" +" __private const int4 tile,// [mTileQ,mTileKV,mTileHDK,mTileHDN]\n" +" __private const int4 shape,// [seqLenQ,seqLenKV,headNum,headDim]\n" +" __private const int4 param // [group,batch]\n" +") {\n" +" const int sl=get_global_id(0); // seqLen/4 : max(seqLenPackQ/4,seqLenPackKV/4)\n" +" const int hd=get_global_id(1); // headDim/4 : max(headDimPackQK/4,headDimPackV/4)\n" +" const int z=get_global_id(2); // batch*headNum\n" +" DEAL_NON_UNIFORM_DIM3(sl,hd,z);\n" +" \n" +" const int seqLenQ=shape.x;\n" +" const int seqLenKV=shape.y;\n" +" const int headNum=shape.z;\n" +" const int headDim=shape.w;\n" +" const int group=param.x;\n" +" const int batch=param.y;\n" +" const int b=z % batch;\n" +" const int hn=z/batch;\n" +" \n" +" const int seqLenQ_4=(seqLenQ+3)/4;\n" +" //const int in_offset_q=(((b*seqLenQ_4+sl)*headNum+hn)*headDim+4*hd)*4;\n" +" const int in_offset_q=(((b*seqLenQ+sl*4)*headNum+hn)*headDim+4*hd);\n" +" const int seqLenPackQ=((seqLenQ+tile.x-1)/tile.x)*tile.x;\n" +" const int headDimPackQK=((headDim+tile.z-1)/tile.z)*tile.z;\n" +" const int out_offset_q=(((b*headNum+hn)*headDimPackQK+hd*4)*seqLenPackQ+sl*4);\n" +" \n" +" if(sl*4= seqLenQ || hd*4 >= headDim) {\n" +" vstore4((FLOAT4)0,0,output_q+out_offset_q);\n" +" vstore4((FLOAT4)0,0,output_q+out_offset_q+seqLenPackQ);\n" +" vstore4((FLOAT4)0,0,output_q+out_offset_q+2*seqLenPackQ);\n" +" vstore4((FLOAT4)0,0,output_q+out_offset_q+3*seqLenPackQ);\n" +" } else {\n" +" FLOAT4 temp_0=vload4(0,input_q+in_offset_q);\n" +" FLOAT4 temp_1=(sl*4+1 >= seqLenQ) ? 
(FLOAT4)0 : vload4(0,input_q+in_offset_q+headNum*headDim);\n" +" FLOAT4 temp_2=(sl*4+2 >= seqLenQ) ? (FLOAT4)0 : vload4(0,input_q+in_offset_q+2*headNum*headDim);\n" +" FLOAT4 temp_3=(sl*4+3 >= seqLenQ) ? (FLOAT4)0 : vload4(0,input_q+in_offset_q+3*headNum*headDim);\n" +" #ifdef HEADDIM_LEAVE\n" +" DEAL_INNER_HEADDIM_NOT_ALIGN(headDim)\n" +" #endif\n" +" #ifdef SEQLEN_LEAVE\n" +" DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenQ)\n" +" #endif\n" +" vstore4((FLOAT4)(temp_0.s0,temp_1.s0,temp_2.s0,temp_3.s0),0,output_q+out_offset_q);\n" +" vstore4((FLOAT4)(temp_0.s1,temp_1.s1,temp_2.s1,temp_3.s1),0,output_q+out_offset_q+seqLenPackQ);\n" +" vstore4((FLOAT4)(temp_0.s2,temp_1.s2,temp_2.s2,temp_3.s2),0,output_q+out_offset_q+2*seqLenPackQ);\n" +" vstore4((FLOAT4)(temp_0.s3,temp_1.s3,temp_2.s3,temp_3.s3),0,output_q+out_offset_q+3*seqLenPackQ);\n" +" }\n" +" }\n" +" \n" +" if(hn >= headNum/group) {\n" +" return;\n" +" }\n" +" \n" +" const int seqLenPackKV=((seqLenKV+tile.y-1)/tile.y)*tile.y;\n" +" const int headDimPackV=((headDim+tile.w-1)/tile.w)*tile.w;\n" +" const int seqLenKV_4=(seqLenKV+3)/4;\n" +" const int in_offset_kv=(((b*seqLenKV+sl*4)*headNum/group+hn)*headDim+4*hd);\n" +" \n" +" if(sl*4= seqLenKV || hd*4 >= headDim) {\n" +" vstore4((FLOAT4)0,0,output_k+out_offset_k);\n" +" vstore4((FLOAT4)0,0,output_k+out_offset_k+seqLenPackKV);\n" +" vstore4((FLOAT4)0,0,output_k+out_offset_k+2*seqLenPackKV);\n" +" vstore4((FLOAT4)0,0,output_k+out_offset_k+3*seqLenPackKV);\n" +" } else {\n" +" FLOAT4 temp_0=vload4(0,input_k+in_offset_kv);\n" +" FLOAT4 temp_1=(sl*4+1 >= seqLenKV) ? (FLOAT4)0 : vload4(0,input_k+in_offset_kv+headNum*headDim/group);\n" +" FLOAT4 temp_2=(sl*4+2 >= seqLenKV) ? (FLOAT4)0 : vload4(0,input_k+in_offset_kv+2*headNum*headDim/group);\n" +" FLOAT4 temp_3=(sl*4+3 >= seqLenKV) ? (FLOAT4)0 : vload4(0,input_k+in_offset_kv+3*headNum*headDim/group);\n" +" #ifdef HEADDIM_LEAVE\n" +" DEAL_INNER_HEADDIM_NOT_ALIGN(headDim)\n" +" #endif\n" +" #ifdef SEQLEN_LEAVE\n" +" DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenKV)\n" +" #endif\n" +" vstore4((FLOAT4)(temp_0.s0,temp_1.s0,temp_2.s0,temp_3.s0),0,output_k+out_offset_k);\n" +" vstore4((FLOAT4)(temp_0.s1,temp_1.s1,temp_2.s1,temp_3.s1),0,output_k+out_offset_k+seqLenPackKV);\n" +" vstore4((FLOAT4)(temp_0.s2,temp_1.s2,temp_2.s2,temp_3.s2),0,output_k+out_offset_k+2*seqLenPackKV);\n" +" vstore4((FLOAT4)(temp_0.s3,temp_1.s3,temp_2.s3,temp_3.s3),0,output_k+out_offset_k+3*seqLenPackKV);\n" +" \n" +" // pastK\n" +" vstore4(temp_0,0,past_k+in_offset_kv);\n" +" if(sl*4+1= seqLenKV || hd*4 >= headDim) {\n" +" vstore4((FLOAT4)0,0,output_v+out_offset_v);\n" +" vstore4((FLOAT4)0,0,output_v+out_offset_v+headDimPackV);\n" +" vstore4((FLOAT4)0,0,output_v+out_offset_v+2*headDimPackV);\n" +" vstore4((FLOAT4)0,0,output_v+out_offset_v+3*headDimPackV);\n" +" } else {\n" +" FLOAT4 temp_0=vload4(0,input_v+in_offset_kv);\n" +" FLOAT4 temp_1=(sl*4+1 >= seqLenKV) ? (FLOAT4)0 : vload4(0,input_v+in_offset_kv+headNum*headDim/group);\n" +" FLOAT4 temp_2=(sl*4+2 >= seqLenKV) ? (FLOAT4)0 : vload4(0,input_v+in_offset_kv+2*headNum*headDim/group);\n" +" FLOAT4 temp_3=(sl*4+3 >= seqLenKV) ? 
(FLOAT4)0 : vload4(0,input_v+in_offset_kv+3*headNum*headDim/group);\n" +" #ifdef HEADDIM_LEAVE\n" +" DEAL_INNER_HEADDIM_NOT_ALIGN(headDim)\n" +" #endif\n" +" #ifdef SEQLEN_LEAVE\n" +" DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenKV)\n" +" #endif\n" +" vstore4(temp_0,0,output_v+out_offset_v);\n" +" vstore4(temp_1,0,output_v+out_offset_v+headDimPackV);\n" +" vstore4(temp_2,0,output_v+out_offset_v+2*headDimPackV);\n" +" vstore4(temp_3,0,output_v+out_offset_v+3*headDimPackV);\n" +" \n" +" // pastV\n" +" vstore4(temp_0,0,past_v+in_offset_kv);\n" +" if(sl*4+1= shape.x || sl_kv*4 >= shape.y) {\n" +" vstore4((MASK_DTYPE4)0,0,output_mask+out_offset);\n" +" vstore4((MASK_DTYPE4)0,0,output_mask+out_offset+seq_len_kv_pack);\n" +" vstore4((MASK_DTYPE4)0,0,output_mask+out_offset+seq_len_kv_pack*2);\n" +" vstore4((MASK_DTYPE4)0,0,output_mask+out_offset+seq_len_kv_pack*3);\n" +" } else {\n" +" int y_down_align4=(shape.y/4*4);\n" +" MASK_DTYPE4 temp_0,temp_1,temp_2,temp_3;\n" +" \n" +" if(sl_kv*4= shape.x) ? (MASK_DTYPE4)0 : vload4(0,input_mask+in_offset+shape.y);\n" +" temp_2=(sl*4+2 >= shape.x) ? (MASK_DTYPE4)0 : vload4(0,input_mask+in_offset+shape.y*2);\n" +" temp_3=(sl*4+3 >= shape.x) ? (MASK_DTYPE4)0 : vload4(0,input_mask+in_offset+shape.y*3);\n" +" } else if(sl_kv*4+1 == shape.y){\n" +" temp_0=(MASK_DTYPE4)(input_mask[in_offset],0,0,0);\n" +" temp_1=(sl*4+1 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y],0,0,0);//vload4(0,input_mask+in_offset+shape.y);\n" +" temp_2=(sl*4+2 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y*2],0,0,0);//vload4(0,input_mask+in_offset+shape.y*2);\n" +" temp_3=(sl*4+3 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y*3],0,0,0);//vload4(0,input_mask+in_offset+shape.y*3);\n" +" } else if(sl_kv*4+2 == shape.y){\n" +" temp_0=(MASK_DTYPE4)(input_mask[in_offset],input_mask[in_offset+1],0,0);\n" +" temp_1=(sl*4+1 >= shape.x) ? (MASK_DTYPE4)0 : (FLOAT4)(input_mask[in_offset+shape.y],input_mask[in_offset+shape.y+1],0,0);//vload4(0,input_mask+in_offset+shape.y);\n" +" temp_2=(sl*4+2 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y*2],input_mask[in_offset+shape.y*2+1],0,0);//vload4(0,input_mask+in_offset+shape.y*2);\n" +" temp_3=(sl*4+3 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y*3],input_mask[in_offset+shape.y*3+1],0,0);//vload4(0,input_mask+in_offset+shape.y*3);\n" +" } else if(sl_kv*4+3 == shape.y){\n" +" temp_0=(MASK_DTYPE4)(input_mask[in_offset],input_mask[in_offset+1],input_mask[in_offset+2],0);\n" +" temp_1=(sl*4+1 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y],input_mask[in_offset+shape.y+1],input_mask[in_offset+shape.y+2],0);//vload4(0,input_mask+in_offset+shape.y);\n" +" temp_2=(sl*4+2 >= shape.x) ? (MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y*2],input_mask[in_offset+shape.y*2+1],input_mask[in_offset+shape.y*2+2],0);//vload4(0,input_mask+in_offset+shape.y*2);\n" +" temp_3=(sl*4+3 >= shape.x) ? 
(MASK_DTYPE4)0 : (MASK_DTYPE4)(input_mask[in_offset+shape.y*3],input_mask[in_offset+shape.y*3+1],input_mask[in_offset+shape.y*3+2],0);//vload4(0,input_mask+in_offset+shape.y*3);\n" +" }\n" +" vstore4(temp_0,0,output_mask+out_offset);\n" +" vstore4(temp_1,0,output_mask+out_offset+seq_len_kv_pack);\n" +" vstore4(temp_2,0,output_mask+out_offset+2*seq_len_kv_pack);\n" +" vstore4(temp_3,0,output_mask+out_offset+3*seq_len_kv_pack);\n" +" }\n" +"}\n" +"__kernel void qkv_transpose_output(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT *input,// [Batch*mNumHead,ROUND_UP(mHeadDim,mTileHDN),ROUND_UP(seqLen,mTileQ)]\n" +" __global FLOAT *output,// [Batch,seqLen/4,mNumHead, mHeadDim,4]\n" +" __private const int tile_q,\n" +" __private const int tile_hdn,\n" +" __private const int seq_len,\n" +" __private const int head_num,\n" +" __private const int head_dim\n" +") {\n" +" \n" +" const int sl=get_global_id(0); // seqLen_4\n" +" const int hd=get_global_id(1); // mHeadDim_4\n" +" const int z=get_global_id(2); // Batch*mNumHead\n" +" DEAL_NON_UNIFORM_DIM3(sl,hd,z);\n" +" \n" +" const int b=z/head_num;\n" +" const int hn=z % head_num;\n" +" \n" +" const int seq_len_pack=((seq_len+tile_q-1)/tile_q)*tile_q;\n" +" const int head_dim_pack=((head_dim+tile_hdn-1)/tile_hdn)*tile_hdn;\n" +" \n" +" const int offset_inp=((b*head_num+hn)*head_dim_pack+4*hd)*seq_len_pack+4*sl;\n" +" \n" +" const int offset_out=(((b*seq_len+sl*4)*head_num+hn)*head_dim+4*hd);\n" +" \n" +" // Q\n" +" FLOAT4 temp_0=vload4(0,input+offset_inp);\n" +" FLOAT4 temp_1=vload4(0,input+offset_inp+seq_len_pack);\n" +" FLOAT4 temp_2=vload4(0,input+offset_inp+2*seq_len_pack);\n" +" FLOAT4 temp_3=vload4(0,input+offset_inp+3*seq_len_pack);\n" +" \n" +" vstore4((FLOAT4)(temp_0.s0,temp_1.s0,temp_2.s0,temp_3.s0),0,output+offset_out);\n" +" if(4*sl+1 >= seq_len) return;\n" +" vstore4((FLOAT4)(temp_0.s1,temp_1.s1,temp_2.s1,temp_3.s1),0,output+offset_out+head_num*head_dim);\n" +" if(4*sl+2 >= seq_len) return;\n" +" vstore4((FLOAT4)(temp_0.s2,temp_1.s2,temp_2.s2,temp_3.s2),0,output+offset_out+2*head_num*head_dim);\n" +" if(4*sl+3 >= seq_len) return;\n" +" vstore4((FLOAT4)(temp_0.s3,temp_1.s3,temp_2.s3,temp_3.s3),0,output+offset_out+3*head_num*head_dim);\n" +"}\n" +"#ifndef NUMHEAD_GROUP_SIZE\n" +"#define NUMHEAD_GROUP_SIZE 1\n" +"#endif\n" "__kernel void matmul_qk_div_mask(GLOBAL_SIZE_3_DIMS\n" -" __global const FLOAT *input0,// query [1 query_seq_len/4 head_num head_dim 4]\n" -" __global const FLOAT *input1,// key [1 key_seq_len/4 head_num head_dim 4]\n" -" __global FLOAT *output,// prefill [1 head_num query_seq_len/4 key_seq_len 4] decode[1 head_num key_seq_len/4 4]\n" -" __global FLOAT *past_key,// [1 head_num max_length/4 head_dim 4]\n" -"#ifdef ADD_MASK\n" +" __global const FLOAT *input0,// query [1 query_seq_len head_num head_dim]\n" +" __global const FLOAT *input1,// key [1 key_seq_len head_num head_dim]\n" +" __global FLOAT *output,// prefill [1 head_num query_seq_len key_seq_len] decode[1 head_num key_seq_len/4 4]\n" +" __global FLOAT *past_key,// [1 max_length head_num head_dim]\n" +" #ifdef ADD_MASK\n" " __global const FLOAT* mask,\n" -"#else\n" -" __global const int* mask,// [1 1 query_seq_len key_seq_len 4]\n" -"#endif\n" +" #else\n" +" __global const int* mask,// [1 1 query_seq_len key_seq_len]\n" +" #endif\n" " __private const float scale,\n" " __private const int query_seq_len,\n" " __private const int key_seq_len,\n" " __private const int head_num,\n" " __private const int kv_head_num,\n" " __private const int head_dim) {\n" -" const int 
x=get_global_id(0); // query_seq_len/4 for prefill 1 for decode\n" -" const int y=get_global_id(1); // head_num\n" -" const int z=get_global_id(2); // key_seq_len/4\n" +" \n" +" const int x=get_global_id(0); // key_seq_len\n" +" const int y=get_global_id(1); // query_seq_len for prefill 1 for decode\n" +" const int z=get_global_id(2); // head_num\n" " DEAL_NON_UNIFORM_DIM3(x,y,z);\n" " \n" -" int yin=y/NUMHEAD_GROUP_SIZE;\n" -" const int offset=head_num*head_dim*4;\n" -" const int offset_head=y*head_dim*4;\n" -" __global const FLOAT *A_offset=input0+x*offset+offset_head;\n" -" __global FLOAT *Pastkey_offset=past_key+(z*kv_head_num+yin)*head_dim*4;\n" -" const int z4=z << 2;\n" -" float4 Vscale=(float4)scale;\n" +" int x4=x << 2;\n" +" int y4=y << 2;\n" +" int zin=z/NUMHEAD_GROUP_SIZE;\n" +" __global const FLOAT *A_offset=input0+(y4*head_num+z)*head_dim;\n" +" __global FLOAT *Pastkey_offset=past_key+(x4*kv_head_num+zin)*head_dim;\n" +" int strideA=head_num*head_dim;\n" +" int strideB=kv_head_num*head_dim;\n" "#ifdef OPENCL_PREFILL_ATTENTION\n" -" __global const FLOAT *B_offset=input1+(z*kv_head_num+yin)*head_dim*4;\n" -" const int x4=x << 2;\n" -" const int query_seq_len4=(query_seq_len+3)/4;\n" -" const int output_offset=y*query_seq_len4*key_seq_len*4;\n" +" __global const FLOAT *B_offset=input1+(x4*kv_head_num+zin)*head_dim;\n" +" int output_offset=(z*query_seq_len+y4)*key_seq_len+x4;\n" " float4 out0=0;\n" " float4 out1=0;\n" " float4 out2=0;\n" " float4 out3=0;\n" " \n" -" const int head_dim4=(head_dim+3)/4;\n" -"#ifdef HEADDIM_LEAVE\n" -" for(int i=0; i= key_seq_len) return;\n" -" vstore4(CONVERT_FLOAT4(out1),0,output+output_offset+x*key_seq_len*4+(z4+1)*4);\n" -" if(z4+2 >= key_seq_len) return;\n" -" vstore4(CONVERT_FLOAT4(out2),0,output+output_offset+x*key_seq_len*4+(z4+2)*4);\n" -" if(z4+3 >= key_seq_len) return;\n" -" vstore4(CONVERT_FLOAT4(out3),0,output+output_offset+x*key_seq_len*4+(z4+3)*4);\n" +" #endif\n" +" if(B3_enable){\n" +" vstore4(CONVERT_FLOAT4(out0),0,output+output_offset);\n" +" if(!A1_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore4(CONVERT_FLOAT4(out1),0,output+output_offset);\n" +" if(!A2_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore4(CONVERT_FLOAT4(out2),0,output+output_offset);\n" +" if(!A3_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore4(CONVERT_FLOAT4(out3),0,output+output_offset);\n" +" } else if(B2_enable){\n" +" vstore3(CONVERT_FLOAT3((float3)(out0.x,out0.y,out0.z)),0,output+output_offset);\n" +" if(!A1_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore3(CONVERT_FLOAT3((float3)(out1.x,out1.y,out1.z)),0,output+output_offset);\n" +" if(!A2_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore3(CONVERT_FLOAT3((float3)(out2.x,out2.y,out2.z)),0,output+output_offset);\n" +" if(!A3_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore3(CONVERT_FLOAT3((float3)(out3.x,out3.y,out3.z)),0,output+output_offset);\n" +" } else if(B1_enable){\n" +" vstore2(CONVERT_FLOAT2((float2)(out0.x,out0.y)),0,output+output_offset);\n" +" if(!A1_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore2(CONVERT_FLOAT2((float2)(out1.x,out1.y)),0,output+output_offset);\n" +" if(!A2_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore2(CONVERT_FLOAT2((float2)(out2.x,out2.y)),0,output+output_offset);\n" +" if(!A3_enable) return;\n" +" output_offset += key_seq_len;\n" +" vstore2(CONVERT_FLOAT2((float2)(out3.x,out3.y)),0,output+output_offset);\n" +" } else {\n" +" 
output[output_offset]=out0.x;\n" +" if(!A1_enable) return;\n" +" output[output_offset+key_seq_len]=out1.x;\n" +" if(!A2_enable) return;\n" +" output[output_offset+key_seq_len+key_seq_len]=out2.x;\n" +" if(!A3_enable) return;\n" +" output[output_offset+key_seq_len+key_seq_len+key_seq_len]=out3.x;\n" +" }\n" "#else\n" -" __global const FLOAT *B_offset=input1+yin*head_dim*4;\n" -" const int key_seq_len4=(key_seq_len+3)/4;\n" " float4 out=0;\n" " const int head_dim4=(head_dim+3)/4;\n" -" \n" -"#ifdef HEADDIM_LEAVE\n" +" int key_seq_len4=(key_seq_len+3)/4;\n" +" #ifdef HEADDIM_LEAVE\n" " for(int i=0; i= 4){\n" +" vstore4(CONVERT_FLOAT4(out),0,output+z*key_seq_len+x4);\n" +" } else if (remain >= 3){\n" +" vstore3(CONVERT_FLOAT3((float3)(out.x,out.y,out.z)),0,output+z*key_seq_len+x4);\n" +" } else if (remain >= 2){\n" +" vstore2(CONVERT_FLOAT2((float2)(out.x,out.y)),0,output+z*key_seq_len+x4);\n" +" } else {\n" +" output[z*key_seq_len+x4]=out.x;\n" " }\n" -" out *= Vscale;\n" -" vstore4(CONVERT_FLOAT4(out),0,output+y*key_seq_len4*4+z4);\n" "#endif\n" "}\n" "__kernel void matmul_qkv(GLOBAL_SIZE_3_DIMS\n" -" __global const FLOAT *input0,// qk prefill [1 head_num qk_seq_len/4 value_seq_len 4] decode[1 head_num value_seq_len/4 4]\n" -" __global const FLOAT *input1,// [1 value_seq_len/4 head_num head_dim 4]\n" -" __global FLOAT *output,// [1 qk_seq_len head_num*head_dim 1 4]\n" -" __global FLOAT *past_value,// [1 value_seq_len/4 head_num head_dim 4]\n" +" __global const FLOAT *input0,// qk prefill [1 head_num qk_seq_len value_seq_len] decode[1 head_num value_seq_len]\n" +" __global const FLOAT *input1,// [1 value_seq_len head_num head_dim]\n" +" __global FLOAT *output,// [1 qk_seq_len head_num head_dim]\n" +" __global FLOAT *past_value,// [1 value_seq_len head_num head_dim]\n" " __private const int qk_seq_len,\n" " __private const int value_seq_len,\n" " __private const int head_num,\n" " __private const int kv_head_num,\n" " __private const int head_dim) {\n" -" const int x=get_global_id(0); // prefill qk_seq_len/4 decode 1\n" +" \n" +" const int x=get_global_id(0); // head_dim << 2\n" " const int y=get_global_id(1); // head_num\n" -" const int z=get_global_id(2); // head_dim << 2\n" -" const int z4=z << 2;\n" +" const int z=get_global_id(2); // prefill qk_seq_len decode 1\n" +" \n" +" const int x4=x << 2;\n" " DEAL_NON_UNIFORM_DIM3(x,y,z);\n" " \n" " const int yin=y/NUMHEAD_GROUP_SIZE;\n" "#ifdef OPENCL_PREFILL_ATTENTION\n" -" const int offset=head_num*head_dim*4;\n" -" const int stride=kv_head_num*head_dim*4;\n" -" const int offset_head=y*head_dim*4+z4*4;\n" -" const int value_seq_len4=(value_seq_len+3)/4;\n" -" const int qk_seq_len4=(qk_seq_len+3)/4;\n" -" __global const FLOAT *A_offset=input0+(y*qk_seq_len4+x)*value_seq_len*4;\n" -" __global const FLOAT *B_offset=input1+yin*head_dim*4+z4*4;\n" -" __global FLOAT *Pastvalue_offset=past_value+yin*head_dim*4+z4*4;\n" +" int z4=z << 2;\n" +" int value_seq_len4=(value_seq_len+3)/4;\n" +" int loop_end=max(value_seq_len4-1,0);\n" +" const int stride=kv_head_num*head_dim;\n" +" __global const FLOAT *A_offset=input0+(y*qk_seq_len+z4)*value_seq_len;\n" +" __global const FLOAT *B_offset=input1+yin*head_dim+x4;\n" +" __global FLOAT *Pastvalue_offset=past_value+yin*head_dim+x4;\n" " COMPUTE_FLOAT4 out0=0;\n" " COMPUTE_FLOAT4 out1=0;\n" " COMPUTE_FLOAT4 out2=0;\n" " COMPUTE_FLOAT4 out3=0;\n" " \n" -" for(int i=0; i= head_dim) return;\n" -" vstore4(CONVERT_FLOAT4(out1),0,output+x*offset+(y*head_dim+z4+1)*4);\n" -" 
vstore4(CONVERT_FLOAT4(B.s4567),1,Pastvalue_offset+(value_seq_len4-1)*stride);\n" -" if(z4+2 >= head_dim) return;\n" -" vstore4(CONVERT_FLOAT4(out2),0,output+x*offset+(y*head_dim+z4+2)*4);\n" -" vstore4(CONVERT_FLOAT4(B.s89ab),2,Pastvalue_offset+(value_seq_len4-1)*stride);\n" -" if(z4+3 >= head_dim) return;\n" -" vstore4(CONVERT_FLOAT4(out3),0,output+x*offset+(y*head_dim+z4+3)*4);\n" -" vstore4(CONVERT_FLOAT4(B.scdef),3,Pastvalue_offset+(value_seq_len4-1)*stride);\n" -"#else\n" -" COMPUTE_FLOAT16 B=CONVERT_COMPUTE_FLOAT16(vload16(0,B_offset+(value_seq_len4-1)*stride));\n" -" vstore16(CONVERT_FLOAT16(B),0,Pastvalue_offset+(value_seq_len4-1)*stride);\n" -" COMPUTE_FLOAT *B_ptr=(COMPUTE_FLOAT*)&B;\n" -" for(int i=(value_seq_len4-1)*4,j=0; i= 4){\n" +" vstore4(CONVERT_FLOAT4(out0),0,output+output_offset);\n" +" } else if(remain == 3){\n" +" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out0.x,out0.y,out0.z)),0,output+output_offset);\n" +" } else if(remain == 2){\n" +" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out0.x,out0.y)),0,output+output_offset);\n" +" } else{\n" +" output[output_offset]=out0.x;\n" +" }\n" +" if(z4+1 >= qk_seq_len) return;\n" +" output_offset += head_num*head_dim;\n" +" if(remain >= 4){\n" +" vstore4(CONVERT_FLOAT4(out1),0,output+output_offset);\n" +" } else if(remain == 3){\n" +" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out1.x,out1.y,out1.z)),0,output+output_offset);\n" +" } else if(remain == 2){\n" +" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out1.x,out1.y)),0,output+output_offset);\n" +" } else{\n" +" output[output_offset]=out1.x;\n" +" }\n" +" if(z4+2 >= qk_seq_len) return;\n" +" output_offset += head_num*head_dim;\n" +" if(remain >= 4){\n" +" vstore4(CONVERT_FLOAT4(out2),0,output+output_offset);\n" +" } else if(remain == 3){\n" +" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out2.x,out2.y,out2.z)),0,output+output_offset);\n" +" } else if(remain == 2){\n" +" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out2.x,out2.y)),0,output+output_offset);\n" +" } else{\n" +" output[output_offset]=out2.x;\n" +" }\n" +" if(z4+3 >= qk_seq_len) return;\n" +" output_offset += head_num*head_dim;\n" +" if(remain >= 4){\n" +" vstore4(CONVERT_FLOAT4(out3),0,output+output_offset);\n" +" } else if(remain == 3){\n" +" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out3.x,out3.y,out3.z)),0,output+output_offset);\n" +" } else if(remain == 2){\n" +" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out3.x,out3.y)),0,output+output_offset);\n" +" } else{\n" +" output[(x*head_num+y)*head_dim+z4]=out3.x;\n" +" }\n" +" #else\n" +" int output_offset=(z4*head_num+y)*head_dim+x4;\n" +" vstore4(CONVERT_FLOAT4(out0),0,output+output_offset);\n" +" if(z4+1 >= qk_seq_len) return;\n" +" output_offset += head_num*head_dim;\n" +" vstore4(CONVERT_FLOAT4(out1),0,output+output_offset);\n" +" if(z4+2 >= qk_seq_len) return;\n" +" output_offset += head_num*head_dim;\n" +" vstore4(CONVERT_FLOAT4(out2),0,output+output_offset);\n" +" if(z4+3 >= qk_seq_len) return;\n" +" output_offset += head_num*head_dim;\n" +" vstore4(CONVERT_FLOAT4(out3),0,output+output_offset);\n" +" #endif\n" "#else\n" -" const int value_seq_len4=(value_seq_len+3)/4;\n" -" const int stride=kv_head_num*head_dim*4;\n" -" const int offset=head_num*head_dim*4;\n" -" const int offset_head=y*head_dim*4+z4*4;\n" -" const int loop=(value_seq_len+2)/4;\n" -" __global const FLOAT *A_offset=input0+y*value_seq_len4*4;\n" -" __global const FLOAT *B_offset=input1+yin*head_dim*4+z4*4;\n" -" __global FLOAT *Pastvalue_offset=past_value+yin*head_dim*4+z4*4;\n" +" int 
value_seq_len4=(value_seq_len-1+3)/4;\n" +" int loop_end=max(value_seq_len4-1,0);\n" +" const int stride=kv_head_num*head_dim;\n" +" __global const FLOAT *A_offset=input0+y*value_seq_len;\n" +" __global const FLOAT *B_offset=input1+yin*head_dim+x4;\n" +" __global FLOAT *Pastvalue_offset=past_value+yin*head_dim+x4;\n" " COMPUTE_FLOAT4 out=0;\n" " \n" -" for(int i=0; i> 2)*stride+((value_seq_len-1) % 4);\n" -" \n" -"#ifdef HEADDIM_LEAVE\n" -" Pastvalue_offset[index]=B0;\n" -" output[(y*head_dim+z4)*4]=out.s0;\n" -" if(z4+1 >= head_dim) return;\n" -" Pastvalue_offset[index+4]=B1;\n" -" output[(y*head_dim+z4+1)*4]=out.s1;\n" -" if(z4+2 >= head_dim) return;\n" -" Pastvalue_offset[index+8]=B2;\n" -" output[(y*head_dim+z4+2)*4]=out.s2;\n" -" if(z4+3 >= head_dim) return;\n" -" Pastvalue_offset[index+12]=B3;\n" -" output[(y*head_dim+z4+3)*4]=out.s3;\n" -"#else\n" -" Pastvalue_offset[index]=B0;\n" -" Pastvalue_offset[index+4]=B1;\n" -" Pastvalue_offset[index+8]=B2;\n" -" Pastvalue_offset[index+12]=B3;\n" +" COMPUTE_FLOAT4 B=CONVERT_COMPUTE_FLOAT4(vload4(0,B_offset));\n" +" out=mad(B,(COMPUTE_FLOAT4)A,out);\n" " \n" -" output[(y*head_dim+z4)*4]=out.s0;\n" -" output[(y*head_dim+z4+1)*4]=out.s1;\n" -" output[(y*head_dim+z4+2)*4]=out.s2;\n" -" output[(y*head_dim+z4+3)*4]=out.s3;\n" -"#endif\n" +" #ifdef HEADDIM_LEAVE\n" +" int remain=head_dim-x4;\n" +" if(remain >= 4){\n" +" vstore4(CONVERT_FLOAT4(out),0,output+y*head_dim+x4);\n" +" vstore4(CONVERT_FLOAT4(B),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" +" } else if(remain == 3){\n" +" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out.x,out.y,out.z)),0,output+y*head_dim+x4);\n" +" vstore3(CONVERT_FLOAT4((COMPUTE_FLOAT3)(B.x,B.y,B.z)),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" +" } else if(remain == 2){\n" +" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out.x,out.y)),0,output+y*head_dim+x4);\n" +" vstore2(CONVERT_FLOAT4((COMPUTE_FLOAT3)(B.x,B.y)),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" +" } else{\n" +" output[(x*head_num+y)*head_dim+x4]=out.x;\n" +" Pastvalue_offset[(value_seq_len-1)*stride]=B.x;\n" +" }\n" +" #else\n" +" vstore4(CONVERT_FLOAT4(B),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" +" vstore4(CONVERT_FLOAT4(out),0,output+y*head_dim+x4);\n" +" #endif\n" " \n" "#endif\n" "}\n" @@ -13482,6 +11953,7 @@ const char* unary_subgroup_buf = " __private const int width,\n" " __private const int height,\n" " __private const int channel,\n" +" __private const int batch,\n" " __private const int input_pad_left,__private const int input_pad_right,\n" " __private const int output_pad_left,__private const int output_pad_right) {\n" " const int channel_block_idx=get_global_id(0);\n" @@ -13490,8 +11962,7 @@ const char* unary_subgroup_buf = " DEAL_NON_UNIFORM_DIM3(channel_block_idx,w,hb);\n" " const int batch_idx=hb/height;\n" " const int height_idx=hb % height;\n" -" const int channel4=(channel+3)/4;\n" -" const int offset=(((batch_idx*channel4+channel_block_idx)*height+height_idx)*width+w)*4;\n" +" const int offset=(((batch_idx+channel_block_idx*batch)*height+height_idx)*width+w)*4;\n" " float4 in=convert_float4(vload4(0,input+offset));\n" " float4 out=OPERATOR;\n" " vstore4(CONVERT_OUTPUT4(out),0,output+offset);\n" @@ -13502,6 +11973,7 @@ const char* unary_subgroup_buf = " __private const int width,\n" " __private const int height,\n" " __private const int channel,\n" +" __private const int batch,\n" " __private const int input_pad_left,__private const int input_pad_right,\n" " __private const int output_pad_left,__private const int output_pad_right) 
{\n" " const int channel_block_idx=get_global_id(0);\n" @@ -13511,10 +11983,9 @@ const char* unary_subgroup_buf = " const int batch_idx=hb/height;\n" " const int height_idx=hb % height;\n" " const int dst_width=output_pad_left+width+output_pad_right;\n" -" const int channel4=(channel+3)/4;\n" " const int channel16=(channel+15)/16;\n" " const int channe_out_idx=channel_block_idx >> 2;\n" -" const int offset=(((batch_idx*channel4+channel_block_idx)*height+height_idx)*width+w)*4;\n" +" const int offset=(((batch_idx+channel_block_idx*batch)*height+height_idx)*width+w)*4;\n" " const int dst_offset=(((batch_idx*channel16+channe_out_idx)*height+height_idx)*dst_width+w+output_pad_left)*16+(channel_block_idx % 4)*4;\n" " float4 in=convert_float4(vload4(0,input+offset));\n" " float4 out=OPERATOR;\n" @@ -13537,6 +12008,7 @@ const char* unary_subgroup_buf = " __private const int width,\n" " __private const int height,\n" " __private const int channel,\n" +" __private const int batch,\n" " __private const int input_pad_left,__private const int input_pad_right,\n" " __private const int output_pad_left,__private const int output_pad_right) {\n" " const int channel_idx=get_group_id(0);\n" @@ -13578,6 +12050,7 @@ const char* unary_subgroup_buf = " __private const int width,\n" " __private const int height,\n" " __private const int channel,\n" +" __private const int batch,\n" " __private const int input_pad_left,__private const int input_pad_right,\n" " __private const int output_pad_left,__private const int output_pad_right) {\n" " const int channel_idx=get_group_id(0);\n" @@ -13587,10 +12060,9 @@ const char* unary_subgroup_buf = " const int batch_idx=hb/height;\n" " const int height_idx=hb % height;\n" " const int src_width=width+input_pad_left+input_pad_right;\n" -" const int channel4=(channel+3)/4;\n" " const int channel16=(channel+15)/16;\n" " const int src_offset=(((batch_idx*channel16+channel_idx)*height+height_idx)*src_width+w+input_pad_left)*16;\n" -" const int dst_offset=(((batch_idx*channel4+(channel_idx<<2))*height+height_idx)*width+w)*4;\n" +" const int dst_offset=(((batch_idx+(channel_idx<<2)*batch)*height+height_idx)*width+w)*4;\n" " const int height_width=height*width*4;\n" " \n" " float4 in=convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input+src_offset))));\n" @@ -14155,25 +12627,24 @@ const char* scale_buf = " __global const FLOAT* bias,\n" "#endif\n" " __global FLOAT* output,\n" -" __private const int4 shape) {//N,H,W,C4\n" -" const int out_w_c_idx=get_global_id(0);\n" -" const int out_h_b_idx=get_global_id(1);\n" -" \n" -" DEAL_NON_UNIFORM_DIM2(out_w_c_idx,out_h_b_idx);\n" -" const int out_b_idx=out_h_b_idx/shape.y;\n" -" const int out_h_idx=out_h_b_idx % shape.y;\n" -" const int out_c_idx=out_w_c_idx/shape.z;\n" -" const int out_w_idx=out_w_c_idx % shape.z;\n" +" __private const int channelBlock,\n" +" __private const int batch,\n" +" __private const int inside) {\n" +" const int x=get_global_id(0); // inside(width*height)\n" +" const int y=get_global_id(1); // channelBlock*batch\n" " \n" -" const int offset=(((out_b_idx*shape.w+out_c_idx)*shape.y+out_h_idx)*shape.z+out_w_idx)*4;\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" +" const int out_c_idx=y % channelBlock;\n" +" const int out_b_idx=y/channelBlock;\n" +" const int offset=((out_b_idx+out_c_idx*batch)*inside+x)*4;\n" " COMPUTE_FLOAT4 in_value=CONVERT_COMPUTE_FLOAT4(vload4(0,input+offset));\n" " COMPUTE_FLOAT4 scale_value=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,scale));\n" -"#ifdef BIAS\n" +" #ifdef BIAS\n" " 
COMPUTE_FLOAT4 bias_value=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias));\n" " COMPUTE_FLOAT4 out_value=in_value*scale_value+bias_value;\n" -"#else\n" +" #else\n" " COMPUTE_FLOAT4 out_value=in_value*scale_value;\n" -"#endif\n" +" #endif\n" " vstore4(CONVERT_FLOAT4(out_value),0,output+offset);\n" "}\n" ; @@ -14191,380 +12662,169 @@ const char* matmul_buf = " __global const FLOAT* input_c,\n" " #endif\n" " __global FLOAT* output_c,\n" -" __private const int channels,\n" -" __private const int channel_blocks,\n" -" __private const int width_blocks,\n" -" __private const int width) {\n" -" const int width_blocks_idx=get_global_id(0);// output W\n" -" const int height_idx=get_global_id(1);// output H\n" -" DEAL_NON_UNIFORM_DIM2(width_blocks_idx,height_idx);\n" -" COMPUTE_FLOAT4 a;\n" -" COMPUTE_FLOAT4 b0=0,b1=0,b2=0,b3=0;\n" -" COMPUTE_FLOAT4 v_zero=(COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0);\n" +" __private const int M,\n" +" __private const int N,\n" +" __private const int K) {\n" +" int2 pos=(int2)(get_global_id(0),get_global_id(1)); // N M\n" +" DEAL_NON_UNIFORM_DIM2(pos.x,pos.y);\n" +" const int idn=pos.x << 2;\n" +" const int idm=pos.y << 2;\n" +" \n" +" COMPUTE_FLOAT4 out[4];\n" " #ifdef BIAS\n" -" COMPUTE_FLOAT4 temp=CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx,input_c));\n" -" COMPUTE_FLOAT result0=temp.x;\n" -" COMPUTE_FLOAT result1=temp.y;\n" -" COMPUTE_FLOAT result2=temp.z;\n" -" COMPUTE_FLOAT result3=temp.w;\n" -" #else\n" -" COMPUTE_FLOAT result0=0;\n" -" COMPUTE_FLOAT result1=0;\n" -" COMPUTE_FLOAT result2=0;\n" -" COMPUTE_FLOAT result3=0;\n" -" #endif\n" -" const int remain=channel_blocks*4-channels;\n" -" for (short pos=0; pos= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+width_blocks,input_b));\n" -" b2=(remain >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+width_blocks*2,input_b));\n" -" b3=(remain >= 1) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+width_blocks*3,input_b));\n" -" if (remain == 3) {\n" -" a.y=0;\n" -" a.z=0;\n" -" a.w=0;\n" -" } else if (remain == 2) {\n" -" a.z=0;\n" -" a.w=0;\n" -" } else if (remain == 1) {\n" -" a.w=0;;\n" +" COMPUTE_FLOAT4 bias=CONVERT_COMPUTE_FLOAT4(vload4(0,input_c+idn));\n" +" #pragma unroll\n" +" for(int i=0; i<4; ++i){\n" +" out[i]=bias;\n" " }\n" -" COMPUTE_FLOAT4 btmp0=(COMPUTE_FLOAT4)(b0.s0,b1.s0,b2.s0,b3.s0);\n" -" COMPUTE_FLOAT4 btmp1=(COMPUTE_FLOAT4)(b0.s1,b1.s1,b2.s1,b3.s1);\n" -" COMPUTE_FLOAT4 btmp2=(COMPUTE_FLOAT4)(b0.s2,b1.s2,b2.s2,b3.s2);\n" -" COMPUTE_FLOAT4 btmp3=(COMPUTE_FLOAT4)(b0.s3,b1.s3,b2.s3,b3.s3);\n" -" result0 += dot(a,btmp0);\n" -" result1 += dot(a,btmp1);\n" -" result2 += dot(a,btmp2);\n" -" result3 += dot(a,btmp3);\n" +" #else\n" +" #pragma unroll\n" +" for(int i=0; i<4; ++i){\n" +" out[i]=(COMPUTE_FLOAT4)0;\n" " }\n" -" const int out_offset=height_idx*width_blocks+width_blocks_idx;\n" -" vstore4(CONVERT_FLOAT4((COMPUTE_FLOAT4)(result0,result1,result2,result3)),out_offset,output_c);\n" -"}\n" -"__kernel void matmul_transB_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a,\n" -" __global const FLOAT* input_b,\n" -" #ifdef BIAS\n" -" __global const FLOAT* input_c,\n" " #endif\n" -" __global FLOAT* output_c,\n" -" __private const int channels,\n" -" __private const int channel_blocks,\n" -" __private const int width_blocks,\n" -" __private const int width) {\n" -" const int width_blocks_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" DEAL_NON_UNIFORM_DIM2(width_blocks_idx,height_idx);\n" -" COMPUTE_FLOAT4 a;\n" -" COMPUTE_FLOAT4 b0=0,b1=0,b2=0,b3=0;\n" -" COMPUTE_FLOAT4 v_zero=(COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0);\n" -" #ifdef BIAS\n" -" COMPUTE_FLOAT4 temp=CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx,input_c));\n" -" COMPUTE_FLOAT result0=temp.x;\n" -" COMPUTE_FLOAT result1=temp.y;\n" -" COMPUTE_FLOAT result2=temp.z;\n" -" COMPUTE_FLOAT result3=temp.w;\n" +" const int K4=(K+3)/4;\n" +" #ifdef K_LEAVE\n" +" const int loop_end=max(K4-1,0);\n" +" const int remain=K-loop_end*4;\n" " #else\n" -" COMPUTE_FLOAT result0=0;\n" -" COMPUTE_FLOAT result1=0;\n" -" COMPUTE_FLOAT result2=0;\n" -" COMPUTE_FLOAT result3=0;\n" -" #endif\n" -" const int remaina=channel_blocks*4-channels;\n" -" const int remainb=(width_blocks_idx+1)*4-width;\n" -" for (short pos=0; pos= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks,input_b));\n" -" b2=(remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*2,input_b));\n" -" b3=(remainb >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*3,input_b));\n" -" result0 += dot(a,b0);\n" -" result1 += dot(a,b1);\n" -" result2 += dot(a,b2);\n" -" result3 += dot(a,b3);\n" -" }\n" -" \n" -" {\n" -" const int inpa_offset=height_idx*channel_blocks+channel_blocks-1;\n" -" a=CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset,input_a));\n" -" const int inpb_offset=(width_blocks_idx*4)*channel_blocks+channel_blocks-1;\n" -" b0=CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset,input_b));\n" -" b1=(remainb >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks,input_b));\n" -" b2=(remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*2,input_b));\n" -" b3=(remainb >= 1) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*3,input_b));\n" -" if (remaina == 3) {\n" -" a.y=0;\n" -" a.z=0;\n" -" a.w=0;\n" -" } else if (remaina == 2) {\n" -" a.z=0;\n" -" a.w=0;\n" -" } else if (remaina == 1) {\n" -" a.w=0;\n" -" }\n" -" result0 += dot(a,b0);\n" -" result1 += dot(a,b1);\n" -" result2 += dot(a,b2);\n" -" result3 += dot(a,b3);\n" -" }\n" -" const int out_offset=height_idx*width_blocks+width_blocks_idx;\n" -" vstore4(CONVERT_FLOAT4((COMPUTE_FLOAT4)(result0,result1,result2,result3)),out_offset,output_c);\n" -"}\n" -"__kernel void matmul_transA_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a,\n" -" __global const FLOAT* input_b,\n" -" #ifdef BIAS\n" -" __global const FLOAT* input_c,\n" +" const int loop_end=K4;\n" " #endif\n" -" __global FLOAT* output_c,\n" -" __private const int channels,\n" -" __private const int channel_blocks,\n" -" __private const int height,\n" -" __private const int height_blocks,\n" -" __private const int width_blocks,\n" -" __private const int width) {\n" -" const int width_blocks_idx=get_global_id(0);\n" -" const int height_blocks_idx=get_global_id(1);\n" -" DEAL_NON_UNIFORM_DIM2(width_blocks_idx,height_blocks_idx);\n" -" COMPUTE_FLOAT4 v_zero=(COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0);\n" -" #ifdef BIAS\n" -" COMPUTE_FLOAT4 result0=CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx,input_c));\n" -" COMPUTE_FLOAT4 result1=result0;\n" -" COMPUTE_FLOAT4 result2=result0;\n" -" COMPUTE_FLOAT4 result3=result0;\n" +" \n" +" #ifdef TRANSPOSE_A\n" +" __global const FLOAT* input_a_offset=input_a+idm; // K x M\n" " #else\n" -" COMPUTE_FLOAT4 result0=0;\n" -" COMPUTE_FLOAT4 result1=0;\n" -" COMPUTE_FLOAT4 result2=0;\n" -" COMPUTE_FLOAT4 result3=0;\n" -" #endif\n" -" \n" -" const int remain=channel_blocks*4-channels;\n" -" for (short pos=0; pos= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset+height_blocks,input_a)));\n" -" COMPUTE_FLOAT4 a2=((remain >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset+height_blocks*2,input_a)));\n" -" COMPUTE_FLOAT4 a3=((remain >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset+height_blocks*3,input_a)));\n" -" const int inpb_offset=(4*(channel_blocks-1))*width_blocks+width_blocks_idx;\n" -" COMPUTE_FLOAT4 b0=CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset,input_b));\n" -" COMPUTE_FLOAT4 b1=((remain >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+width_blocks,input_b)));\n" -" COMPUTE_FLOAT4 b2=((remain >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+width_blocks*2,input_b)));\n" -" COMPUTE_FLOAT4 b3=((remain >= 3) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+width_blocks*3,input_b)));\n" -" COMPUTE_FLOAT4 a0_trans=(COMPUTE_FLOAT4)(a0.x,a1.x,a2.x,a3.x);\n" -" COMPUTE_FLOAT4 a1_trans=(COMPUTE_FLOAT4)(a0.y,a1.y,a2.y,a3.y);\n" -" COMPUTE_FLOAT4 a2_trans=(COMPUTE_FLOAT4)(a0.z,a1.z,a2.z,a3.z);\n" -" COMPUTE_FLOAT4 a3_trans=(COMPUTE_FLOAT4)(a0.w,a1.w,a2.w,a3.w);\n" -" \n" -" COMPUTE_FLOAT4 b0_trans=(COMPUTE_FLOAT4)(b0.x,b1.x,b2.x,b3.x);\n" -" COMPUTE_FLOAT4 b1_trans=(COMPUTE_FLOAT4)(b0.y,b1.y,b2.y,b3.y);\n" -" COMPUTE_FLOAT4 b2_trans=(COMPUTE_FLOAT4)(b0.z,b1.z,b2.z,b3.z);\n" -" COMPUTE_FLOAT4 b3_trans=(COMPUTE_FLOAT4)(b0.w,b1.w,b2.w,b3.w);\n" -" //matmul\n" -" result0.x += dot(a0_trans,b0_trans);\n" -" result0.y += dot(a0_trans,b1_trans);\n" -" result0.z += dot(a0_trans,b2_trans);\n" -" result0.w += dot(a0_trans,b3_trans);\n" -" \n" -" result1.x += dot(a1_trans,b0_trans);\n" -" result1.y += dot(a1_trans,b1_trans);\n" -" result1.z += dot(a1_trans,b2_trans);\n" -" result1.w += dot(a1_trans,b3_trans);\n" -" \n" -" result2.x += dot(a2_trans,b0_trans);\n" -" result2.y += dot(a2_trans,b1_trans);\n" -" result2.z += dot(a2_trans,b2_trans);\n" -" result2.w += dot(a2_trans,b3_trans);\n" +" COMPUTE_FLOAT4 tmp0=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+kindex));\n" +" COMPUTE_FLOAT4 tmp1=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+kindex+K));\n" +" COMPUTE_FLOAT4 tmp2=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+kindex+2*K));\n" +" COMPUTE_FLOAT4 tmp3=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+kindex+3*K));\n" " \n" -" result3.x += dot(a3_trans,b0_trans);\n" -" result3.y += dot(a3_trans,b1_trans);\n" -" result3.z += dot(a3_trans,b2_trans);\n" -" result3.w += dot(a3_trans,b3_trans);\n" +" B[0]=(COMPUTE_FLOAT4)(tmp0.x,tmp1.x,tmp2.x,tmp3.x);\n" +" B[1]=(COMPUTE_FLOAT4)(tmp0.y,tmp1.y,tmp2.y,tmp3.y);\n" +" B[2]=(COMPUTE_FLOAT4)(tmp0.z,tmp1.z,tmp2.z,tmp3.z);\n" +" B[3]=(COMPUTE_FLOAT4)(tmp0.w,tmp1.w,tmp2.w,tmp3.w);\n" " }\n" -" \n" -" const int out_offset=(4*height_blocks_idx)*width_blocks+width_blocks_idx;\n" -" vstore4(CONVERT_FLOAT4(result0),out_offset,output_c);\n" -" if(4*height_blocks_idx+1 >= height) return;\n" -" vstore4(CONVERT_FLOAT4(result1),out_offset+width_blocks,output_c);\n" -" if(4*height_blocks_idx+2 >= height) return;\n" -" vstore4(CONVERT_FLOAT4(result2),out_offset+width_blocks*2,output_c);\n" -" if(4*height_blocks_idx+3 >= height) return;\n" -" vstore4(CONVERT_FLOAT4(result3),out_offset+width_blocks*3,output_c);\n" -"}\n" -"__kernel void matmul_transA_transB_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* input_a,\n" -" __global const FLOAT* input_b,\n" -" #ifdef BIAS\n" -" __global const FLOAT* input_c,\n" -" #endif\n" -" __global FLOAT* output_c,\n" -" __private const int channels,\n" -" __private const int channel_blocks,\n" -" __private const int height,\n" -" __private const int height_blocks,\n" -" __private const int width_blocks,\n" -" __private const int width) {\n" -" const int width_blocks_idx=get_global_id(0);\n" -" const int height_blocks_idx=get_global_id(1);\n" -" DEAL_NON_UNIFORM_DIM2(width_blocks_idx,height_blocks_idx);\n" -" COMPUTE_FLOAT4 v_zero=(COMPUTE_FLOAT4)((COMPUTE_FLOAT)0.0);\n" -" #ifdef BIAS\n" -" COMPUTE_FLOAT4 result0=CONVERT_COMPUTE_FLOAT4(vload4(width_blocks_idx,input_c));\n" -" COMPUTE_FLOAT4 result1=result0;\n" -" COMPUTE_FLOAT4 result2=result0;\n" -" COMPUTE_FLOAT4 result3=result0;\n" " #else\n" -" COMPUTE_FLOAT4 result0=0;\n" -" COMPUTE_FLOAT4 result1=0;\n" -" COMPUTE_FLOAT4 result2=0;\n" -" COMPUTE_FLOAT4 result3=0;\n" -" #endif\n" -" \n" -" const int 
remaina=channel_blocks*4-channels;\n" -" const int remainb=(width_blocks_idx+1)*4-width;\n" -" for (short pos=0; pos= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks,input_b)));\n" -" COMPUTE_FLOAT4 b2=((remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*2,input_b)));\n" -" COMPUTE_FLOAT4 b3=((remainb >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*3,input_b)));\n" -" COMPUTE_FLOAT4 a0_trans=(COMPUTE_FLOAT4)(a0.x,a1.x,a2.x,a3.x);\n" -" COMPUTE_FLOAT4 a1_trans=(COMPUTE_FLOAT4)(a0.y,a1.y,a2.y,a3.y);\n" -" COMPUTE_FLOAT4 a2_trans=(COMPUTE_FLOAT4)(a0.z,a1.z,a2.z,a3.z);\n" -" COMPUTE_FLOAT4 a3_trans=(COMPUTE_FLOAT4)(a0.w,a1.w,a2.w,a3.w);\n" -" //matmul\n" -" result0.x += dot(a0_trans,b0);\n" -" result0.y += dot(a0_trans,b1);\n" -" result0.z += dot(a0_trans,b2);\n" -" result0.w += dot(a0_trans,b3);\n" -" \n" -" result1.x += dot(a1_trans,b0);\n" -" result1.y += dot(a1_trans,b1);\n" -" result1.z += dot(a1_trans,b2);\n" -" result1.w += dot(a1_trans,b3);\n" -" \n" -" result2.x += dot(a2_trans,b0);\n" -" result2.y += dot(a2_trans,b1);\n" -" result2.z += dot(a2_trans,b2);\n" -" result2.w += dot(a2_trans,b3);\n" +" B[0]=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+kindex*N));\n" +" B[1]=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+(kindex+1)*N));\n" +" B[2]=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+(kindex+2)*N));\n" +" B[3]=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+(kindex+3)*N));\n" +" #endif\n" " \n" -" result3.x += dot(a3_trans,b0);\n" -" result3.y += dot(a3_trans,b1);\n" -" result3.z += dot(a3_trans,b2);\n" -" result3.w += dot(a3_trans,b3);\n" +" #pragma unroll\n" +" for (int vec_m=0; vec_m<4; ++vec_m){\n" +" out[vec_m]=mad((COMPUTE_FLOAT4)A[vec_m].x,B[0],out[vec_m]);\n" +" out[vec_m]=mad((COMPUTE_FLOAT4)A[vec_m].y,B[1],out[vec_m]);\n" +" out[vec_m]=mad((COMPUTE_FLOAT4)A[vec_m].z,B[2],out[vec_m]);\n" +" out[vec_m]=mad((COMPUTE_FLOAT4)A[vec_m].w,B[3],out[vec_m]);\n" " }\n" +" }\n" +" #ifdef K_LEAVE\n" +" for (int k=loop_end << 2; k= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset+height_blocks,input_a)));\n" -" COMPUTE_FLOAT4 a2=((remaina >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset+height_blocks*2,input_a)));\n" -" COMPUTE_FLOAT4 a3=((remaina >= 1) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpa_offset+height_blocks*3,input_a)));\n" -" const int inpb_offset=(4*width_blocks_idx)*channel_blocks+channel_blocks-1;\n" -" COMPUTE_FLOAT4 b0=CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset,input_b));\n" -" COMPUTE_FLOAT4 b1=((remainb >= 3) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks,input_b)));\n" -" COMPUTE_FLOAT4 b2=((remainb >= 2) ? v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*2,input_b)));\n" -" COMPUTE_FLOAT4 b3=((remainb >= 1) ? 
v_zero : CONVERT_COMPUTE_FLOAT4(vload4(inpb_offset+channel_blocks*3,input_b)));\n" -" COMPUTE_FLOAT4 a0_trans=(COMPUTE_FLOAT4)(a0.x,a1.x,a2.x,a3.x);\n" -" COMPUTE_FLOAT4 a1_trans=(COMPUTE_FLOAT4)(a0.y,a1.y,a2.y,a3.y);\n" -" COMPUTE_FLOAT4 a2_trans=(COMPUTE_FLOAT4)(a0.z,a1.z,a2.z,a3.z);\n" -" COMPUTE_FLOAT4 a3_trans=(COMPUTE_FLOAT4)(a0.w,a1.w,a2.w,a3.w);\n" -" //matmul\n" -" result0.x += dot(a0_trans,b0);\n" -" result0.y += dot(a0_trans,b1);\n" -" result0.z += dot(a0_trans,b2);\n" -" result0.w += dot(a0_trans,b3);\n" +" #ifdef TRANSPOSE_B\n" +" B.x=(COMPUTE_FLOAT)input_b_offset[k];\n" +" B.y=(COMPUTE_FLOAT)input_b_offset[k+K];\n" +" B.z=(COMPUTE_FLOAT)input_b_offset[k+2*K];\n" +" B.w=(COMPUTE_FLOAT)input_b_offset[k+3*K];\n" +" #else\n" +" B=CONVERT_COMPUTE_FLOAT4(vload4(0,input_b_offset+k*N));\n" +" #endif\n" +" out[0]=mad((COMPUTE_FLOAT4)A.x,B,out[0]);\n" +" out[1]=mad((COMPUTE_FLOAT4)A.y,B,out[1]);\n" +" out[2]=mad((COMPUTE_FLOAT4)A.z,B,out[2]);\n" +" out[3]=mad((COMPUTE_FLOAT4)A.w,B,out[3]);\n" +" }\n" +" #endif\n" " \n" -" result1.x += dot(a1_trans,b0);\n" -" result1.y += dot(a1_trans,b1);\n" -" result1.z += dot(a1_trans,b2);\n" -" result1.w += dot(a1_trans,b3);\n" " \n" -" result2.x += dot(a2_trans,b0);\n" -" result2.y += dot(a2_trans,b1);\n" -" result2.z += dot(a2_trans,b2);\n" -" result2.w += dot(a2_trans,b3);\n" +" const int out_offset=idm*N+idn;\n" +" #ifdef M_LEAVE\n" +" if(idm+3 >= M){\n" +" #ifdef N_LEAVE\n" +" if(idn+3 >= N){\n" +" for (int vec_m=0; vec_m= height) return;\n" -" vstore4(CONVERT_FLOAT4(result1),out_offset+width_blocks,output_c);\n" -" if(4*height_blocks_idx+2 >= height) return;\n" -" vstore4(CONVERT_FLOAT4(result2),out_offset+width_blocks*2,output_c);\n" -" if(4*height_blocks_idx+3 >= height) return;\n" -" vstore4(CONVERT_FLOAT4(result3),out_offset+width_blocks*3,output_c);\n" +" #endif\n" +" } else{\n" +" #endif\n" +" #ifdef N_LEAVE\n" +" if(idn+3 >= N){\n" +" #pragma unroll\n" +" for (int vec_m=0; vec_m<4; ++vec_m){\n" +" COMPUTE_FLOAT *out_ptr=(COMPUTE_FLOAT*)&out[vec_m];\n" +" for(int vec_n=0; vec_n= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" +"#ifdef CONV_LOCAL_SIZE\n" +"__kernel\n" +"void conv_2d_1x1_local(__private const int out_w_blocks,\n" +" __global const FLOAT *input,\n" +" __global const FLOAT *kernel_ptr,\n" +" __global const FLOAT *bias_ptr,\n" +" __global FLOAT *output,\n" +" __private const int in_c_block,\n" +" __private const int batch,\n" +" __private const int out_h,\n" +" __private const int out_w,\n" +" __private const int out_c_block,\n" +" __private const int out_c_pack) {\n" +" const int lid=get_local_id(0);\n" +" const int out_c_w_idx=get_global_id(1); //c/4 w\n" +" const int out_b_h_idx=get_global_id(2); //b h\n" +" \n" +" COMPUTE_FLOAT4 local sum[CONV_LOCAL_SIZE];\n" +" \n" +" const int out_c_idx=out_c_w_idx/out_w_blocks;\n" +" const int out_w_idx=out_c_w_idx % out_w_blocks;\n" +" const int out_b_idx=out_b_h_idx/out_h; // equal to in_b_idx\n" +" const int out_h_idx=out_b_h_idx % out_h; // equal to in_h_idx\n" +" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n" +" COMPUTE_FLOAT4 out0=(COMPUTE_FLOAT4)0;\n" +" int offset=out_c_idx*4;\n" +" int inp_offset=(((out_b_idx+in_c_block*batch)*out_h+out_h_idx)* out_w+out_w_idx) << 2;\n" +" \n" +" const int inp_add=batch*out_h*out_w*4;\n" +" for (ushort in_channel_block_idx=lid; in_channel_block_idx0; i /= 2){\n" +" if (lid= 4) {\n" @@ -14823,6 +13144,7 @@ const char* conv_2d_buf = " __private const int in_c_block,\n" " __private const int 
out_h,\n" " __private const int out_w,\n" +" __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/8 w/4\n" @@ -14843,10 +13165,10 @@ const char* conv_2d_buf = " COMPUTE_FLOAT4 out6=out4;\n" " COMPUTE_FLOAT4 out7=out4;\n" " const int intput_width_idx0=out_w4_idx;\n" +" int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n" +" int offset=out_c_idx*8;\n" +" const int inp_add=out_b*out_h*out_w*4;\n" " for (int in_channel_block_idx=0; in_channel_block_idx= 4) {\n" @@ -14972,6 +13297,7 @@ const char* conv_2d_buf = " __private const int in_c_block,\n" " __private const int out_h,\n" " __private const int out_w,\n" +" __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/8 w/4\n" @@ -14989,10 +13315,10 @@ const char* conv_2d_buf = " COMPUTE_FLOAT4 out4=CONVERT_COMPUTE_FLOAT4(vload4((out_c_idx<<1)+1,bias_ptr));\n" " COMPUTE_FLOAT4 out5=out4;\n" " const int intput_width_idx0=out_w2_idx;\n" +" int inp_offset=((out_b_idx*out_h+out_h_idx)* out_w+intput_width_idx0)<<2;\n" +" int offset=out_c_idx*8;\n" +" const int inp_add=out_b*out_h*out_w*4;\n" " for (int in_channel_block_idx=0; in_channel_block_idx= 2) {\n" @@ -15075,6 +13404,7 @@ const char* conv_2d_buf = " __private const int in_c_block,\n" " __private const int out_h,\n" " __private const int out_w,\n" +" __private const int out_b,\n" " __private const int out_c_block,\n" " __private const int out_c_pack) {\n" " const int out_c_w_idx=get_global_id(0); //c/4 w\n" @@ -15086,12 +13416,12 @@ const char* conv_2d_buf = " const int out_h_idx=out_b_h_idx % out_h;//equal to in_h_idx\n" " COMPUTE_FLOAT4 out0=CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx,bias_ptr));\n" " const int intput_width_idx0=out_w_idx;\n" +" int offset=out_c_idx*4;\n" +" int inp_offset=((out_b_idx*out_h+out_h_idx)*out_w+intput_width_idx0)*4;\n" +" const int inp_add=out_b*out_h*out_w*4;\n" " \n" " for (int in_channel_block_idx=0; in_channel_block_idx= 2) {\n" @@ -15185,6 +13522,7 @@ const char* conv_2d_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -15221,7 +13559,7 @@ const char* conv_2d_buf = " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+kw_start)*4;\n" " for(int iy=in_h_idx_start; iy= out_hw.y) return;\n" @@ -15340,6 +13679,7 @@ const char* conv_2d_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -15375,7 +13715,7 @@ const char* conv_2d_buf = " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n" " for(int iy=in_h_idx_start; iy= 4) {\n" @@ -15451,6 +13791,7 @@ const char* conv_2d_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -15486,7 +13827,7 @@ const char* conv_2d_buf = " for(ushort 
in_c_idx=0; in_c_idx= 4){\n" @@ -15573,6 +13914,7 @@ const char* conv_2d_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -15613,7 +13955,7 @@ const char* conv_2d_buf = " for(ushort in_c_idx=0; in_c_idx= 4){\n" @@ -15715,12 +14057,12 @@ const char* conv_2d_buf = " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" -"#ifdef CHANNEL_LEAVE\n" +" #ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks){\n" " return;\n" " }\n" -"#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" #endif\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 4){\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" @@ -15741,12 +14083,12 @@ const char* conv_2d_buf = " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out2),2*out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),3*out_hw.y,output+out_offset);\n" -"#ifdef CHANNEL_LEAVE\n" +" #ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks){\n" " return;\n" " }\n" -"#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" #endif\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out4),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out5),out_hw.y,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out6),2*out_hw.y,output+out_offset);\n" @@ -15762,6 +14104,7 @@ const char* conv_2d_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " __private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -15797,7 +14140,7 @@ const char* conv_2d_buf = " for(ushort in_c_idx=0; in_c_idx= 2){\n" @@ -15860,12 +14203,12 @@ const char* conv_2d_buf = " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" -"#ifdef CHANNEL_LEAVE\n" +" #ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks){\n" " return;\n" " }\n" -"#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" #endif\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 2){\n" " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n" @@ -15875,12 +14218,12 @@ const char* conv_2d_buf = "#else\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out1),out_hw.y,output+out_offset);\n" -"#ifdef CHANNEL_LEAVE\n" +" #ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks){\n" " return;\n" " }\n" -"#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" #endif\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore4(CONVERT_FLOAT4(out2),0,output+out_offset);\n" " vstore4(CONVERT_FLOAT4(out3),out_hw.y,output+out_offset);\n" "#endif\n" @@ -15894,6 +14237,7 @@ const char* conv_2d_buf = " __private const int2 in_hw,\n" " __private const int inChannel,\n" " 
__private const int in_c_blocks,\n" +" __private const int batch,\n" " __private const int2 out_hw,\n" " __private const int2 filter_hw,\n" " __private const int2 stride_hw,\n" @@ -15936,7 +14280,7 @@ const char* conv_2d_buf = " //index: [0,4*in_c_idx,out_c_idx*kh*kw+kh_start*kw+kw_start,0]\n" " int weight_offset=((((4*in_c_idx+0)* out_c_blocks+out_c_idx) *filter_hw.x+kh_start)*filter_hw.y+0)*4;\n" " for(int iy=in_h_idx_start; iy= 4){\n" @@ -16032,10 +14376,10 @@ const char* conv_2d_buf = " }else if(remain == 1){\n" " vstore4(CONVERT_FLOAT4(out0),0,output+out_offset);\n" " }\n" -"#ifdef CHANNEL_LEAVE\n" +" #ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks)return;\n" -"#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" #endif\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " if(remain >= 4){\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n" " }else if(remain == 3){\n" @@ -16048,10 +14392,10 @@ const char* conv_2d_buf = " }\n" "#else\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out0,out1,out2,out3)),0,output+out_offset);\n" -"#ifdef CHANNEL_LEAVE\n" +" #ifdef CHANNEL_LEAVE\n" " if(out_c_idx+1 >= out_c_blocks)return;\n" -"#endif\n" -" out_offset=(((out_b_idx*out_c_blocks+out_c_idx+1)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" +" #endif\n" +" out_offset=(((out_b_idx+(out_c_idx+1)*batch)*out_hw.x+out_h_idx)*out_hw.y+out_w_idx)*4;\n" " vstore16(CONVERT_FLOAT16((COMPUTE_FLOAT16)(out4,out5,out6,out7)),0,output+out_offset);\n" "#endif\n" "}\n" @@ -16573,312 +14917,94 @@ const char* winogradTransformDest2_3_1 = " res=max(res,(FLOAT4)(0));\n" "#endif\n" "#ifdef RELU6\n" -" res=clamp(res,(FLOAT4)(0),(FLOAT4)(6));\n" -"#endif\n" -" WI_F(uOutput,(int2)(imageOx,imageOy),res);\n" -" }\n" -" }\n" -" {\n" -" int ox=oxStart+0;\n" -" int oy=oyStart+1;\n" -" if (ox0; i /= 2){\n" -" if (lid0; i /= 2){\n" -" if (lid0; i /= 2){\n" -" if (lid0; i /= 2){\n" -" if (lid0; i /= 2){\n" -" if (lid0; i /= 2){\n" -" if (lid1\n" +" float local sum[LOCAL_SIZE];\n" +" if (pos.x> 2;\n" +" #ifdef PACK_LEAVE\n" +" const int loop=inside_v4-1;\n" " const int inside_remain=inside-((inside_v4-1) << 2);\n" -" COMPUTE_FLOAT4 in_sum=0;\n" +" #else\n" +" const int loop=inside_v4;\n" +" #endif\n" +" \n" +" float4 in_sum=0;\n" " int index=lid;\n" -" for(; index1) {\n" -" sum[lid]=sum[lid]+in_left.y;\n" -" }\n" -" if(inside_remain>2) {\n" -" sum[lid]=sum[lid]+in_left.z;\n" -" }\n" -" if(inside_remain>3) {\n" -" sum[lid]=sum[lid]+in_left.w;\n" +" for(int i=0; i0; i /= 2){\n" @@ -16887,47 +15013,86 @@ const char* layernorm_buf = " barrier(CLK_LOCAL_MEM_FENCE);\n" " }\n" " \n" -" COMPUTE_FLOAT4 mean=sum[0]/(COMPUTE_FLOAT4)inside;\n" +" float4 mean=sum[0]/(float4)inside;\n" +" #endif\n" " in_sum=0;\n" " index=lid;\n" -" for(; index1) {\n" -" sum[lid]=sum[lid]+in_sum.y;\n" -" }\n" -" if(inside_remain>2) {\n" -" sum[lid]=sum[lid]+in_sum.z;\n" -" }\n" -" if(inside_remain>3) {\n" -" sum[lid]=sum[lid]+in_sum.w;\n" +" for(int i=0; i0; i /= 2){\n" " if (lid= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" -"__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS\n" +"__kernel void softmax_in1_buf(GLOBAL_SIZE_3_DIMS\n" " __global const FLOAT *input,\n" " __global FLOAT *output,\n" -" __private const int remain_channels,\n" -" __private const int4 shape) {//NCHW\n" +" __private const int inside,\n" +" __private const int outside,\n" +" 
__private const int dim) {\n" " const int x=get_global_id(0);\n" -" const int w=get_global_id(1);\n" -" const int bh=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(x,w,bh);\n" +" const int y=get_global_id(1); // inside=1\n" +" const int z=get_global_id(2); // outside\n" +" DEAL_NON_UNIFORM_DIM3(x,y,z);\n" " \n" -" const int batch_idx=bh/shape.z;\n" -" const int height_idx=bh % shape.z;\n" -" const int offset=(((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+w)*4;\n" +" const int offset=z*dim+y;\n" +" const int dim4=(dim+3)/4;\n" +" const int loop_end=max(0,dim4-1);\n" "#if SOFTMAX_LOCAL_SIZE >= 4\n" " int lid=get_local_id(0);\n" -" COMPUTE_FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];\n" +" COMPUTE_FLOAT local sum[SOFTMAX_LOCAL_SIZE];\n" +" // compute maxvalue\n" " COMPUTE_FLOAT4 maxValue=(COMPUTE_FLOAT4)-FLT_MAX;\n" -" for (int i=lid; i0; i /= 2){\n" " if (lid0; i /= 2){\n" " if (lid= 4\n" " int lid=get_local_id(0);\n" -" COMPUTE_FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];\n" -" \n" -" /*Compute Max */\n" -" COMPUTE_FLOAT4 maxValue=(COMPUTE_FLOAT4)(-FLT_MAX);\n" -" for (int i=lid; i= 4\n" " int lid=get_local_id(0);\n" " COMPUTE_FLOAT4 local sum[SOFTMAX_LOCAL_SIZE];\n" -" \n" -" /*Compute Max */\n" -" COMPUTE_FLOAT4 maxValue=(COMPUTE_FLOAT4)(-FLT_MAX);\n" -" for (int i=lid; i> 2;\n" -" #ifdef GATHER_INPUT_NHWC\n" -" int off_c=offset_value % offset_dst_shape.z; offset_value /= offset_dst_shape.z;\n" -" int off_w=offset_value % offset_dst_shape.x; offset_value /= offset_dst_shape.x;\n" -" int off_h=offset_value % offset_dst_shape.y;\n" -" int off_b=offset_value/offset_dst_shape.y;\n" -" #else\n" -" int off_w=offset_value % offset_dst_shape.x; offset_value /= offset_dst_shape.x;\n" -" int off_h=offset_value % offset_dst_shape.y; offset_value /= offset_dst_shape.y;\n" -" int off_c=offset_value % offset_dst_shape.z;\n" -" int off_b=offset_value/offset_dst_shape.z;\n" -" #endif\n" -" int real_dst_offset=(((off_b*off_c4_size+off_c/4)*offset_dst_shape.y+off_h)*offset_dst_shape.x+off_w)*4+off_c % 4;\n" -" index.x=offset_dst_ptr[real_dst_offset];\n" -" }\n" -" #endif\n" -" \n" -" #ifdef OFFSET_SRC\n" -" {\n" -" int offset_value=pos.z;\n" -" int off_c4_size=(offset_src_shape.z+3) >> 2;\n" -" #ifdef GATHER_INPUT_NHWC\n" -" int off_c=offset_value % offset_src_shape.z; offset_value /= offset_src_shape.z;\n" -" int off_w=offset_value % offset_src_shape.x; offset_value /= offset_src_shape.x;\n" -" int off_h=offset_value % offset_src_shape.y;\n" -" int off_b=offset_value/offset_src_shape.y;\n" -" #else\n" -" int off_w=offset_value % offset_src_shape.x; offset_value /= offset_src_shape.x;\n" -" int off_h=offset_value % offset_src_shape.y; offset_value /= offset_src_shape.y;\n" -" int off_c=offset_value % offset_src_shape.z;\n" -" int off_b=offset_value/offset_src_shape.z;\n" -" #endif\n" -" int real_src_offset=(((off_b*off_c4_size+off_c/4)*offset_src_shape.y+off_h)*offset_src_shape.x+off_w)*4+off_c % 4;\n" -" index.y=offset_src_ptr[real_src_offset];\n" -" }\n" -" #endif\n" -" \n" +"#ifdef OFFSET_SRC\n" +" index.y=offset_src_ptr[pos.z];\n" +"#endif\n" " int2 offset=index*steps;\n" " int src_offset=offset.y+stride_src.w+x*stride_src.x+y*stride_src.y+pos.y*stride_src.z;\n" " int dst_offset=offset.x+stride_dst.w+x*stride_dst.x+y*stride_dst.y+pos.y*stride_dst.z;\n" -" int src_offsetC4,dst_offsetC4;\n" -" {\n" -"#ifdef GATHER_INPUT_NHWC\n" -" int c=src_offset % src_c4size.z; src_offset /= src_c4size.z;\n" -" int w=src_offset % src_c4size.x; src_offset /= src_c4size.x;\n" -" int h=src_offset % src_c4size.y;\n" -" int 
b=src_offset/src_c4size.y;\n" -" int c4_size=(src_c4size.z+3)/4;\n" -" src_offsetC4=(((b*c4_size+(c/4))*src_c4size.y+h)*src_c4size.x+w)*4+(c % 4);\n" -"#else\n" -" int w=src_offset % src_c4size.x; src_offset /= src_c4size.x;\n" -" int h=src_offset % src_c4size.y; src_offset /= src_c4size.y;\n" -" int c=src_offset % src_c4size.z;\n" -" int b=src_offset/src_c4size.z;\n" -" int c4_size=(src_c4size.z+3)/4;\n" -" src_offsetC4=(((b*c4_size+(c/4))*src_c4size.y+h)*src_c4size.x+w)*4+(c % 4);\n" -"#endif\n" -" }\n" -" {\n" -"#ifdef GATHER_OUTPUT_NHWC\n" -" int c=dst_offset % dst_c4size.z; dst_offset /= dst_c4size.z;\n" -" int w=dst_offset % dst_c4size.x; dst_offset /= dst_c4size.x;\n" -" int h=dst_offset % dst_c4size.y;\n" -" int b=dst_offset/dst_c4size.y;\n" -" int c4_size=(dst_c4size.z+3)/4;\n" -" dst_offsetC4=(((b*c4_size+(c/4))*dst_c4size.y+h)*dst_c4size.x+w)*4+(c % 4);\n" -"#else\n" -" int w=dst_offset % dst_c4size.x; dst_offset /= dst_c4size.x;\n" -" int h=dst_offset % dst_c4size.y; dst_offset /= dst_c4size.y;\n" -" int c=dst_offset % dst_c4size.z;\n" -" int b=dst_offset/dst_c4size.z;\n" -" int c4_size=(dst_c4size.z+3)/4;\n" -" dst_offsetC4=(((b*c4_size+(c/4))*dst_c4size.y+h)*dst_c4size.x+w)*4+(c % 4);\n" -"#endif\n" -" }\n" " if(offset.x >= 0){\n" " if(offset.y >= 0 && offset.y1\n" @@ -17576,6 +15598,7 @@ const char* conv_2d_c16_subgroup_buf = " __private const int output_width,\n" " __private const int output_height,\n" " __private const int output_channel,\n" +" __private const int batch,\n" " __private const int x_blocks,\n" " __private const int input_pad_left,\n" " __private const int input_pad_right,\n" @@ -17603,9 +15626,9 @@ const char* conv_2d_c16_subgroup_buf = " const uint output_x_pitch=4;\n" " const uint output_y_pitch=output_x_pitch*output_width;\n" " const uint output_fs_pitch=output_y_pitch*output_height;\n" -" const uint output_b_pitch=output_fs_pitch*((output_channel+3)/4);\n" -" const uint output_offset=b*output_b_pitch +\n" -" (feature_block << 2)*output_fs_pitch +\n" +" const uint output_b_pitch=output_fs_pitch*batch;\n" +" const uint output_offset=b*output_fs_pitch +\n" +" (feature_block << 2)*output_b_pitch +\n" " y*output_y_pitch +\n" " x*output_x_pitch;\n" " const uint filter_isv_pitch=16;\n" @@ -17746,13 +15769,13 @@ const char* conv_2d_c16_subgroup_buf = " if ((feature_block+1)*16 >= output_channel) {\n" " for (int i=0; i<4 && (x+i)1\n" @@ -17772,6 +15795,7 @@ const char* conv_2d_c16_subgroup_buf = " __private const int output_width,\n" " __private const int output_height,\n" " __private const int output_channel,\n" +" __private const int batch,\n" " __private const int x_blocks,\n" " __private const int input_pad_left,\n" " __private const int input_pad_right,\n" @@ -17799,9 +15823,9 @@ const char* conv_2d_c16_subgroup_buf = " const uint output_x_pitch=4;\n" " const uint output_y_pitch=output_x_pitch*output_width;\n" " const uint output_fs_pitch=output_y_pitch*output_height;\n" -" const uint output_b_pitch=output_fs_pitch*((output_channel+3)/4);\n" -" const uint output_offset=b*output_b_pitch +\n" -" (feature_block << 2)*output_fs_pitch +\n" +" const uint output_b_pitch=output_fs_pitch*batch;\n" +" const uint output_offset=b*output_fs_pitch +\n" +" (feature_block << 2)*output_b_pitch +\n" " y*output_y_pitch +\n" " x*output_x_pitch;\n" " const uint filter_isv_pitch=16;\n" @@ -17942,13 +15966,13 @@ const char* conv_2d_c16_subgroup_buf = " if ((feature_block+1)*16 >= output_channel) {\n" " for (int i=0; i<8 && (x+i)1\n" @@ -17968,6 +15992,7 @@ const char* 
conv_2d_c16_subgroup_buf = " __private const int output_width,\n" " __private const int output_height,\n" " __private const int output_channel,\n" +" __private const int batch,\n" " __private const int x_blocks,\n" " __private const int input_pad_left,\n" " __private const int input_pad_right,\n" @@ -18176,6 +16201,7 @@ const char* conv_2d_c16_subgroup_buf = " __private const int output_width,\n" " __private const int output_height,\n" " __private const int output_channel,\n" +" __private const int batch,\n" " __private const int x_blocks,\n" " __private const int input_pad_left,\n" " __private const int input_pad_right,\n" @@ -18383,6 +16409,7 @@ const char* conv_2d_c16_subgroup_buf = " __private const int output_width,\n" " __private const int output_height,\n" " __private const int output_channel,\n" +" __private const int batch,\n" " __private const int x_blocks,\n" " __private const int input_pad_left,\n" " __private const int input_pad_right,\n" @@ -18597,6 +16624,7 @@ const char* input_transe_buf = " __private const int input_width,\n" " __private const int input_height,\n" " __private const int input_channel,\n" +" __private const int batch,\n" " __private const int channel_blocks,\n" " __private const int input_pad_left,\n" " __private const int input_pad_right)\n" @@ -18613,9 +16641,9 @@ const char* input_transe_buf = " const uint input_x_pitch=4;\n" " const uint input_y_pitch=input_x_pitch*input_width;\n" " const uint input_f_pitch=input_y_pitch*input_height;\n" -" const uint input_b_pitch=input_f_pitch*channel_blocks;\n" -" const uint input_offset=b*input_b_pitch +\n" -" c*input_f_pitch +\n" +" const uint input_b_pitch=input_f_pitch*batch;\n" +" const uint input_offset=b*input_f_pitch +\n" +" c*input_b_pitch +\n" " h*input_y_pitch +\n" " w*input_x_pitch;\n" " // Output offset calculations:\n" @@ -18643,6 +16671,7 @@ const char* input_transe_buf = " int input_width,\n" " int input_height,\n" " int input_channel,\n" +" int batch,\n" " int channel_blocks,\n" " int input_pad_left,\n" " int input_pad_right)\n" @@ -18660,10 +16689,10 @@ const char* input_transe_buf = " const uint input_x_pitch=4;\n" " const uint input_y_pitch=input_x_pitch*input_width;\n" " const uint input_f_pitch=input_y_pitch*input_height;\n" -" const uint input_b_pitch=input_f_pitch*channel_blocks;\n" +" const uint input_b_pitch=input_f_pitch*batch;\n" " \n" -" const uint input_offset=b*input_b_pitch +\n" -" c*input_f_pitch +\n" +" const uint input_offset=b*input_f_pitch +\n" +" c*input_b_pitch +\n" " h*input_y_pitch +\n" " w*input_x_pitch;\n" " \n" @@ -18687,360 +16716,103 @@ const char* input_transe_buf = " }\n" " pad_offset += (input_pad_left+input_width)*output_x_pitch;\n" " for(int i=0; i= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" -"__kernel void reduct_width_buf(GLOBAL_SIZE_3_DIMS\n" -" __global const INPUT_TYPE* input,\n" -" __global OUTPUT_TYPE* output,\n" -" __private const int inputWidth,\n" -" __private const int inputHeight,\n" -" __private const int inputChannel,\n" -" __private const int inputBatch,\n" -" __private const int inputChannelBlock,\n" -" __private const int oututWidth,\n" -" __private const int outputHeight,\n" -" __private const int outputChannel,\n" -" __private const int outputChannelBlock\n" -" ) {\n" -" const int width_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int batch_channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_channel_idx);\n" -" \n" -" const int 
batch_idx=batch_channel_idx/outputChannelBlock;\n" -" const int channel_idx=batch_channel_idx % outputChannelBlock;\n" -" const int offset=((((batch_idx*inputChannelBlock)+channel_idx)*inputHeight+height_idx)*inputWidth+0)*4;\n" -" const int outputOffset=((((batch_idx*outputChannelBlock)+channel_idx)*outputHeight+height_idx)*oututWidth+0)*4;\n" -" INPUT_TYPE4 out=(INPUT_TYPE4)VALUE;\n" -" \n" -"#if LOCAL_SIZE>0\n" -" const int lid=get_local_id(0);\n" -" INPUT_TYPE4 local sum[LOCAL_SIZE];\n" -" for(int i=lid; i0; i /= 2){\n" -" if (lid0\n" -" const int width_local_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int batch_channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(width_local_idx,height_idx,batch_channel_idx);\n" -" \n" -" const int width_idx=get_group_id(0);\n" -" const int batch_idx=batch_channel_idx/outputChannelBlock;\n" -" const int channel_idx=batch_channel_idx % outputChannelBlock;\n" -" \n" -" const int offset=((((batch_idx*inputChannelBlock)+channel_idx)*inputHeight+0)*inputWidth+width_idx)*4;\n" -" const int outputOffset=((((batch_idx*outputChannelBlock)+channel_idx)*outputHeight+0)*oututWidth+width_idx)*4;\n" -" const int lid=get_local_id(0);\n" -" INPUT_TYPE4 local sum[LOCAL_SIZE];\n" -" INPUT_TYPE4 out=(INPUT_TYPE4)VALUE;\n" -" for(int i=lid; i0; i /= 2){\n" -" if (lid0\n" -" const int width_local_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int batch_idx=get_global_id(2);\n" -" \n" -" DEAL_NON_UNIFORM_DIM3(width_local_idx,height_idx,batch_idx);\n" -" const int width_idx=get_group_id(0);\n" -" \n" -" const int offset=((((batch_idx*inputChannelBlock)+0)*inputHeight+height_idx)*inputWidth+width_idx)*4;\n" -" const int outputOffset=((((batch_idx*outputChannelBlock)+0)*outputHeight+height_idx)*oututWidth+width_idx)*4;\n" -" int remain=inputChannel-(inputChannelBlock-1)*4;\n" -" const int lid=get_local_id(0);\n" -" INPUT_TYPE local sum[LOCAL_SIZE];\n" -" INPUT_TYPE4 out=(INPUT_TYPE4)VALUE;\n" -" INPUT_TYPE4 in;\n" -" INPUT_TYPE *inPtr=(INPUT_TYPE*)∈\n" -" for(int i=lid; i0; i /= 2){\n" -" if (lid0\n" -" const int width_local_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int batch_idx=get_global_id(2);\n" +" vstore4((FLOAT4)0,0,output+pad_offset+i*output_x_pitch);\n" +" }\n" +" }\n" +"}\n" +; +#endif +#ifndef MNN_OPENCL_BUFFER_CLOSED +const char* reduction_buf = +"// TODO: use INIT_SCALAR_VALUE,OPERATOR,FINAL_OPERATOR_ON_CHANNEL macro abstract and simplify code\n" +"// TODO: support reduce dims include batch\n" +"// TODO: support keep_dim=False\n" +"// TODO: fix channel reduce result re-pack problem\n" +"#ifdef MNN_SUPPORT_FP16\n" +"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" +"#endif\n" +"#define GLOBAL_SIZE_2_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,\n" +"#define GLOBAL_SIZE_3_DIMS ""__private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n" +"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" +"__kernel void reduct_buf(GLOBAL_SIZE_3_DIMS\n" +" __global const INPUT_TYPE *input,\n" +" __global OUTPUT_TYPE *output,\n" +" __private const int inside,\n" +" __private const int outside,\n" +" __private const int dim) {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1); // inside\n" +" const int z=get_global_id(2); // outside\n" +" 
DEAL_NON_UNIFORM_DIM3(x,y,z);\n" " \n" -" DEAL_NON_UNIFORM_DIM3(width_local_idx,height_idx,batch_idx);\n" -" const int width_idx=get_group_id(0);\n" +" INPUT_TYPE out=(INPUT_TYPE)VALUE;\n" +" const int offset=z*dim*inside+y;\n" " \n" -" const int offset=((((batch_idx*inputChannelBlock)+0)*inputHeight+height_idx)*inputWidth+width_idx)*4;\n" -" const int outputOffset=((batch_idx*outputHeight+height_idx)*oututWidth+width_idx);\n" -" int remain=inputChannel-(inputChannelBlock-1)*4;\n" +"#if REDUCT_LOCAL_SIZE>4\n" " const int lid=get_local_id(0);\n" -" INPUT_TYPE local sum[LOCAL_SIZE];\n" -" INPUT_TYPE4 out=(INPUT_TYPE4)VALUE;\n" -" INPUT_TYPE4 in;\n" -" INPUT_TYPE *inPtr=(INPUT_TYPE*)∈\n" -" for(int i=lid; i0; i /= 2){\n" +" for(int i=REDUCT_LOCAL_SIZE/2; i>0; i /= 2){\n" " if (lid0\n" -" const int width_local_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(width_local_idx,height_idx,channel_idx);\n" -" const int width_idx=get_group_id(0);\n" +"__kernel void reduct_v4_buf(GLOBAL_SIZE_3_DIMS\n" +" __global const INPUT_TYPE *input,\n" +" __global OUTPUT_TYPE *output,\n" +" __private const int inside,\n" +" __private const int outside,\n" +" __private const int dim) {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1); // inside\n" +" const int z=get_global_id(2); // outside\n" +" DEAL_NON_UNIFORM_DIM3(x,y,z);\n" " \n" -" const int offset=((((0*inputChannelBlock)+channel_idx)*inputHeight+height_idx)*inputWidth+width_idx)*4;\n" -" const int outputOffset=((((0*outputChannelBlock)+channel_idx)*outputHeight+height_idx)*oututWidth+width_idx)*4;\n" -" int batchOffset=inputChannelBlock*inputHeight*inputWidth;\n" -" const int lid=get_local_id(0);\n" -" INPUT_TYPE4 local sum[LOCAL_SIZE];\n" " INPUT_TYPE4 out=(INPUT_TYPE4)VALUE;\n" -" for(int i=lid; i4\n" +" const int lid=get_local_id(0);\n" +" INPUT_TYPE4 local sum[REDUCT_LOCAL_SIZE];\n" +" for(int i=lid; i0; i /= 2){\n" +" for(int i=REDUCT_LOCAL_SIZE/2; i>0; i /= 2){\n" " if (lid with bias (eltwise_sub) [M,N]\n" "// 4 -> with bias (eltwise_sub and get negative) [M,N]\n" +"// 5 -> with bias (mask 0 for invalid) [M,N]\n" "#ifndef BIAS_TYPE\n" " #define BIAS_TYPE 0\n" "#endif\n" @@ -19233,13 +17006,38 @@ const char* matmul_params_buf = "#define DEAL_BIAS(x,a) x=x-a\n" "#elif BIAS_TYPE == 4\n" "#define DEAL_BIAS(x,a) x=a-x\n" +"#elif BIAS_TYPE == 5\n" +"#define DEAL_BIAS(x,a) x=(a == 0 ? (FLOAT)(-FLT_MAX) : x)\n" "#endif\n" "// By default the workgroup size requirement is enabled. 
For Qualcomm devices the workgroup size\n" "// requirement results in worse performance and is disabled (src/utilities/compile.cpp)\n" "#ifndef RELAX_WORKGROUP_SIZE\n" " #define RELAX_WORKGROUP_SIZE 0\n" "#endif\n" -"#define ZERO (FLOAT)0.0f\n" +"typedef float real_arg;\n" +"#define GetRealArg(x) (FLOAT)x\n" +"typedef FLOAT real;\n" +"#ifndef PRECISION_COMPUTE\n" +"#define PRECISION_COMPUTE COMPUTE_FLOAT\n" +"#define CONVERT_PRECISION_COMPUTE(x) CONVERT_COMPUTE_FLOAT(x)\n" +"#endif\n" +"#ifndef PRECISION_COMPUTE2\n" +"#define PRECISION_COMPUTE2 COMPUTE_FLOAT2\n" +"#define CONVERT_PRECISION_COMPUTE2(x) CONVERT_COMPUTE_FLOAT2(x)\n" +"#endif\n" +"#ifndef PRECISION_COMPUTE4\n" +"#define PRECISION_COMPUTE4 COMPUTE_FLOAT4\n" +"#define CONVERT_PRECISION_COMPUTE4(x) CONVERT_COMPUTE_FLOAT4(x)\n" +"#endif\n" +"#ifndef PRECISION_COMPUTE8\n" +"#define PRECISION_COMPUTE8 COMPUTE_FLOAT8\n" +"#define CONVERT_PRECISION_COMPUTE8(x) CONVERT_COMPUTE_FLOAT8(x)\n" +"#endif\n" +"#ifndef PRECISION_COMPUTE16\n" +"#define PRECISION_COMPUTE16 COMPUTE_FLOAT16\n" +"#define CONVERT_PRECISION_COMPUTE16(x) CONVERT_COMPUTE_FLOAT16(x)\n" +"#endif\n" +"#define ZERO (PRECISION_COMPUTE)0.0f\n" "// Sets a variable to zero\n" "#define SetToZero(a) a=ZERO\n" "#define IsZero(a) (a == ZERO)\n" @@ -19259,38 +17057,69 @@ const char* matmul_params_buf = "INLINE_FUNC int GetGroupID1() { return get_group_id(1); }\n" "INLINE_FUNC int GetGroupID0() { return get_group_id(0); }\n" "// =================================================================================================\n" -"// End of the C++11 raw string literal\n" -"typedef float real_arg;\n" -"#define GetRealArg(x) (FLOAT)x\n" -"typedef FLOAT real;\n" "// Data-widths in dimension M\n" "#if VWM == 1\n" " typedef FLOAT realM;\n" +" #define COMPUTE_FLOATM PRECISION_COMPUTE\n" +" #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE(x)\n" +" #define CONVERT_FLOATM(x) CONVERT_FLOAT(x)\n" "#elif VWM == 2\n" " typedef FLOAT2 realM;\n" +" #define COMPUTE_FLOATM PRECISION_COMPUTE2\n" +" #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE2(x)\n" +" #define CONVERT_FLOATM(x) CONVERT_FLOAT2(x)\n" "#elif VWM == 4\n" " typedef FLOAT4 realM;\n" +" #define COMPUTE_FLOATM PRECISION_COMPUTE4\n" +" #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE4(x)\n" +" #define CONVERT_FLOATM(x) CONVERT_FLOAT4(x)\n" "#elif VWM == 8\n" " typedef FLOAT8 realM;\n" +" #define COMPUTE_FLOATM PRECISION_COMPUTE8\n" +" #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE8(x)\n" +" #define CONVERT_FLOATM(x) CONVERT_FLOAT8(x)\n" "#elif VWM == 16\n" " typedef FLOAT16 realM;\n" +" #define COMPUTE_FLOATM PRECISION_COMPUTE16\n" +" #define CONVERT_COMPUTE_FLOATM(x) CONVERT_PRECISION_COMPUTE16(x)\n" +" #define CONVERT_FLOATM(x) CONVERT_FLOAT16(x)\n" "#endif\n" "// Data-widths in dimension N\n" "#if VWN == 1\n" " typedef FLOAT realN;\n" +" typedef int intN;\n" +" #define COMPUTE_FLOATN PRECISION_COMPUTE\n" +" #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE(x)\n" +" #define CONVERT_FLOATN(x) CONVERT_FLOAT(x)\n" "#elif VWN == 2\n" " typedef FLOAT2 realN;\n" +" typedef int2 intN;\n" +" #define COMPUTE_FLOATN PRECISION_COMPUTE2\n" +" #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE2(x)\n" +" #define CONVERT_FLOATN(x) CONVERT_FLOAT2(x)\n" "#elif VWN == 4\n" " typedef FLOAT4 realN;\n" +" typedef int4 intN;\n" +" #define COMPUTE_FLOATN PRECISION_COMPUTE4\n" +" #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE4(x)\n" +" #define CONVERT_FLOATN(x) CONVERT_FLOAT4(x)\n" 
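The new BIAS_TYPE == 5 variant above treats the extra input not as a bias to add but as an integer validity mask: wherever the mask is 0 the output is forced to -FLT_MAX (presumably so a later softmax-style step ignores those positions). A minimal per-lane sketch of what DEAL_BIAS expands to in this mode; the helper name is illustrative, not part of the kernel source:

    // Illustrative only: mirrors DEAL_BIAS(result.x, eval.x) ... for BIAS_TYPE == 5,
    // where eval is loaded as an int vector (intN) instead of a FLOAT vector.
    inline float4 apply_output_mask4(float4 value, int4 mask) {
        float4 r = value;
        r.x = (mask.x == 0) ? -FLT_MAX : r.x;
        r.y = (mask.y == 0) ? -FLT_MAX : r.y;
        r.z = (mask.z == 0) ? -FLT_MAX : r.z;
        r.w = (mask.w == 0) ? -FLT_MAX : r.w;
        return r;
    }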
"#elif VWN == 8\n" " typedef FLOAT8 realN;\n" +" typedef int8 intN;\n" +" #define COMPUTE_FLOATN PRECISION_COMPUTE8\n" +" #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE8(x)\n" +" #define CONVERT_FLOATN(x) CONVERT_FLOAT8(x)\n" "#elif VWN == 16\n" " typedef FLOAT16 realN;\n" +" typedef int16 intN;\n" +" #define COMPUTE_FLOATN PRECISION_COMPUTE16\n" +" #define CONVERT_COMPUTE_FLOATN(x) CONVERT_PRECISION_COMPUTE16(x)\n" +" #define CONVERT_FLOATN(x) CONVERT_FLOAT16(x)\n" "#endif\n" "// =================================================================================================\n" "// Initializes the accumulation registers to zero\n" -"INLINE_FUNC realM InitAccRegisters() {\n" -" realM result;\n" +"INLINE_FUNC COMPUTE_FLOATM InitAccRegisters() {\n" +" COMPUTE_FLOATM result;\n" " #if VWM == 1\n" " SetToZero(result);\n" " #elif VWM == 2\n" @@ -19330,8 +17159,8 @@ const char* matmul_params_buf = " #endif\n" " return result;\n" "}\n" -"INLINE_FUNC realN InitAccRegistersN() {\n" -" realN result;\n" +"INLINE_FUNC COMPUTE_FLOATN InitAccRegistersN() {\n" +" COMPUTE_FLOATN result;\n" " #if VWN == 1\n" " SetToZero(result);\n" " #elif VWN == 2\n" @@ -19538,10 +17367,10 @@ const char* matmul_params_buf = "}\n" "#endif\n" "// The vectorised multiply-add function\n" -"INLINE_FUNC realM MultiplyAddVector(realM cvec,const realM avec,const real bval) {\n" +"INLINE_FUNC COMPUTE_FLOATM MultiplyAddVector(COMPUTE_FLOATM cvec,COMPUTE_FLOATM avec,PRECISION_COMPUTE bval) {\n" " #if USE_VECTOR_MAD == 1\n" " #if USE_CL_MAD == 1\n" -" cvec=mad(avec,(realM)bval,cvec);\n" +" cvec=mad(avec,(COMPUTE_FLOATM)bval,cvec);\n" " #else\n" " cvec += avec*bval;\n" " #endif\n" @@ -19587,10 +17416,10 @@ const char* matmul_params_buf = " return cvec;\n" "}\n" "// The vectorised multiply-add function\n" -"INLINE_FUNC realN MultiplyAddVectorN(realN cvec,const real avec,const realN bval) {\n" +"INLINE_FUNC COMPUTE_FLOATN MultiplyAddVectorN(COMPUTE_FLOATN cvec,PRECISION_COMPUTE avec,COMPUTE_FLOATN bval) {\n" " #if USE_VECTOR_MAD == 1\n" " #if USE_CL_MAD == 1\n" -" cvec=mad((realN)avec,bval,cvec);\n" +" cvec=mad((COMPUTE_FLOATN)avec,bval,cvec);\n" " #else\n" " cvec += avec*bval;\n" " #endif\n" @@ -19660,8 +17489,8 @@ const char* matmul_params_buf = " return res;\n" "}\n" "// layout : [N,M]\n" -"INLINE_FUNC void StoreResultsM(__global realM* cgm,realM c_value,const INT2 baseOffset,const int _mi,const int _ni,\n" -" const int kSizeM,const real alpha,const real beta) {\n" +"INLINE_FUNC void StoreResultsM(__global realM* cgm,COMPUTE_FLOATM c_value,const INT2 baseOffset,const int _mi,const int _ni,\n" +" const int kSizeM,const PRECISION_COMPUTE alpha,const PRECISION_COMPUTE beta) {\n" " #if STRM == 0\n" " int idm=_mi+baseOffset.index[0];\n" " #elif STRM == 1\n" @@ -19674,10 +17503,10 @@ const char* matmul_params_buf = " #endif\n" " \n" " int index=idn*(kSizeM/VWM)+idm;\n" -" realM result=c_value;\n" +" COMPUTE_FLOATM result=c_value;\n" " // The final multiplication with alpha (in case beta == 0)\n" " #ifdef ONLY_HAVE_ALPHA\n" -" realM xval=c_value;\n" +" COMPUTE_FLOATM xval=c_value;\n" " #if VWM == 1\n" " Multiply(result,alpha,xval);\n" " #elif VWM == 2\n" @@ -19718,8 +17547,8 @@ const char* matmul_params_buf = " #endif\n" " // The final multiplication with alpha and the addition with beta*C\n" " #ifdef HAVE_ALPHA_BETA\n" -" realM xval=c_value;\n" -" realM yval=cgm[index];\n" +" COMPUTE_FLOATM xval=c_value;\n" +" COMPUTE_FLOATM yval=CONVERT_COMPUTE_FLOATM(cgm[index]);\n" " #if VWM == 1\n" " AXPBY(result,alpha,xval,beta,yval);\n" 
" #elif VWM == 2\n" @@ -19758,7 +17587,7 @@ const char* matmul_params_buf = " AXPBY(result.sF,alpha,xval.sF,beta,yval.sF);\n" " #endif\n" " #endif\n" -" cgm[index]=result;\n" +" cgm[index]=CONVERT_FLOATM(result);\n" "}\n" "INLINE_FUNC INT2 StoreIndexN() {\n" " INT2 res;\n" @@ -19780,7 +17609,7 @@ const char* matmul_params_buf = " return res;\n" "}\n" "// layout : [M,N]\n" -"INLINE_FUNC void StoreResultsN(__global realN* cgn,realN c_value,\n" +"INLINE_FUNC void StoreResultsN(__global realN* cgn,COMPUTE_FLOATN c_value,\n" " const INT2 baseOffset,\n" " #if BIAS_TYPE>0\n" " #if BIAS_TYPE>1\n" @@ -19790,7 +17619,7 @@ const char* matmul_params_buf = " #endif\n" " #endif\n" " const int _mi,const int _ni,\n" -" const int cstride/*kSizeN*/,const int dstride/*kSizeN*/,const real alpha,const real beta) {\n" +" const int cstride/*kSizeN*/,const int dstride/*kSizeN*/,const PRECISION_COMPUTE alpha,const PRECISION_COMPUTE beta) {\n" " #if STRM == 0\n" " int idm=_mi+baseOffset.index[0];\n" " #elif STRM == 1\n" @@ -19803,11 +17632,11 @@ const char* matmul_params_buf = " #endif\n" " int index=idm*(cstride/VWN)+idn;\n" " \n" -" realN result=c_value;\n" +" COMPUTE_FLOATN result=c_value;\n" " \n" " // The final multiplication with alpha (in case beta == 0)\n" " #ifdef ONLY_HAVE_ALPHA\n" -" realN xval=c_value;\n" +" COMPUTE_FLOATN xval=c_value;\n" " #if VWN == 1\n" " Multiply(result,alpha,xval);\n" " #elif VWN == 2\n" @@ -19848,8 +17677,8 @@ const char* matmul_params_buf = " #endif\n" " // The final multiplication with alpha and the addition with beta*C\n" " #ifdef HAVE_ALPHA_BETA\n" -" realN xval=c_value;\n" -" realN yval=cgn[index];\n" +" COMPUTE_FLOATN xval=c_value;\n" +" COMPUTE_FLOATN yval=CONVERT_COMPUTE_FLOATN(cgn[index]);\n" " #if VWN == 1\n" " AXPBY(result,alpha,xval,beta,yval);\n" " #elif VWN == 2\n" @@ -19892,29 +17721,31 @@ const char* matmul_params_buf = " \n" "#if BIAS_TYPE>0\n" " #if BIAS_TYPE == 1\n" -" realN eval=epm[_ni];\n" +" COMPUTE_FLOATN eval=CONVERT_COMPUTE_FLOATN(epm[_ni]);\n" +" #elif BIAS_TYPE == 5\n" +" int index_bias=idm*(dstride/VWN)+idn;\n" +" intN eval=((__global intN*)egm)[index_bias];\n" " #else\n" -" \n" " int index_bias=idm*(dstride/VWN)+idn;\n" -" realN eval=egm[index_bias];\n" +" COMPUTE_FLOATN eval=CONVERT_COMPUTE_FLOATN(egm[index_bias]);\n" " #endif\n" " \n" " #if VWN == 1\n" " DEAL_BIAS(result,eval);\n" " #ifdef RELU\n" -" result=fmax(result,(FLOAT)0);\n" +" result=fmax(result,(COMPUTE_FLOATN)0);\n" " #endif\n" " #ifdef RELU6\n" -" result=clamp(result,(FLOAT)0,(FLOAT)6);\n" +" result=clamp(result,(COMPUTE_FLOATN)0,(COMPUTE_FLOATN)6);\n" " #endif\n" " #elif VWN == 2\n" " DEAL_BIAS(result.x,eval.x);\n" " DEAL_BIAS(result.y,eval.y);\n" " #ifdef RELU\n" -" result=fmax(result,(FLOAT2)0);\n" +" result=fmax(result,(COMPUTE_FLOATN)0);\n" " #endif\n" " #ifdef RELU6\n" -" result=clamp(result,(FLOAT2)0,(FLOAT2)6);\n" +" result=clamp(result,(COMPUTE_FLOATN)0,(COMPUTE_FLOATN)6);\n" " #endif\n" " #elif VWN == 4\n" " DEAL_BIAS(result.x,eval.x);\n" @@ -19922,10 +17753,10 @@ const char* matmul_params_buf = " DEAL_BIAS(result.z,eval.z);\n" " DEAL_BIAS(result.w,eval.w);\n" " #ifdef RELU\n" -" result=fmax(result,(FLOAT4)0);\n" +" result=fmax(result,(COMPUTE_FLOATN)0);\n" " #endif\n" " #ifdef RELU6\n" -" result=clamp(result,(FLOAT4)0,(FLOAT4)6);\n" +" result=clamp(result,(COMPUTE_FLOATN)0,(COMPUTE_FLOATN)6);\n" " #endif\n" " #elif VWN == 8\n" " DEAL_BIAS(result.s0,eval.s0);\n" @@ -19937,10 +17768,10 @@ const char* matmul_params_buf = " DEAL_BIAS(result.s6,eval.s6);\n" " 
DEAL_BIAS(result.s7,eval.s7);\n" " #ifdef RELU\n" -" result=fmax(result,(FLOAT8)0);\n" +" result=fmax(result,(COMPUTE_FLOATN)0);\n" " #endif\n" " #ifdef RELU6\n" -" result=clamp(result,(FLOAT8)0,(FLOAT8)6);\n" +" result=clamp(result,(COMPUTE_FLOATN)0,(COMPUTE_FLOATN)6);\n" " #endif\n" " #elif VWN == 16\n" " DEAL_BIAS(result.s0,eval.s0);\n" @@ -19960,14 +17791,14 @@ const char* matmul_params_buf = " DEAL_BIAS(result.sE,eval.sE);\n" " DEAL_BIAS(result.sF,eval.sF);\n" " #ifdef RELU\n" -" result=fmax(result,(FLOAT16)0);\n" +" result=fmax(result,(COMPUTE_FLOATN)0);\n" " #endif\n" " #ifdef RELU6\n" -" result=clamp(result,(FLOAT16)0,(FLOAT16)6);\n" +" result=clamp(result,(COMPUTE_FLOATN)0,(COMPUTE_FLOATN)6);\n" " #endif\n" " #endif\n" "#endif\n" -" cgn[index]=result;\n" +" cgn[index]=CONVERT_FLOATN(result);\n" "}\n" "// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.\n" "INLINE_FUNC void XgemmBody(const int kSizeM,const int kSizeN,const int kSizeK,const int4 stride,\n" @@ -19975,7 +17806,7 @@ const char* matmul_params_buf = " #if BIAS_TYPE>0\n" " __global realN* restrict egm,\n" " #endif\n" -" __global realM* cgm,const real alpha,const real beta\n" +" __global realM* cgm,const real_arg alpha,const real_arg beta\n" " #if SA == 1 && SB == 1\n" " ,LOCAL_PTR realM* alm,LOCAL_PTR realN* blm\n" " #elif SA == 1\n" @@ -19986,10 +17817,10 @@ const char* matmul_params_buf = " ) {\n" " #ifdef OUTPUTMN\n" " #pragma promote_to_registers\n" -" realN cpn[MWI*(NWI/VWN)]; // MWI*NWI\n" +" COMPUTE_FLOATN cpn[MWI*(NWI/VWN)]; // MWI*NWI\n" " #else\n" " #pragma promote_to_registers\n" -" realM cpm[NWI*(MWI/VWM)]; // NWI*MWI\n" +" COMPUTE_FLOATM cpm[NWI*(MWI/VWM)]; // NWI*MWI\n" " #endif\n" " // Combined thread identifier (volatile to disable caching)\n" " #if SA == 1 || SB == 1\n" @@ -20017,9 +17848,9 @@ const char* matmul_params_buf = " #if SA == 1 || SB == 1\n" " // Allocates workitem-private memory (registers)\n" " #pragma promote_to_registers\n" -" realM apm[MWI/VWM]; // MWI*1\n" +" COMPUTE_FLOATM apm[MWI/VWM]; // MWI*1\n" " #pragma promote_to_registers\n" -" realN bpm[NWI/VWN]; // 1*NWI\n" +" COMPUTE_FLOATN bpm[NWI/VWN]; // 1*NWI\n" " \n" " for (int kwg=0; kwg local (matrix A)\n" @@ -20044,10 +17875,10 @@ const char* matmul_params_buf = " for (int _mi=0; _mi private (matrix A)\n" " #if SA == 1\n" -" apm[_mi]=LocalToPrivateA(alm,_mi,kg);\n" +" apm[_mi]=CONVERT_COMPUTE_FLOATM(LocalToPrivateA(alm,_mi,kg));\n" " // Loads data: off-chip --> private (matrix A)\n" " #elif SA == 0\n" -" apm[_mi]=GlobalToPrivateA(agm,_mi,kSizeM,idk);\n" +" apm[_mi]=CONVERT_COMPUTE_FLOATM(GlobalToPrivateA(agm,_mi,kSizeM,idk));\n" " #endif\n" " }\n" " // Loads matrix B (kernel 0) or matrix A (kernel 1)\n" @@ -20055,10 +17886,10 @@ const char* matmul_params_buf = " for (int _ni=0; _ni private (matrix B)\n" " #if SB == 1\n" -" bpm[_ni]=LocalToPrivateB(blm,_ni,kg);\n" +" bpm[_ni]=CONVERT_COMPUTE_FLOATN(LocalToPrivateB(blm,_ni,kg));\n" " // Loads data: off-chip --> private (matrix B)\n" " #else\n" -" bpm[_ni]=GlobalToPrivateB(bgm,_ni,kSizeN,idk);\n" +" bpm[_ni]=CONVERT_COMPUTE_FLOATN(GlobalToPrivateB(bgm,_ni,kSizeN,idk));\n" " #endif\n" " }\n" " // Performs the accumulation (Cpm += Apm*Bpm)\n" @@ -20067,7 +17898,7 @@ const char* matmul_params_buf = " for (int _mi=0; _mi private (matrix B)\n" -" bpm[_ni]=GlobalToPrivateOptB(bgm,baseIndexB,_ni,stride.s1/*kSizeN*/,idk);\n" +" bpm[_ni]=CONVERT_COMPUTE_FLOATN(GlobalToPrivateOptB(bgm,baseIndexB,_ni,stride.s1/*kSizeN*/,idk));\n" " }\n" " #pragma unroll\n" " for 
(int _mi=0; _mi private (matrix B)\n" -" apm[_mi]=GlobalToPrivateOptA(agm,baseIndexA,_mi,stride.s0/*kSizeM*/,idk);\n" +" apm[_mi]=CONVERT_COMPUTE_FLOATM(GlobalToPrivateOptA(agm,baseIndexA,_mi,stride.s0/*kSizeM*/,idk));\n" " }\n" " #pragma unroll\n" " for (int _ni=0; _ni0\n" " egm,\n" " #endif\n" -" cgm,alpha,beta,alm);\n" +" cgm,arg_alpha,arg_beta,alm);\n" " #elif SB == 1\n" " XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n" " #if BIAS_TYPE>0\n" " egm,\n" " #endif\n" -" cgm,alpha,beta,blm);\n" +" cgm,arg_alpha,arg_beta,blm);\n" " #else\n" " XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm,bgm,\n" " #if BIAS_TYPE>0\n" " egm,\n" " #endif\n" -" cgm,alpha,beta);\n" +" cgm,arg_alpha,arg_beta);\n" " #endif\n" "}\n" "#if RELAX_WORKGROUP_SIZE == 1\n" @@ -20408,29 +18237,32 @@ const char* matmul_params_buf = " const real_arg arg_alpha,\n" " const real_arg arg_beta,\n" " const __global realM* restrict agm,\n" -" const int batch_offset_a,\n" " const __global realN* restrict bgm,\n" -" const int batch_offset_b,\n" " #if BIAS_TYPE>0\n" " __global realN* restrict egm,\n" -" const int batch_offset_e,\n" " #endif\n" " __global realM* cgm,\n" -" const int batch_offset_c) {\n" +" const int4 batch_offset,// [batch_offset_a,batch_offset_b,batch_offset_c,batch_offset_e]\n" +" const int4 stride,// [stride_a,stride_b,stride_c,stride_e]\n" +" /*\n" +" total_batch -> [loop_y,loop_x]\n" +" with group batch -> [loop_y,loop_x/group_num]\n" +" group_size == loop_x/group_num\n" +" */\n" +" const int4 group // [group_num_a,group_num_b,group_num_e,loop_x]\n" +") {\n" " const int batch=get_group_id(2);\n" -" const real alpha=GetRealArg(arg_alpha);\n" -" const real beta=GetRealArg(arg_beta);\n" " \n" " // Sets the offsets\n" -" const int a_offset=batch*batch_offset_a;\n" -" const int b_offset=batch*batch_offset_b;\n" -" const int c_offset=batch*batch_offset_c;\n" +" const int a_offset=((batch/group.w)*group.x+(batch % group.w)/group.x)*batch_offset.x;\n" +" const int b_offset=((batch/group.w)*group.y+(batch % group.w)/group.y)*batch_offset.y;\n" +" const int c_offset=batch*batch_offset.z;\n" " const __global realM* restrict agm_=&agm[a_offset/VWM];\n" " const __global realN* restrict bgm_=&bgm[b_offset/VWN];\n" " __global realM* restrict cgm_=&cgm[c_offset/VWM];\n" " \n" " #if BIAS_TYPE>0\n" -" const int e_offset=batch*batch_offset_e;\n" +" const int e_offset=((batch/group.w)*group.z+(batch % group.w)/group.z)*batch_offset.w;\n" " __global realN* restrict egm_=&egm[e_offset/VWN];\n" " #endif\n" " \n" @@ -20441,40 +18273,31 @@ const char* matmul_params_buf = " #if SB == 1\n" " __local realN blm[KWG*NWG/VWN];\n" " #endif\n" -" int4 stride;\n" -" stride.s0=kSizeM;\n" -" stride.s1=kSizeN;\n" -" #ifdef OUTPUTMN\n" -" stride.s2=kSizeN;\n" -" #else\n" -" stride.s2=kSizeM;\n" -" #endif\n" -" stride.s3=kSizeN;\n" " // Computes the matrix-multiplication and stores the result in global memory\n" " #if SA == 1 && SB == 1\n" " XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" " #if BIAS_TYPE>0\n" " egm_,\n" " #endif\n" -" cgm_,alpha,beta,alm,blm);\n" +" cgm_,arg_alpha,arg_beta,alm,blm);\n" " #elif SA == 1\n" " XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" " #if BIAS_TYPE>0\n" " egm_,\n" " #endif\n" -" cgm_,alpha,beta,alm);\n" +" cgm_,arg_alpha,arg_beta,alm);\n" " #elif SB == 1\n" " XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" " #if BIAS_TYPE>0\n" " egm_,\n" " #endif\n" -" cgm_,alpha,beta,blm);\n" +" cgm_,arg_alpha,arg_beta,blm);\n" " #else\n" " XgemmBody(kSizeM,kSizeN,kSizeK,stride,agm_,bgm_,\n" " #if BIAS_TYPE>0\n" " egm_,\n" 
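XgemmBatched now receives its per-batch strides packed into int4 arguments instead of one scalar per matrix, plus a group descriptor that, per the kernel's own comments, lets a group of batches map to the same A, B, or extra (bias/mask) slice; only C keeps the plain batch*batch_offset.z offset. A sketch of how the packed arguments line up, using the field meanings from those comments; the struct and helper are illustrative, not MNN API:

    // Field order taken from the kernel comments (OpenCL C types for brevity).
    typedef struct {
        int4 batch_offset; // [batch_offset_a, batch_offset_b, batch_offset_c, batch_offset_e]
        int4 stride;       // [stride_a, stride_b, stride_c, stride_e]
        int4 group;        // [group_num_a, group_num_b, group_num_e, loop_x]
    } XgemmBatchedArgs;

    // Offset selection copied from the kernel body for A/B/E; C simply
    // advances by batch_offset.z every batch.
    inline int grouped_offset(int batch, int group_num, int loop_x, int batch_offset) {
        return ((batch / loop_x) * group_num + (batch % loop_x) / group_num) * batch_offset;
    }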
" #endif\n" -" cgm_,alpha,beta);\n" +" cgm_,arg_alpha,arg_beta);\n" " #endif\n" "}\n" ; @@ -20495,228 +18318,83 @@ const char* cast = " ) {\n" " const int width_idx=get_global_id(0);\n" " const int height_idx=get_global_id(1);\n" -" const int batch_channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_channel_idx);\n" -" \n" -" const int batch_idx=batch_channel_idx/channelBlock;\n" -" const int channel_idx=batch_channel_idx % channelBlock;\n" -" \n" -"#ifdef TO_BOOL\n" -" int4 value=convert_int4(RI_DATA(input,SAMPLER,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx)));\n" -" value=value == (int4)0 ? (int4)0 : (int4)1;\n" -" WI_DATA(output,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx),CONVERT_OUTPUT_I4(value));\n" -"#else\n" -" INPUT_TYPE_I4 value=RI_DATA(input,SAMPLER,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx));\n" -" WI_DATA(output,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx),CONVERT_OUTPUT_I4(value));\n" -"#endif\n" -"}\n" -; -#ifndef MNN_OPENCL_BUFFER_CLOSED -const char* buffer_convert_buf = -"#ifdef MNN_SUPPORT_FP16\n" -"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" -"#endif\n" -"#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n" -"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" -"// convert data from buffer(nhwc) to buffer(nc4hw4)\n" -"__kernel void nhwc_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS\n" -" __global const INPUT_TYPE *input_ptr,\n" -" __private const int height,\n" -" __private const int width,__private const int channels,\n" -" __global OUTPUT_TYPE *output) {\n" -" int image_width_idx=get_global_id(0);\n" -" int image_height_idx=get_global_id(1);\n" -" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n" -" const int batch_idx=image_height_idx/height;\n" -" const int height_idx=image_height_idx % height;\n" -" const int width_idx=image_width_idx % width;\n" -" const int channel_4_idx=(image_width_idx/width) << 2;\n" -" const int buffer_offset=((batch_idx*height+height_idx)*width+width_idx)*channels+channel_4_idx;\n" -" const int remain_channel=channels-channel_4_idx;\n" -" float4 values=convert_float4(vload4(0,input_ptr+buffer_offset));\n" -" if (remain_channel == 3) {\n" -" values.w=0;\n" -" } else if (remain_channel == 2) {\n" -" values.z=0;\n" -" values.w=0;\n" -" } else if (remain_channel == 1) {\n" -" values.y=0;\n" -" values.z=0;\n" -" values.w=0;\n" -" }\n" -" const int out_offset=(((batch_idx*((channels+3)/4)+channel_4_idx/4)*height+height_idx)*width+width_idx)*4;\n" -" vstore4(CONVERT_OUTPUT4(values),0,output+out_offset);\n" -"}\n" -"// convert data from buffer(nchw) to buffer(nc4hw4)\n" -"__kernel void nchw_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS\n" -" __global const INPUT_TYPE *input_ptr,\n" -" __private const int height,__private const int width,__private const int channels,\n" -" __global OUTPUT_TYPE *output) {\n" -" int image_width_idx=get_global_id(0);\n" -" int image_height_idx=get_global_id(1);\n" -" \n" -" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n" -" const int batch_idx=image_height_idx/height;\n" -" const int height_idx=image_height_idx % height;\n" -" const int width_idx=image_width_idx % width;\n" -" const int channel_4_idx=image_width_idx/width << 2;\n" -" const int buffer_offset=((batch_idx*channels+channel_4_idx)*height+height_idx)*width+width_idx;\n" -" const int 
remain_channel=channels-channel_4_idx;\n" -" const int height_width_size=height*width;\n" -" float4 output_values=0;\n" -" if (remain_channel >= 4) {\n" -" int offset=buffer_offset;\n" -" output_values.x=(float)*(input_ptr+offset);\n" -" offset += height_width_size;\n" -" output_values.y=(float)*(input_ptr+offset);\n" -" offset += height_width_size;\n" -" output_values.z=(float)*(input_ptr+offset);\n" -" offset += height_width_size;\n" -" output_values.w=(float)*(input_ptr+offset);\n" -" } else if (remain_channel == 3) {\n" -" int offset=buffer_offset;\n" -" output_values.x=(float)*(input_ptr+offset);\n" -" offset += height_width_size;\n" -" output_values.y=(float)*(input_ptr+offset);\n" -" offset += height_width_size;\n" -" output_values.z=(float)*(input_ptr+offset);\n" -" } else if (remain_channel == 2) {\n" -" int offset=buffer_offset;\n" -" output_values.x=(float)*(input_ptr+offset);\n" -" offset += height_width_size;\n" -" output_values.y=(float)*(input_ptr+offset);\n" -" } else if (remain_channel == 1) {\n" -" int offset=buffer_offset;\n" -" output_values.x=(float)*(input_ptr+offset);\n" -" }\n" -" const int out_offset=(((batch_idx*((channels+3)/4)+channel_4_idx/4)*height+height_idx)*width+width_idx)*4;\n" -" vstore4(CONVERT_OUTPUT4(output_values),0,output+out_offset);\n" -"}\n" -"__kernel void nchw_buffer_to_nchw_buffer(GLOBAL_SIZE_2_DIMS\n" -" __global INPUT_TYPE *input_ptr,\n" -" __private const int height,__private const int width,__private const int channels,\n" -" __private const int input_pad_left,__private const int input_pad_right,\n" -" __private const int output_pad_left,__private const int output_pad_right,\n" -" __global OUTPUT_TYPE *output) {\n" -" int image_width_idx=get_global_id(0);\n" -" int image_height_idx=get_global_id(1);\n" -" \n" -" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n" -" const int src_width=width+input_pad_left+input_pad_right;\n" -" const int dst_width=width+output_pad_left+output_pad_right;\n" -" const int batch_idx=image_height_idx/height;\n" -" const int height_idx=image_height_idx % height;\n" -" const int width_idx=image_width_idx % width;\n" -" const int channel_idx=image_width_idx/width;\n" -" const int in_offset=((batch_idx*channels+channel_idx)*height+height_idx)*src_width+width_idx+input_pad_left;\n" -" const int out_offset=((batch_idx*channels+channel_idx)*height+height_idx)*dst_width+width_idx+output_pad_left;\n" -" output[out_offset]=(OUTPUT_TYPE)input_ptr[in_offset];\n" -"}\n" -"// convert data from image(b h,ic/4 w ic4) to buffer(nhwc)\n" -"__kernel void nc4hw4_buffer_to_nhwc_buffer(GLOBAL_SIZE_2_DIMS\n" -" __global OUTPUT_TYPE *output,\n" -" __private const int height,__private const int width,\n" -" __private const int channels,\n" -" __global INPUT_TYPE *input_ptr) {\n" -" int image_width_idx=get_global_id(0);\n" -" int image_height_idx=get_global_id(1);\n" -" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n" -" const int batch_idx=image_height_idx/height;\n" -" const int height_idx=image_height_idx % height;\n" -" const int width_idx=image_width_idx % width;\n" -" const int channel_4_idx=(image_width_idx/width) << 2;\n" -" const int buffer_offset=((batch_idx*height+height_idx)*width+width_idx)*channels+channel_4_idx;\n" -" const int in_offset=(((batch_idx*((channels+3)/4)+channel_4_idx/4)*height+height_idx)*width+width_idx)*4;\n" -" \n" -" float4 values=convert_float4(vload4(0,input_ptr+in_offset));\n" -" const int remain_channel=channels-channel_4_idx;\n" -" if (remain_channel >= 4) {\n" -" 
vstore4(CONVERT_OUTPUT4(values),0,output+buffer_offset);\n" -" } else if (remain_channel == 3) {\n" -" int offset=buffer_offset;\n" -" output[offset]=(OUTPUT_TYPE)values.x;\n" -" offset++;\n" -" output[offset]=(OUTPUT_TYPE)values.y;\n" -" offset++;\n" -" output[offset]=(OUTPUT_TYPE)values.z;\n" -" } else if (remain_channel == 2) {\n" -" int offset=buffer_offset;\n" -" output[offset]=(OUTPUT_TYPE)values.x;\n" -" offset++;\n" -" output[offset]=(OUTPUT_TYPE)values.y;\n" -" } else if (remain_channel == 1) {\n" -" int offset=buffer_offset;\n" -" output[offset]=(OUTPUT_TYPE)values.x;\n" -" }\n" -"}\n" -"// convert data from buffer(nc4hw4) to buffer(nchw)\n" -"__kernel void nc4hw4_buffer_to_nchw_buffer(GLOBAL_SIZE_2_DIMS\n" -" __global OUTPUT_TYPE *output,\n" -" __private const int height,__private const int width,\n" -" __private const int channels,\n" -" __global INPUT_TYPE *input_ptr) {\n" -" int image_width_idx=get_global_id(0);\n" -" int image_height_idx=get_global_id(1);\n" -" \n" -" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n" -" const int batch_idx=image_height_idx/height;\n" -" const int height_idx=image_height_idx % height;\n" -" const int width_idx=image_width_idx % width;\n" -" int channel_4_idx=(image_width_idx/width)*4;\n" -" int buffer_offset=((batch_idx*channels+channel_4_idx)*height+height_idx)*width+width_idx;\n" +" const int batch_channel_idx=get_global_id(2);\n" +" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_channel_idx);\n" " \n" -" const int in_offset=(((batch_idx*((channels+3)/4)+channel_4_idx/4)*height+height_idx)*width+width_idx)*4;\n" -" float4 values=convert_float4(vload4(0,input_ptr+in_offset));\n" -" const int height_width_size=height*width;\n" -" const int remain_channel=channels-channel_4_idx;\n" -" if (remain_channel >= 4) {\n" -" int offset=buffer_offset;\n" -" output[offset]=(OUTPUT_TYPE)values.x;\n" -" offset += height_width_size;\n" -" output[offset]=(OUTPUT_TYPE)values.y;\n" -" offset += height_width_size;\n" -" output[offset]=(OUTPUT_TYPE)values.z;\n" -" offset += height_width_size;\n" -" output[offset]=(OUTPUT_TYPE)values.w;\n" -" } else if (remain_channel == 3) {\n" -" int offset=buffer_offset;\n" -" output[offset]=(OUTPUT_TYPE)values.x;\n" -" offset += height_width_size;\n" -" output[offset]=(OUTPUT_TYPE)values.y;\n" -" offset += height_width_size;\n" -" output[offset]=(OUTPUT_TYPE)values.z;\n" -" } else if (remain_channel == 2) {\n" -" int offset=buffer_offset;\n" -" output[offset]=(OUTPUT_TYPE)values.x;\n" -" offset += height_width_size;\n" -" output[offset]=(OUTPUT_TYPE)values.y;\n" -" } else if (remain_channel == 1) {\n" -" int offset=buffer_offset;\n" -" output[offset]=(OUTPUT_TYPE)values.x;\n" -" }\n" +" const int batch_idx=batch_channel_idx/channelBlock;\n" +" const int channel_idx=batch_channel_idx % channelBlock;\n" +" \n" +"#ifdef TO_BOOL\n" +" int4 value=convert_int4(RI_DATA(input,SAMPLER,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx)));\n" +" value=value == (int4)0 ? 
(int4)0 : (int4)1;\n" +" WI_DATA(output,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx),CONVERT_OUTPUT_I4(value));\n" +"#else\n" +" INPUT_TYPE_I4 value=RI_DATA(input,SAMPLER,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx));\n" +" WI_DATA(output,(int2)(channel_idx*width+width_idx,batch_idx*height+height_idx),CONVERT_OUTPUT_I4(value));\n" +"#endif\n" "}\n" -"__kernel void nc4hw4_buffer_to_nc4hw4_buffer(GLOBAL_SIZE_2_DIMS\n" +; +#ifndef MNN_OPENCL_BUFFER_CLOSED +const char* buffer_convert_buf = +"#ifdef MNN_SUPPORT_FP16\n" +"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" +"#endif\n" +"#define GLOBAL_SIZE_2_DIMS __private const int global_size_dim0,__private const int global_size_dim1,\n" +"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" +"#define GLOBAL_SIZE_3_DIMS __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n" +"#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" +"#define MNN_DATA_FORMAT_NCHW 0\n" +"#define MNN_DATA_FORMAT_NHWC 1\n" +"#define MNN_DATA_FORMAT_NC4HW4 2\n" +"#define MNN_DATA_FORMAT_C4NHW4 3\n" +"__kernel void buffer_convert_to_buffer(GLOBAL_SIZE_3_DIMS\n" " __global const INPUT_TYPE *input_ptr,\n" -" __private const int2 output_shape,\n" -" __private const int2 src_stride,\n" -" __private const int2 dst_stride,\n" -" __global OUTPUT_TYPE *output\n" +" __private const int4 shape,// N C H W\n" +" __global OUTPUT_TYPE *output_ptr\n" ") {\n" -" int image_width_idx=get_global_id(0);\n" -" int image_height_idx=get_global_id(1);\n" -" DEAL_NON_UNIFORM_DIM2(image_width_idx,image_height_idx);\n" -" const int batch_idx=image_height_idx/output_shape.x;\n" -" const int height_idx=image_height_idx % output_shape.x;\n" -" const int width_idx=image_width_idx % output_shape.y;\n" -" const int channel_block_idx=image_width_idx/output_shape.y;\n" -" int2 src_bc_offset=src_stride*(int2)(batch_idx,channel_block_idx);\n" -" int2 dst_bc_offset=dst_stride*(int2)(batch_idx,channel_block_idx);\n" -" int src_buffer_offset =\n" -" (((src_bc_offset.x+src_bc_offset.y)*output_shape.x+height_idx)*output_shape.y+width_idx)*4;\n" -" int dst_buffer_offset =\n" -" (((dst_bc_offset.x+dst_bc_offset.y)*output_shape.x+height_idx)*output_shape.y+width_idx)*4;\n" -" \n" -" vstore4(CONVERT_OUTPUT4(vload4(0,input_ptr+src_buffer_offset)),0,output+dst_buffer_offset);\n" +" int wh=get_global_id(0);\n" +" int c=get_global_id(1);\n" +" int n=get_global_id(2);\n" +" DEAL_NON_UNIFORM_DIM3(wh,c,n);\n" +" int w=wh % shape.w;\n" +" int h=wh/shape.w;\n" +" \n" +"#if INPUT_FORMAT == MNN_DATA_FORMAT_NCHW\n" +" int input_offset=((n*shape.y+c)*shape.z+h)*shape.w+w;\n" +"#elif INPUT_FORMAT == MNN_DATA_FORMAT_NHWC\n" +" int input_offset=((n*shape.z+h)*shape.w+w)*shape.y+c;\n" +"#elif INPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4\n" +" int input_offset=((((c/4)*shape.x+n)*shape.z+h)*shape.w+w)*4+(c % 4);\n" +"#endif\n" +"#if OUTPUT_FORMAT == MNN_DATA_FORMAT_NCHW\n" +" int output_offset=((n*shape.y+c)*shape.z+h)*shape.w+w;\n" +"#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NHWC\n" +" int output_offset=((n*shape.z+h)*shape.w+w)*shape.y+c;\n" +"#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4\n" +" int output_offset=((((c/4)*shape.x+n)*shape.z+h)*shape.w+w)*4+(c % 4);\n" +"#endif\n" +" output_ptr[output_offset]=input_ptr[input_offset];\n" +"}\n" +"__kernel 
void buffer_copy_to_buffer(GLOBAL_SIZE_2_DIMS\n" +" __global const INPUT_TYPE *input_ptr,\n" +" __global OUTPUT_TYPE *output_ptr,\n" +" __private const int size // N C H W\n" +") {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1);\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" +" const int offset=x << 2;\n" +"#ifdef PACK_LEAVE\n" +" if(offset+3 >= size){\n" +" for(int i=0; i [N H C 1]\n" +"// [C4 N H 1 4] -> [N H C 1]\n" "__kernel void tile_trans_3d_buf(__global INPUT_TYPE* input,\n" " __global OUTPUT_TYPE* output,\n" " __private const int widthPad,\n" @@ -21281,7 +18959,6 @@ const char* loop_buf = " // group id\n" " const int c=get_group_id(0)*WGSC;\n" " const int h=get_group_id(1)*WGSH;\n" -" const int channel_4=(channel+3) >> 2;\n" " int jc=lidc;\n" " int ih=lidh;\n" " \n" @@ -21294,7 +18971,7 @@ const char* loop_buf = " int offset_h=i*WGSH/TSH+ih;\n" " int offset_c=j*WGSC/TSC+jc ;\n" " // [TSH,WGSH/TSH] [TSC/4,WGSC/TSC,4]\n" -" localData[offset_h][offset_c]=(h+offset_h >= height || c+4*offset_c >= channel) ? (INPUT_TYPE4)0 : vload4(0,input+((b*channel_4+(c/4+offset_c))*height+(h+offset_h))*4);\n" +" localData[offset_h][offset_c]=(h+offset_h >= height || c+4*offset_c >= channel) ? (INPUT_TYPE4)0 : vload4(0,input+((b+(c/4+offset_c)*batch)*height+(h+offset_h))*4);\n" " }\n" " }\n" " \n" @@ -21316,7 +18993,7 @@ const char* loop_buf = " }\n" " }\n" "}\n" -"// [N C4 H W 4] -> [N C W H]\n" +"// [C4 N H W 4] -> [N C W H]\n" "__kernel void tile_trans_4d_buf(__global INPUT_TYPE* input,\n" " __global OUTPUT_TYPE* output,\n" " __private const int widthPad,\n" @@ -21337,7 +19014,6 @@ const char* loop_buf = " // group id\n" " const int w=get_group_id(0)*WGSW;\n" " const int h=get_group_id(1)*WGSH;\n" -" const int channel_4=(channel+3) >> 2;\n" " int jw=lidw;\n" " int ih=lidh;\n" " \n" @@ -21349,7 +19025,7 @@ const char* loop_buf = " for(int j=0; j= height || offset_w >= width) ? (INPUT_TYPE4)0 : vload4(0,input+(((b*channel_4+c4)*height+offset_h)*width+offset_w)*4);\n" +" localData[ih+i*WGSH/TSH][jw+j*WGSW/TSW]=(offset_h >= height || offset_w >= width) ? 
(INPUT_TYPE4)0 : vload4(0,input+(((b+c4*batch)*height+offset_h)*width+offset_w)*4);\n" " }\n" " }\n" " \n" @@ -21469,8 +19145,8 @@ const char* loop_buf = " const int c=c_4 << 2;\n" " const int x_src_pitch=4;\n" " const int y_src_pitch=x_src_pitch*width;\n" -" const int c_src_pitch=y_src_pitch*height;\n" -" const int b_src_pitch=c_src_pitch*((channel+3)/4);\n" +" const int b_src_pitch=y_src_pitch*height;\n" +" const int c_src_pitch=b_src_pitch*batch;\n" " \n" " bool outBound=(w >= width || h >= height || c >= channel);\n" "#ifdef MNN_NHWC\n" @@ -21621,154 +19297,32 @@ const char* loop_buf = " }\n" "}\n" "#ifdef LOOP_BINARY_OPERATOR\n" -"__kernel void broadcast_binary_buf(__private int global_dim0,__private int global_dim1,__private int global_dim2,\n" -" __global OUTPUT_TYPE* output,__global INPUT_TYPE* input0,__global INPUT_TYPE* input1,\n" -" __private const int8 src0_size,//(batch,channel,height,width)\n" -" __private const int4 src0C4_size,// nc4hw4\n" -" __private const int8 src1_size,\n" -" __private const int4 src1C4_size,\n" -" __private const int8 dst_size,\n" -" __private const int dst_width,\n" -" __private const int dst_height,\n" -" __private const int dst_channel,\n" -" __private const int channel_block) {\n" -" int3 pos=(int3)(get_global_id(0),get_global_id(1),get_global_id(2));\n" -" \n" -" if (pos.x0; i /= 2){\n" +" if (lid0; i /= 2){\n" +" if (lid0; i /= 2){\n" " if (lid0; i /= 2){\n" " if (lid= global_size_dim0 || index1 >= global_size_dim1) { "" return; "" }\n" +"#define GLOBAL_SIZE_DIM3 "" __private int global_size_dim0,__private int global_size_dim1,__private int global_size_dim2,\n" +"#define UNIFORM_BOUNDRY_CHECK3(index0, index1, index2) "" if(index0 >= global_size_dim0 || index1 >= global_size_dim1 || index2 >= global_size_dim2) { "" return; "" }\n" +"#define UCHAR16_TO_2CHAR16(a, b, c) "" a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; "" a.s8 = (c.s4 >> 4) - 8; a.s9 = (c.s4 & 15) - 8; a.sa = (c.s5 >> 4) - 8; a.sb = (c.s5 & 15) - 8; a.sc = (c.s6 >> 4) - 8; a.sd = (c.s6 & 15) - 8; a.se = (c.s7 >> 4) - 8; a.sf = (c.s7 & 15) - 8; "" b.s0 = (c.s8 >> 4) - 8; b.s1 = (c.s8 & 15) - 8; b.s2 = (c.s9 >> 4) - 8; b.s3 = (c.s9 & 15) - 8; b.s4 = (c.sa >> 4) - 8; b.s5 = (c.sa & 15) - 8; b.s6 = (c.sb >> 4) - 8; b.s7 = (c.sb & 15) - 8; "" b.s8=(c.sc >> 4)-8; b.s9=(c.sc & 15)-8; b.sa=(c.sd >> 4)-8; b.sb=(c.sd & 15)-8; b.sc=(c.se >> 4)-8; b.sd=(c.se & 15)-8; b.se=(c.sf >> 4)-8; b.sf=(c.sf & 15)-8;\n" +"#define UCHAR8_TO_CHAR16(a, c) "" a.s0 = (c.s0 >> 4) - 8; a.s1 = (c.s0 & 15) - 8; a.s2 = (c.s1 >> 4) - 8; a.s3 = (c.s1 & 15) - 8; a.s4 = (c.s2 >> 4) - 8; a.s5 = (c.s2 & 15) - 8; a.s6 = (c.s3 >> 4) - 8; a.s7 = (c.s3 & 15) - 8; "" a.s8=(c.s4 >> 4)-8; a.s9=(c.s4 & 15)-8; a.sa=(c.s5 >> 4)-8; a.sb=(c.s5 & 15)-8; a.sc=(c.s6 >> 4)-8; a.sd=(c.s6 & 15)-8; a.se=(c.s7 >> 4)-8; a.sf=(c.s7 & 15)-8;\n" +"#define DOT16X16(a, b, c) "" c += dot(a.s0123, b.s0123); "" c += dot(a.s4567, b.s4567); "" c += dot(a.s89ab, b.s89ab); "" c += dot(a.scdef,b.scdef);\n" +"#if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE)\n" +"#define CHANNEL_PACK 32\n" +"#else\n" +"#define CHANNEL_PACK 16\n" +"#endif\n" +"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" +"#define WEIGHT_STRIDE 16\n" +"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" +"#define WEIGHT_STRIDE 8\n" +"#endif\n" +"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n" 
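The UCHAR8_TO_CHAR16 / UCHAR16_TO_2CHAR16 macros above decode the int4-quantized weights: each uchar packs two 4-bit weights (high nibble first), stored with a +8 bias so the signed range is [-8, 7], and readWeight() then applies the per-block scale and offset. A one-pair sketch of that decode; the helper name is illustrative:

    // Decode one packed byte into two dequantized weights
    // (same arithmetic as UCHAR8_TO_CHAR16 followed by *scale + offset).
    inline float2 decode_int4_pair(uchar packed, float scale, float offset) {
        float hi = (float)((packed >> 4) - 8);   // first weight: high nibble
        float lo = (float)((packed & 15) - 8);   // second weight: low nibble
        return (float2)(hi, lo) * scale + offset;
    }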
+"#ifdef USE_IMAGE\n" +"inline COMPUTE_FLOAT16 readWeight(__read_only image2d_t weight,int ix,int iy,COMPUTE_FLOAT scale,COMPUTE_FLOAT offset){\n" +" return CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(ix,iy))))*scale+offset;\n" +"}\n" +"#else\n" +"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" +"inline COMPUTE_FLOAT16 readWeight(__global const char *weight,int ix,int iy,COMPUTE_FLOAT scale,COMPUTE_FLOAT offset){\n" +" return CONVERT_COMPUTE_FLOAT16(vload16(0,weight))*scale+offset;\n" +"}\n" +"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" +"inline COMPUTE_FLOAT16 readWeight(__global const uchar *weight,int ix,int iy,COMPUTE_FLOAT scale,COMPUTE_FLOAT offset){\n" +" uchar16 charWeightsInt40=vload16(0,weight);\n" +" uchar8 charWeightsInt4=vload8(0,weight);\n" +" char16 charWeights=0;\n" +" UCHAR8_TO_CHAR16(charWeights,charWeightsInt4);\n" +" return CONVERT_COMPUTE_FLOAT16(charWeights)*scale+offset;\n" +"}\n" +"#endif\n" +"#endif\n" +"__kernel void inverse_quant_weight(GLOBAL_SIZE_DIM2\n" +" #ifdef USE_IMAGE\n" +" __read_only image2d_t weight,\n" +" #else\n" +" #if (defined USE_LOW_BIT_WEIGHT_INT8)\n" +" __global const char *weight,\n" +" #elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" +" __global const uchar *weight,\n" +" #endif\n" +" #endif\n" +" __global const float *dequantScaleOffset,\n" +" __global FLOAT* output,\n" +" __private const int outputChannelAlign,\n" +" __private const int outputChannel4Align,\n" +" __private const int blockDim){\n" +" const int x=get_global_id(0); //ic\n" +" const int y=get_global_id(1); //oc\n" +" UNIFORM_BOUNDRY_CHECK(x,y);\n" +" #if defined(USE_LOW_BIT_WEIGHT_INT4) && defined(USE_IMAGE)\n" +" \n" +" const int ic=x << 5;\n" +" const int oc=y << 2;\n" +" const int output_offset=ic*outputChannelAlign+oc;\n" +" int kindex=(ic/blockDim)*outputChannel4Align*2;\n" +" COMPUTE_FLOAT8 ScaleOffset=CONVERT_COMPUTE_FLOAT8(vload8(0,dequantScaleOffset+kindex+oc*2));\n" +" COMPUTE_FLOAT16 weights00,weights01,weights10,weights11,weights20,weights21,weights30,weights31;\n" +" {\n" +" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(oc,x)));\n" +" uchar16 charWeightsInt41=as_uchar16(read_imagei(weight,SAMPLER,(int2)(oc+1,x)));\n" +" uchar16 charWeightsInt42=as_uchar16(read_imagei(weight,SAMPLER,(int2)(oc+2,x)));\n" +" uchar16 charWeightsInt43=as_uchar16(read_imagei(weight,SAMPLER,(int2)(oc+3,x)));\n" +" char16 charWeights0,charWeights1;\n" +" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt40);\n" +" weights00=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s0+ScaleOffset.s1;\n" +" weights01=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s0+ScaleOffset.s1;\n" +" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt41);\n" +" weights10=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s2+ScaleOffset.s3;\n" +" weights11=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s2+ScaleOffset.s3;\n" +" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt42);\n" +" weights20=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s4+ScaleOffset.s5;\n" +" weights21=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s4+ScaleOffset.s5;\n" +" UCHAR16_TO_2CHAR16(charWeights0,charWeights1,charWeightsInt43);\n" +" weights30=CONVERT_COMPUTE_FLOAT16(charWeights0)*ScaleOffset.s6+ScaleOffset.s7;\n" +" weights31=CONVERT_COMPUTE_FLOAT16(charWeights1)*ScaleOffset.s6+ScaleOffset.s7;\n" +" }\n" +" COMPUTE_FLOAT *weights00_ptr=(COMPUTE_FLOAT *)&weights00;\n" +" COMPUTE_FLOAT *weights10_ptr=(COMPUTE_FLOAT *)&weights10;\n" +" COMPUTE_FLOAT 
*weights20_ptr=(COMPUTE_FLOAT *)&weights20;\n" +" COMPUTE_FLOAT *weights30_ptr=(COMPUTE_FLOAT *)&weights30;\n" +" COMPUTE_FLOAT *weights01_ptr=(COMPUTE_FLOAT *)&weights01;\n" +" COMPUTE_FLOAT *weights11_ptr=(COMPUTE_FLOAT *)&weights11;\n" +" COMPUTE_FLOAT *weights21_ptr=(COMPUTE_FLOAT *)&weights21;\n" +" COMPUTE_FLOAT *weights31_ptr=(COMPUTE_FLOAT *)&weights31;\n" +" #pragma unroll\n" +" for (int i=0; i<16; ++i){\n" +" FLOAT4 out=CONVERT_FLOAT4((COMPUTE_FLOAT4)(weights00_ptr[i],weights10_ptr[i],weights20_ptr[i],weights30_ptr[i]));\n" +" vstore4(out,0,output+output_offset+i*outputChannelAlign);\n" +" }\n" +" #pragma unroll\n" +" for (int i=0; i<16; ++i){\n" +" FLOAT4 out=CONVERT_FLOAT4((COMPUTE_FLOAT4)(weights01_ptr[i],weights11_ptr[i],weights21_ptr[i],weights31_ptr[i]));\n" +" vstore4(out,0,output+output_offset+(i+16)*outputChannelAlign);\n" +" }\n" +" #else\n" +" const int ic=x << 4;\n" +" const int oc=y << 2;\n" +"#ifndef USE_IMAGE\n" +" #if (defined USE_LOW_BIT_WEIGHT_INT4)\n" +" int weight_offset=oc*8;\n" +" int weight_oc_offset=outputChannel4Align*8;\n" +" int weight_stride=8;\n" +" #else\n" +" int weight_offset=oc*16;\n" +" int weight_oc_offset=outputChannel4Align*16;\n" +" int weight_stride=16;\n" +" #endif\n" +"#endif\n" +" const int output_offset=ic*outputChannelAlign+oc;\n" +" int kindex=(ic/blockDim)*outputChannel4Align*2;\n" +" COMPUTE_FLOAT8 ScaleOffset=CONVERT_COMPUTE_FLOAT8(vload8(0,dequantScaleOffset+kindex+oc*2));\n" +" #ifdef USE_IMAGE\n" +" COMPUTE_FLOAT16 weights0=readWeight(weight,oc,x,ScaleOffset.s0,ScaleOffset.s1);\n" +" COMPUTE_FLOAT16 weights1=readWeight(weight,oc+1,x,ScaleOffset.s2,ScaleOffset.s3);\n" +" COMPUTE_FLOAT16 weights2=readWeight(weight,oc+2,x,ScaleOffset.s4,ScaleOffset.s5);\n" +" COMPUTE_FLOAT16 weights3=readWeight(weight,oc+3,x,ScaleOffset.s6,ScaleOffset.s7);\n" +" #else\n" +" COMPUTE_FLOAT16 weights0=readWeight(weight+weight_offset+x*weight_oc_offset,0,0,ScaleOffset.s0,ScaleOffset.s1);\n" +" COMPUTE_FLOAT16 weights1=readWeight(weight+weight_offset+x*weight_oc_offset+weight_stride,0,0,ScaleOffset.s2,ScaleOffset.s3);\n" +" COMPUTE_FLOAT16 weights2=readWeight(weight+weight_offset+x*weight_oc_offset+2*weight_stride,0,0,ScaleOffset.s4,ScaleOffset.s5);\n" +" COMPUTE_FLOAT16 weights3=readWeight(weight+weight_offset+x*weight_oc_offset+3*weight_stride,0,0,ScaleOffset.s6,ScaleOffset.s7);\n" +" #endif\n" +" COMPUTE_FLOAT *weights0_ptr=(COMPUTE_FLOAT*)&weights0;\n" +" COMPUTE_FLOAT *weights1_ptr=(COMPUTE_FLOAT*)&weights1;\n" +" COMPUTE_FLOAT *weights2_ptr=(COMPUTE_FLOAT*)&weights2;\n" +" COMPUTE_FLOAT *weights3_ptr=(COMPUTE_FLOAT*)&weights3;\n" +" #pragma unroll\n" +" for (int i=0; i<16; ++i){\n" +" FLOAT4 out=CONVERT_FLOAT4((COMPUTE_FLOAT4)(weights0_ptr[i],weights1_ptr[i],weights2_ptr[i],weights3_ptr[i]));\n" +" vstore4(out,0,output+output_offset+i*outputChannelAlign);\n" +" }\n" +" #endif\n" +"}\n" +"__kernel void reshape_nchw4_nhwc4(GLOBAL_SIZE_DIM2\n" +"__global const FLOAT* input,\n" +"__global FLOAT* output,\n" +"__private const int bhw,\n" +"__private const int channel,\n" +"__private const int channelAlign){\n" +" const int x=get_global_id(0); //c\n" +" const int y=get_global_id(1); //bhw\n" +" UNIFORM_BOUNDRY_CHECK(x,y);\n" +" \n" +" const int x4=x << 2;\n" +" const int y4=y << 2;\n" +" const int input_offset=(x*bhw+y4)*4;\n" +" FLOAT4 in0=vload4(0,input+input_offset);\n" +" FLOAT4 in1=(y4+1= channel){\n" +" FLOAT *in0_ptr=(FLOAT*)&in0;\n" +" FLOAT *in1_ptr=(FLOAT*)&in1;\n" +" FLOAT *in2_ptr=(FLOAT*)&in2;\n" +" FLOAT *in3_ptr=(FLOAT*)&in3;\n" +" int 
remain=x4+3-channel;\n" +" for(int i=remain; i >= 0; i--){\n" +" in0_ptr[3-i]=0;\n" +" in1_ptr[3-i]=0;\n" +" in2_ptr[3-i]=0;\n" +" in3_ptr[3-i]=0;\n" +" }\n" +" }\n" +"#endif\n" +" \n" +"#ifdef FORMAT_CNHW\n" +" int idx=x/4;\n" +" int idy=x % 4;\n" +" const int bhw4=(bhw+3)/4*4;\n" +" int output_offset=((idx*bhw4+y4)*4+idy)*4; // [c/16 b 4 4]\n" +" vstore4(in0,0,output+output_offset);\n" +" vstore4(in1,0,output+output_offset+16);\n" +" vstore4(in2,0,output+output_offset+32);\n" +" vstore4(in3,0,output+output_offset+48);\n" +"#else\n" +" FLOAT16 out=(FLOAT16)(in0.s0,in1.s0,in2.s0,in3.s0,in0.s1,in1.s1,in2.s1,in3.s1,in0.s2,in1.s2,in2.s2,in3.s2,in0.s3,in1.s3,in2.s3,in3.s3);\n" +" const int output_offset=(y*channelAlign+x4)*4;\n" +" vstore16(out,0,output+output_offset);\n" +"#endif\n" +"}\n" +"__kernel void reshape_nhwc4_nchw4(GLOBAL_SIZE_DIM2\n" +"__global const FLOAT* input,\n" +"__global FLOAT* output,\n" +"__private const int bhw,\n" +"__private const int channelAlign){\n" +" const int x=get_global_id(0); //c\n" +" const int y=get_global_id(1); //bhw\n" +" UNIFORM_BOUNDRY_CHECK(x,y);\n" +" \n" +" const int x4=x << 2;\n" +" const int y4=y << 2;\n" +" const int output_offset=(x*bhw+y4)*4;\n" +" \n" +" const int input_offset=(y*channelAlign+x4)*4;\n" +" FLOAT16 in=vload16(0,input+input_offset);\n" +" \n" +" FLOAT4 out0=(FLOAT4)(in.s0,in.s4,in.s8,in.sc);\n" +" FLOAT4 out1=(FLOAT4)(in.s1,in.s5,in.s9,in.sd);\n" +" FLOAT4 out2=(FLOAT4)(in.s2,in.s6,in.sa,in.se);\n" +" FLOAT4 out3=(FLOAT4)(in.s3,in.s7,in.sb,in.sf);\n" +" vstore4(out0,0,output+output_offset);\n" +" if(y4+1 >= bhw) return;\n" +" vstore4(out1,0,output+output_offset+4);\n" +" if(y4+2 >= bhw) return;\n" +" vstore4(out2,0,output+output_offset+8);\n" +" if(y4+3 >= bhw) return;\n" +" vstore4(out3,0,output+output_offset+12);\n" +"}\n" +"__kernel void gemm_b4_c4_buf(GLOBAL_SIZE_DIM2\n" +" __global const FLOAT* input,\n" +"#ifdef USE_IMAGE\n" +" __read_only image2d_t weight,\n" +"#else\n" +"#if (defined USE_LOW_BIT_WEIGHT_INT8)\n" +" __global const char *weight,\n" +"#elif (defined USE_LOW_BIT_WEIGHT_INT4)\n" +" __global const uchar *weight,\n" +"#endif\n" +"#endif\n" +" __global const float *dequantScaleOffset,\n" +" __global const FLOAT *bias,\n" +" __global FLOAT* output,\n" +" __private const int bhw4,\n" +" __private const int dstChannelAlign,\n" +" __private const int srcChannelAlign,\n" +" __private const int blockNum,\n" +" __private const int blockDim) {\n" +" const int x=get_global_id(0); //c\n" +" const int y=get_global_id(1); //b\n" +" UNIFORM_BOUNDRY_CHECK(x,y);\n" +" const int out_c_idx=x << 2;\n" +" const int out_b_idx=y << 2;\n" +" COMPUTE_FLOAT4 bias0=CONVERT_COMPUTE_FLOAT4(vload4(0,bias+out_c_idx));\n" +" COMPUTE_FLOAT4 out=(COMPUTE_FLOAT4)bias0.s0;\n" +" COMPUTE_FLOAT4 out1=(COMPUTE_FLOAT4)bias0.s1,out2=(COMPUTE_FLOAT4)bias0.s2,out3=(COMPUTE_FLOAT4)bias0.s3;\n" +"#ifdef FORMAT_CNHW\n" +" int input_offset=out_b_idx*16;\n" +"#else\n" +" int input_offset=out_b_idx*srcChannelAlign;\n" +"#endif\n" +" int out_offset=out_b_idx*dstChannelAlign+out_c_idx*4;\n" +" \n" +"#ifndef USE_IMAGE\n" +" int weight_offset=out_c_idx*WEIGHT_STRIDE;\n" +" int weight_oc_offset=dstChannelAlign*WEIGHT_STRIDE;\n" +"#endif\n" +" const int loop=(blockDim+CHANNEL_PACK-1)/CHANNEL_PACK;\n" +" \n" +" for (int i=0; i0; i /= 2){\n" -" if (lid0; i /= 2){\n" -" if (lid= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" -"__kernel void cast_buf(GLOBAL_SIZE_3_DIMS\n" +"#define GLOBAL_SIZE_2_DIMS ""__private const 
int global_size_dim0,__private const int global_size_dim1,\n" +"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" +"__kernel void cast_buf(GLOBAL_SIZE_2_DIMS\n" " __global INPUT_TYPE* input,\n" " __global OUTPUT_TYPE* output,\n" -" __private const int width,\n" -" __private const int height,\n" -" __private const int channelBlock\n" +" __private const int size\n" " ) {\n" -" const int width_idx=get_global_id(0);\n" -" const int height_idx=get_global_id(1);\n" -" const int batch_channel_idx=get_global_id(2);\n" -" DEAL_NON_UNIFORM_DIM3(width_idx,height_idx,batch_channel_idx);\n" -" \n" -" const int batch_idx=batch_channel_idx/channelBlock;\n" -" const int channel_idx=batch_channel_idx % channelBlock;\n" -" \n" -" const int inp_offset=((((batch_idx*channelBlock)+channel_idx)*height+height_idx)*width+width_idx)*4;\n" -"#ifdef TO_BOOL\n" +" const int idx=get_global_id(0);\n" +" const int idy=get_global_id(1);\n" +" DEAL_NON_UNIFORM_DIM2(idx,idy);\n" +" const int inp_offset=idx*4;\n" +"#ifdef PACK_LEAVE\n" +" if(inp_offset+3 >= size){\n" +" int remain=size-inp_offset;\n" +" for(int i=0; i OpenCLProgramMap = #ifndef MNN_OPENCL_BUFFER_CLOSED { "binary_buf", binary_buf }, #endif -#ifndef MNN_OPENCL_BUFFER_CLOSED - { "gemm_quant_batch_buf", gemm_quant_batch_buf }, -#endif #ifndef MNN_OPENCL_BUFFER_CLOSED { "raster_buf", raster_buf }, #endif @@ -338,6 +335,9 @@ const std::map OpenCLProgramMap = { "roi_pooling", roi_pooling }, { "depthwise_conv2d", depthwise_conv2d }, { "layernorm", layernorm }, +#ifndef MNN_OPENCL_BUFFER_CLOSED + { "gemm_conv1x1_buf", gemm_conv1x1_buf }, +#endif { "winogradTransformDest2_5_1", winogradTransformDest2_5_1 }, #ifndef MNN_OPENCL_BUFFER_CLOSED { "cast_buf", cast_buf }, diff --git a/source/backend/opencl/execution/cl/pooling_buf.cl b/source/backend/opencl/execution/cl/pooling_buf.cl index 1340973d1..300e25185 100644 --- a/source/backend/opencl/execution/cl/pooling_buf.cl +++ b/source/backend/opencl/execution/cl/pooling_buf.cl @@ -16,7 +16,7 @@ __kernel void pooling(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, __private const int2 kernel_shape, __global FLOAT *output, __global FLOAT *rediceOutput, - __private const int channel_block) { + __private const int batch) { const int ow_idx = get_global_id(0); const int b_oh_idx = get_global_id(1); @@ -31,7 +31,7 @@ __kernel void pooling(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #ifdef POOL_AVG COMPUTE_FLOAT4 result = (COMPUTE_FLOAT4)(0); - const int inp_offset = (((b_idx*channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start)*4; + const int inp_offset = (((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start)*4; #ifdef COUNT_INCLUDE_PADDING int total_count = (min(ih_start + kernel_shape.x, input_shape.x + pad_shape.x) - ih_start) * (min(iw_start + kernel_shape.y, input_shape.y + pad_shape.y) - iw_start); #else @@ -60,7 +60,7 @@ __kernel void pooling(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #if RETURN_REDICE int4 redice = (int4)0; #endif - const int inp_offset = (((b_idx*channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start)*4; + const int inp_offset = (((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start)*4; for(int kh=0; kh= input_shape.x) { @@ -80,7 +80,7 @@ __kernel void pooling(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, } #endif - const int out_offset = (((b_idx*channel_block + c_idx)*output_shape.x + oh_idx)* output_shape.y + ow_idx)*4; + const int out_offset = (((b_idx 
+ c_idx*batch)*output_shape.x + oh_idx)* output_shape.y + ow_idx)*4; vstore4(CONVERT_FLOAT4(result), 0, output+out_offset); #if RETURN_REDICE vstore4(CONVERT_FLOAT4(redice), 0, rediceOutput+out_offset); @@ -96,7 +96,7 @@ __kernel void global_pooling_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, __private const int2 kernel_shape, __global FLOAT *output, __global FLOAT *rediceOutput, - __private const int channel_block) { + __private const int batch) { const int local_id = get_local_id(0); const int output_channel_idx = get_global_id(1); const int output_batch_idx = get_global_id(2); @@ -112,7 +112,7 @@ __kernel void global_pooling_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #endif COMPUTE_FLOAT4 local sum[LOCAL_SIZE]; - const int inp_offset = ((output_batch_idx*channel_block+output_channel_idx)*input_shape.x)*input_shape.y*4; + const int inp_offset = ((output_batch_idx+output_channel_idx*batch)*input_shape.x)*input_shape.y*4; const int size = input_shape.x * input_shape.y; for(int i = local_id; i < size; i+=LOCAL_SIZE){ int w = i % input_shape.y;; @@ -152,7 +152,7 @@ __kernel void global_pooling_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, output_result /= (input_shape.x * input_shape.y); #endif - const int out_offset = (output_batch_idx*channel_block + output_channel_idx)*4; + const int out_offset = (output_batch_idx + output_channel_idx*batch)*4; vstore4(CONVERT_FLOAT4(output_result), 0, output+out_offset); #if RETURN_REDICE redice = rediceId[0]; diff --git a/source/backend/opencl/execution/cl/pooling_subgroup_buf.cl b/source/backend/opencl/execution/cl/pooling_subgroup_buf.cl index 304c3b903..6311116ff 100644 --- a/source/backend/opencl/execution/cl/pooling_subgroup_buf.cl +++ b/source/backend/opencl/execution/cl/pooling_subgroup_buf.cl @@ -15,9 +15,10 @@ __kernel void pooling_c4_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, __global FLOAT *output, __global FLOAT *rediceOutput, __private const int channel, + __private const int batch, __private const int in_channel_block, __private const int out_channel_block, - __private const int input_pad_left, + __private const int input_pad_left, __private const int input_pad_right, __private const int output_pad_left, __private const int output_pad_right) { @@ -35,7 +36,7 @@ __kernel void pooling_c4_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #ifdef POOL_AVG COMPUTE_FLOAT4 result = (COMPUTE_FLOAT4)(0); - const int inp_offset = (((b_idx*in_channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; + const int inp_offset = (((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; #ifdef COUNT_INCLUDE_PADDING int total_count = (min(ih_start + KERNEL_Y, input_shape.x + pad_shape.x) - ih_start) * (min(iw_start + KERNEL_X, input_shape.y + pad_shape.y) - iw_start); #else @@ -64,7 +65,7 @@ __kernel void pooling_c4_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #if RETURN_REDICE int4 redice = (int4)0; #endif - const int inp_offset = (((b_idx*in_channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; + const int inp_offset = (((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; for(int kh=0; kh= input_shape.x) { @@ -84,10 +85,10 @@ __kernel void pooling_c4_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, } #endif - const int out_offset = (((b_idx*in_channel_block + c_idx)*output_shape.x + oh_idx)* output_shape.y + ow_idx + output_pad_left)*4; + const int out_offset = (((b_idx + 
c_idx*batch)*output_shape.x + oh_idx)* output_shape.y + ow_idx + output_pad_left)*4; vstore4(CONVERT_FLOAT4(result), 0, output+out_offset); #if RETURN_REDICE - vstore4(CONVERT_FLOAT4(redice), 0, rediceOutput+(((b_idx*in_channel_block + c_idx)*output_shape.x + oh_idx)* output_shape.y + ow_idx)*4); + vstore4(CONVERT_FLOAT4(redice), 0, rediceOutput+(((b_idx + c_idx*batch)*output_shape.x + oh_idx)* output_shape.y + ow_idx)*4); #endif } @@ -98,6 +99,7 @@ __kernel void pooling_c4_c16(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, __global FLOAT *output, __global FLOAT *rediceOutput, __private const int channel, + __private const int batch, __private const int in_channel_block, __private const int out_channel_block, __private const int input_pad_left, @@ -119,7 +121,7 @@ __kernel void pooling_c4_c16(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #ifdef POOL_AVG COMPUTE_FLOAT4 result = (COMPUTE_FLOAT4)(0); - const int inp_offset = (((b_idx*in_channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; + const int inp_offset = (((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; #ifdef COUNT_INCLUDE_PADDING int total_count = (min(ih_start + KERNEL_Y, input_shape.x + pad_shape.x) - ih_start) * (min(iw_start + KERNEL_X, input_shape.y + pad_shape.y) - iw_start); #else @@ -148,7 +150,7 @@ __kernel void pooling_c4_c16(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #if RETURN_REDICE int4 redice = (int4)0; #endif - const int inp_offset = (((b_idx*in_channel_block+c_idx)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; + const int inp_offset = (((b_idx+c_idx*batch)*input_shape.x+ih_start)*input_shape.y+iw_start+input_pad_left)*4; for(int kh=0; kh= input_shape.x) { @@ -194,6 +196,7 @@ __kernel void pooling_c16_c16(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, __global FLOAT *output, __global FLOAT *rediceOutput, __private const int channel, + __private const int batch, __private const int in_channel_block, __private const int out_channel_block, __private const int input_pad_left, @@ -343,6 +346,7 @@ __kernel void pooling_c16_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, __global FLOAT *output, __global FLOAT *rediceOutput, __private const int channel, + __private const int batch, __private const int in_channel_block, __private const int out_channel_block, __private const int input_pad_left, @@ -429,18 +433,18 @@ __kernel void pooling_c16_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, const uint lid_x = sglid % 4; const uint lid_y = sglid / 4; - const int out_offset = (((b_idx*out_channel_block + c_idx * 4)*output_shape.x + oh_idx)* output_shape.y + ow_idx + output_pad_left)*4; - const int width_height = output_shape.y * output_shape.x * 4; + const int out_offset = (((b_idx + c_idx * 4 * batch)*output_shape.x + oh_idx)* output_shape.y + ow_idx + output_pad_left)*4; + const int batch_width_height = batch * output_shape.y * output_shape.x * 4; #if RETURN_REDICE - const int redice_offset = (((b_idx*out_channel_block + c_idx * 4)*output_shape.x + oh_idx)* output_shape.y + ow_idx)*4; + const int redice_offset = (((b_idx + c_idx * 4 * batch)*output_shape.x + oh_idx)* output_shape.y + ow_idx)*4; #endif #if OUTPUT_LEFTOVERS if ((c_idx+1)*16 >= channel) { for (int i = 0; i < 8; i++) { if ((c_idx*16 + lid_y * 4 + lid_x < channel) && (ow_idx + i) < output_shape.y) - output[out_offset + lid_y * width_height + i * 4 + lid_x] = result[i]; + output[out_offset + lid_y * batch_width_height + i * 4 + lid_x] = result[i]; #if 
RETURN_REDICE - rediceOutput[redice_offset + lid_y * width_height + i * 4 + lid_x] = redice[i]; + rediceOutput[redice_offset + lid_y * batch_width_height + i * 4 + lid_x] = redice[i]; #endif } } @@ -448,9 +452,9 @@ __kernel void pooling_c16_c4(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, #endif { for (int i = 0; i < 8 && (ow_idx + i) < output_shape.y; i++) { - output[out_offset + lid_y * width_height + i * 4 + lid_x] = result[i]; + output[out_offset + lid_y * batch_width_height + i * 4 + lid_x] = result[i]; #if RETURN_REDICE - rediceOutput[redice_offset + lid_y * width_height + i * 4 + lid_x] = redice[i]; + rediceOutput[redice_offset + lid_y * batch_width_height + i * 4 + lid_x] = redice[i]; #endif } } diff --git a/source/backend/opencl/execution/cl/range_buf.cl b/source/backend/opencl/execution/cl/range_buf.cl index 79ea69a69..fbadf98e7 100644 --- a/source/backend/opencl/execution/cl/range_buf.cl +++ b/source/backend/opencl/execution/cl/range_buf.cl @@ -2,39 +2,40 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif -#define GLOBAL_SIZE_3_DIMS \ -__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, +#define GLOBAL_SIZE_2_DIMS \ +__private const int global_size_dim0, __private const int global_size_dim1, -#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \ - if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \ +#define DEAL_NON_UNIFORM_DIM2(input1, input2) \ + if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \ return; \ } -__kernel void range_buf(GLOBAL_SIZE_3_DIMS +__kernel void range_buf(GLOBAL_SIZE_2_DIMS __global const INPUT_TYPE* input0, __global const INPUT_TYPE* input2, __global OUTPUT_TYPE* output, - __private const int width, - __private const int height, - __private const int channel, - __private const int channelBlock + __private const int size ) { - const int width_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_channel_idx = get_global_id(2); + const int x = get_global_id(0); + const int y = get_global_id(1); - DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx); + DEAL_NON_UNIFORM_DIM2(x, y); - const int batch_idx = batch_channel_idx / channelBlock; - const int channel_idx = batch_channel_idx % channelBlock; - - const int offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4; - const int channel4 = channel_idx << 2; - int index = (((batch_idx * channel) + channel4) * height + height_idx) * width + width_idx; - int size = height * width; - int4 index4 = (int4)(index, index + size, index + size * 2, index + size * 3); + int index = x << 2; + int4 index4 = (int4)(index, index + 1, index + 2, index + 3); INPUT_TYPE start = input0[0]; INPUT_TYPE step = input2[0]; OUTPUT_TYPE4 value = (OUTPUT_TYPE4)start + CONVERT_OUTPUT4(index4) * (OUTPUT_TYPE4)step; - vstore4(value, 0, output + offset); +#ifdef PACK_LEAVE + if(index + 3 >= size){ + OUTPUT_TYPE* value_ptr = (OUTPUT_TYPE*)&value; + for(int i = 0; i < size - index; ++i){ + output[index + i] = value_ptr[i]; + } + }else{ +#endif + vstore4(value, 0, output + index); +#ifdef PACK_LEAVE + } +#endif } diff --git a/source/backend/opencl/execution/cl/raster_buf.cl b/source/backend/opencl/execution/cl/raster_buf.cl index 7770f09e2..947910084 100644 --- a/source/backend/opencl/execution/cl/raster_buf.cl +++ b/source/backend/opencl/execution/cl/raster_buf.cl @@ -32,31 +32,69 @@ __kernel void buffer_set_zero( 
output[y*global_size_dim0 + x] = (OUTPUT_TYPE)(0.0f); } -__kernel void raster_buffer( +#define MNN_DATA_FORMAT_NCHW 0 +#define MNN_DATA_FORMAT_NHWC 1 +#define MNN_DATA_FORMAT_NC4HW4 2 +__kernel void raster_direct_buffer( GLOBAL_SIZE_3_DIMS + __private const int size_x, __global INPUT_TYPE *input, __private const int inputOffset, + __private const int combineSrcOffset, __private const int inputStride0, __private const int inputStride1, __private const int inputStride2, + __private const int src_width, + __private const int src_height, + __private const int src_channel, + __private const int src_batch, __global OUTPUT_TYPE *output, __private const int outputOffset, + __private const int combineDstOffset, __private const int outputStride0, __private const int outputStride1, - __private const int outputStride2 + __private const int outputStride2, + __private const int dst_width, + __private const int dst_height, + __private const int dst_channel, + __private const int dst_batch ) { - const int x = get_global_id(0); + const int idx = get_global_id(0); const int y = get_global_id(1); const int z = get_global_id(2); - DEAL_NON_UNIFORM_DIM3(x, y, z); + DEAL_NON_UNIFORM_DIM3(idx, y, z); + const int x = idx % size_x; + const int id = idx / size_x; - int inputIndex = inputOffset + z * inputStride0 + y * inputStride1 + x * inputStride2; - int outputIndex = outputOffset + z * outputStride0 + y * outputStride1 + x * outputStride2; - output[outputIndex] = (OUTPUT_TYPE)input[inputIndex]; + int inputIndex = inputOffset + id * combineSrcOffset + z * inputStride0 + y * inputStride1 + x * inputStride2; + int outputIndex = outputOffset + id * combineDstOffset + z * outputStride0 + y * outputStride1 + x * outputStride2; +#if INPUT_FORMAT == MNN_DATA_FORMAT_NCHW + int inputIndexReal = inputIndex; +#elif INPUT_FORMAT == MNN_DATA_FORMAT_NHWC + int inputIndexReal = inputIndex; +#elif INPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4 + int in_w = inputIndex % src_width; inputIndex /= src_width; + int in_h = inputIndex % src_height; inputIndex /= src_height; + int in_c = inputIndex % src_channel; + int in_b = inputIndex / src_channel; + int inputIndexReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4); +#endif + +#if OUTPUT_FORMAT == MNN_DATA_FORMAT_NCHW + int outputIndexReal = outputIndex; +#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NHWC + int outputIndexReal = outputIndex; +#elif OUTPUT_FORMAT == MNN_DATA_FORMAT_NC4HW4 + int out_w = outputIndex % dst_width; outputIndex /= dst_width; + int out_h = outputIndex % dst_height; outputIndex /= dst_height; + int out_c = outputIndex % dst_channel; + int out_b = outputIndex / dst_channel; + int outputIndexReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4); +#endif + output[outputIndexReal] = (OUTPUT_TYPE)input[inputIndexReal]; } - __kernel void raster_nc4hw4_buffer( GLOBAL_SIZE_3_DIMS __global INPUT_TYPE *input, @@ -85,72 +123,6 @@ __kernel void raster_nc4hw4_buffer( int inputIndex = inputOffset + (z * inputStride0 + y * inputStride1 + x * inputStride2) * 4; int outputIndex = outputOffset + (z * outputStride0 + y * outputStride1 + x * outputStride2) * 4; - vstore4(CONVERT_OUTPUT4(vload4(0, input+inputIndex)), 0, output+outputIndex); -} - -__kernel void raster_direct_buffer( - GLOBAL_SIZE_3_DIMS - __private const int size_x, - __global INPUT_TYPE *input, - __private const int inputOffset, - __private const int combineSrcOffset, - __private const int inputStride0, - __private const int 
inputStride1, - __private const int inputStride2, - __private const int src_width, - __private const int src_height, - __private const int src_channel, - __global OUTPUT_TYPE *output, - __private const int outputOffset, - __private const int combineDstOffset, - __private const int outputStride0, - __private const int outputStride1, - __private const int outputStride2, - __private const int dst_width, - __private const int dst_height, - __private const int dst_channel - ) { - const int idx = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(idx, y, z); - const int x = idx % size_x; - const int id = idx / size_x; - - int inputIndex = inputOffset + id * combineSrcOffset + z * inputStride0 + y * inputStride1 + x * inputStride2; - int outputIndex = outputOffset + id * combineDstOffset + z * outputStride0 + y * outputStride1 + x * outputStride2; -#ifdef INPUT_DATA_FORMAT_NHWC - int in_c = inputIndex % src_channel; inputIndex /= src_channel; - int in_w = inputIndex % src_width; inputIndex /= src_width; - int in_h = inputIndex % src_height; - int in_b = inputIndex / src_height; - int src_channel4 = (src_channel + 3) / 4; - int inputIndexC4 = (((in_b * src_channel4 + (in_c / 4)) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4); -#else - int in_w = inputIndex % src_width; inputIndex /= src_width; - int in_h = inputIndex % src_height; inputIndex /= src_height; - int in_c = inputIndex % src_channel; - int in_b = inputIndex / src_channel; - int src_channel4 = (src_channel + 3) / 4; - int inputIndexC4 = (((in_b * src_channel4 + (in_c / 4)) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4); -#endif - -#ifdef OUTPUT_DATA_FORMAT_NHWC - int out_c = outputIndex % dst_channel; outputIndex /= dst_channel; - int out_w = outputIndex % dst_width; outputIndex /= dst_width; - int out_h = outputIndex % dst_height; - int out_b = outputIndex / dst_height; - int dst_channel4 = (dst_channel + 3) / 4; - int outputIndexC4 = (((out_b * dst_channel4 + (out_c / 4)) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4); -#else - int out_w = outputIndex % dst_width; outputIndex /= dst_width; - int out_h = outputIndex % dst_height; outputIndex /= dst_height; - int out_c = outputIndex % dst_channel; - int out_b = outputIndex / dst_channel; - int dst_channel4 = (dst_channel + 3) / 4; - int outputIndexC4 = (((out_b * dst_channel4 + (out_c / 4)) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4); -#endif - - output[outputIndexC4] = (OUTPUT_TYPE)input[inputIndexC4]; + OUTPUT_TYPE4 values = CONVERT_OUTPUT4(vload4(0, (__global INPUT_TYPE *)(input+inputIndex))); + vstore4(values, 0, (__global OUTPUT_TYPE *)(output+outputIndex)); } diff --git a/source/backend/opencl/execution/cl/reduction_buf.cl b/source/backend/opencl/execution/cl/reduction_buf.cl index aa5b00960..daf033545 100644 --- a/source/backend/opencl/execution/cl/reduction_buf.cl +++ b/source/backend/opencl/execution/cl/reduction_buf.cl @@ -17,355 +17,88 @@ __private const int global_size_dim0, __private const int global_size_dim1, __pr return; \ } -__kernel void reduct_width_buf(GLOBAL_SIZE_3_DIMS - __global const INPUT_TYPE* input, - __global OUTPUT_TYPE* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int 
outputChannelBlock - ) { - const int width_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_channel_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx); - - const int batch_idx = batch_channel_idx / outputChannelBlock; - const int channel_idx = batch_channel_idx % outputChannelBlock; - const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4; - const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4; - INPUT_TYPE4 out = (INPUT_TYPE4)VALUE; +__kernel void reduct_buf(GLOBAL_SIZE_3_DIMS + __global const INPUT_TYPE *input, + __global OUTPUT_TYPE *output, + __private const int inside, + __private const int outside, + __private const int dim) { + + const int x = get_global_id(0); + const int y = get_global_id(1); // inside + const int z = get_global_id(2); // outside + DEAL_NON_UNIFORM_DIM3(x, y, z); -#if LOCAL_SIZE > 0 - const int lid = get_local_id(0); - INPUT_TYPE4 local sum[LOCAL_SIZE]; - for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){ - INPUT_TYPE4 in = vload4(i, input + offset); - out = OPERATE(out, in); - } - sum[lid] = out; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = OPERATE(sum[lid], sum[lid + i]); - barrier(CLK_LOCAL_MEM_FENCE); - } - out = sum[0]; -#else - for(int i = 0; i < inputWidth; ++i){ - INPUT_TYPE4 in = vload4(i, input + offset); - out = OPERATE(out, in); - } -#endif - -#ifdef GET_AVG - out = out / inputWidth; -#endif - vstore4(CONVERT_OUTPUT4(out), 0, output + outputOffset); -} - - -__kernel void reduct_height_buf(GLOBAL_SIZE_3_DIMS - __global const INPUT_TYPE* input, - __global OUTPUT_TYPE* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { -#if LOCAL_SIZE > 0 - const int width_local_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_channel_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx); - - const int width_idx = get_group_id(0); - const int batch_idx = batch_channel_idx / outputChannelBlock; - const int channel_idx = batch_channel_idx % outputChannelBlock; + INPUT_TYPE out = (INPUT_TYPE)VALUE; + const int offset = z * dim * inside + y; - const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4; - const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4; +#if REDUCT_LOCAL_SIZE > 4 const int lid = get_local_id(0); - INPUT_TYPE4 local sum[LOCAL_SIZE]; - INPUT_TYPE4 out = (INPUT_TYPE4)VALUE; - for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){ - INPUT_TYPE4 in = vload4(i * inputWidth, input + offset); + INPUT_TYPE local sum[REDUCT_LOCAL_SIZE]; + for(int i = lid; i < dim; i+=REDUCT_LOCAL_SIZE){ + INPUT_TYPE in = (INPUT_TYPE)input[offset + i * inside]; out = OPERATE(out, in); } sum[lid] = out; barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ + for(int i = REDUCT_LOCAL_SIZE/2; i > 0; i /= 2){ if (lid < i) sum[lid] = OPERATE(sum[lid], sum[lid + i]); barrier(CLK_LOCAL_MEM_FENCE); } out = sum[0]; #else - - 
const int width_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_channel_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx); - - const int batch_idx = batch_channel_idx / outputChannelBlock; - const int channel_idx = batch_channel_idx % outputChannelBlock; - - const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4; - const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4; - INPUT_TYPE4 out = (INPUT_TYPE4)VALUE; - for(int i = 0; i < inputHeight; ++i){ - INPUT_TYPE4 in = vload4(i * inputWidth, input + offset); + for(int i = 0; i < dim; ++i){ + INPUT_TYPE in = (INPUT_TYPE)input[offset + i * inside]; out = OPERATE(out, in); } #endif - -#ifdef GET_AVG - out = out / inputHeight; -#endif - vstore4(CONVERT_OUTPUT4(out), 0, output + outputOffset); -} -__kernel void reduct_channel_buf(GLOBAL_SIZE_3_DIMS - __global const INPUT_TYPE* input, - __global OUTPUT_TYPE* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { -#if LOCAL_SIZE > 0 - const int width_local_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx); - const int width_idx = get_group_id(0); - - const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4; - const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4; - int remain = inputChannel - (inputChannelBlock - 1) * 4; - const int lid = get_local_id(0); - INPUT_TYPE local sum[LOCAL_SIZE]; - INPUT_TYPE4 out = (INPUT_TYPE4)VALUE; - INPUT_TYPE4 in; - INPUT_TYPE *inPtr = (INPUT_TYPE*)∈ - for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){ - in = vload4(i * inputWidth * inputHeight, input + offset); - out = OPERATE(out, in); - } - out.x = OPERATE(out.x, out.y); - out.x = OPERATE(out.x, out.z); - out.x = OPERATE(out.x, out.w); - sum[lid] = out.x; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = OPERATE(sum[lid], sum[lid + i]); - barrier(CLK_LOCAL_MEM_FENCE); - } - out.x = sum[0]; - in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset); - for(int j = 0; j < remain; ++j){ - out.x = OPERATE(out.x, inPtr[j]); - } -#ifdef GET_AVG - out.x = out.x / inputChannel; -#endif - output[outputOffset] = (OUTPUT_TYPE)out.x; - -#else - const int width_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx); - - const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4; - const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4; - int remain = inputChannel - (inputChannelBlock - 1) * 4; - - INPUT_TYPE out = (INPUT_TYPE)VALUE; - INPUT_TYPE4 in; - INPUT_TYPE *inPtr = (INPUT_TYPE*)∈ - for(int i = 0; i < inputChannelBlock - 1; ++i){ - in = vload4(i * inputWidth * 
inputHeight, input + offset); - for(int j = 0; j < 4; ++j){ - out = OPERATE(out, inPtr[j]); - } - } - in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset); - for(int j = 0; j < remain; ++j){ - out = OPERATE(out, inPtr[j]); - } #ifdef GET_AVG - out = out / inputChannel; -#endif - output[outputOffset] = (OUTPUT_TYPE)out; + out = out / dim; #endif + output[z * inside + y] = (OUTPUT_TYPE)out; } -__kernel void reduct_channel_dim1_buf(GLOBAL_SIZE_3_DIMS - __global const INPUT_TYPE* input, - __global OUTPUT_TYPE* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { -#if LOCAL_SIZE > 0 - const int width_local_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx); - const int width_idx = get_group_id(0); +__kernel void reduct_v4_buf(GLOBAL_SIZE_3_DIMS + __global const INPUT_TYPE *input, + __global OUTPUT_TYPE *output, + __private const int inside, + __private const int outside, + __private const int dim) { + + const int x = get_global_id(0); + const int y = get_global_id(1); // inside + const int z = get_global_id(2); // outside + DEAL_NON_UNIFORM_DIM3(x, y, z); - const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4; - const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx); - int remain = inputChannel - (inputChannelBlock - 1) * 4; - const int lid = get_local_id(0); - INPUT_TYPE local sum[LOCAL_SIZE]; INPUT_TYPE4 out = (INPUT_TYPE4)VALUE; - INPUT_TYPE4 in; - INPUT_TYPE *inPtr = (INPUT_TYPE*)∈ - for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){ - in = vload4(i * inputWidth * inputHeight, input + offset); - out = OPERATE(out, in); - } - out.x = OPERATE(out.x, out.y); - out.x = OPERATE(out.x, out.z); - out.x = OPERATE(out.x, out.w); - sum[lid] = out.x; - barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ - if (lid < i) - sum[lid] = OPERATE(sum[lid], sum[lid + i]); - barrier(CLK_LOCAL_MEM_FENCE); - } - out.x = sum[0]; - in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset); - for(int j = 0; j < remain; ++j){ - out.x = OPERATE(out.x, inPtr[j]); - } -#ifdef GET_AVG - out.x = out.x / inputChannel; -#endif - output[outputOffset] = (OUTPUT_TYPE)out.x; + const int offset = z * dim * inside + (y << 2); -#else - const int width_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int batch_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx); - const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4; - const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx); - int remain = inputChannel - (inputChannelBlock - 1) * 4; - INPUT_TYPE out = (INPUT_TYPE)VALUE; - INPUT_TYPE4 in; - INPUT_TYPE *inPtr = (INPUT_TYPE*)∈ - for(int i = 0; i < inputChannelBlock - 1; ++i){ - in = vload4(i * inputWidth * inputHeight, input + offset); - for(int j = 0; j < 4; ++j){ - out = OPERATE(out, inPtr[j]); - } - } - in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset); - for(int j = 
0; j < remain; ++j){ - out = OPERATE(out, inPtr[j]); - } -#ifdef GET_AVG - out = out / inputChannel; -#endif - output[outputOffset] = (OUTPUT_TYPE)out; -#endif -} - - -__kernel void reduct_batch_buf(GLOBAL_SIZE_3_DIMS - __global const INPUT_TYPE* input, - __global OUTPUT_TYPE* output, - __private const int inputWidth, - __private const int inputHeight, - __private const int inputChannel, - __private const int inputBatch, - __private const int inputChannelBlock, - __private const int oututWidth, - __private const int outputHeight, - __private const int outputChannel, - __private const int outputChannelBlock - ) { -#if LOCAL_SIZE > 0 - const int width_local_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int channel_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx); - const int width_idx = get_group_id(0); - - const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4; - const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4; - int batchOffset = inputChannelBlock * inputHeight * inputWidth; +#if REDUCT_LOCAL_SIZE > 4 const int lid = get_local_id(0); - INPUT_TYPE4 local sum[LOCAL_SIZE]; - INPUT_TYPE4 out = (INPUT_TYPE4)VALUE; - for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){ - INPUT_TYPE4 in = vload4(i * batchOffset, input + offset); + INPUT_TYPE4 local sum[REDUCT_LOCAL_SIZE]; + for(int i = lid; i < dim; i+=REDUCT_LOCAL_SIZE){ + INPUT_TYPE4 in = vload4(0, input + offset + i * inside); out = OPERATE(out, in); } sum[lid] = out; barrier(CLK_LOCAL_MEM_FENCE); - for(int i = LOCAL_SIZE/2; i > 0; i /= 2){ + for(int i = REDUCT_LOCAL_SIZE/2; i > 0; i /= 2){ if (lid < i) sum[lid] = OPERATE(sum[lid], sum[lid + i]); barrier(CLK_LOCAL_MEM_FENCE); } out = sum[0]; -#ifdef GET_AVG - out = out / inputBatch; -#endif - vstore4(CONVERT_OUTPUT4(out), 0, output + outputOffset); #else - const int width_idx = get_global_id(0); - const int height_idx = get_global_id(1); - const int channel_idx = get_global_id(2); - - DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx); - - const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4; - const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4; - int batchOffset = inputChannelBlock * inputHeight * inputWidth; - INPUT_TYPE4 out = (INPUT_TYPE4)VALUE; - for(int i = 0; i < inputBatch; ++i){ - INPUT_TYPE4 in = vload4(i * batchOffset, input + offset); + for(int i = 0; i < dim; ++i){ + INPUT_TYPE4 in = vload4(0, input + offset + i * inside); out = OPERATE(out, in); } -#ifdef GET_AVG - out = out / inputBatch; #endif - vstore4(CONVERT_OUTPUT4(out), 0, output + outputOffset); + +#ifdef GET_AVG + out = out / (INPUT_TYPE4)dim; #endif + vstore4(CONVERT_OUTPUT4(out), 0, output + z * inside + (y << 2)); } diff --git a/source/backend/opencl/execution/cl/scale_buf.cl b/source/backend/opencl/execution/cl/scale_buf.cl index 72d3b90fd..f1d722d36 100644 --- a/source/backend/opencl/execution/cl/scale_buf.cl +++ b/source/backend/opencl/execution/cl/scale_buf.cl @@ -17,26 +17,25 @@ __kernel void scale_buf(GLOBAL_SIZE_2_DIMS __global const FLOAT* bias, #endif __global FLOAT* output, - __private const int4 shape) {//N, H, W, C4 + __private const int channelBlock, + __private const int batch, + __private const int inside) { - const int out_w_c_idx = get_global_id(0); - 
const int out_h_b_idx = get_global_id(1); + const int x = get_global_id(0); // inside(width * height) + const int y = get_global_id(1); // channelBlock * batch - DEAL_NON_UNIFORM_DIM2(out_w_c_idx, out_h_b_idx); + DEAL_NON_UNIFORM_DIM2(x, y); - const int out_b_idx = out_h_b_idx / shape.y; - const int out_h_idx = out_h_b_idx % shape.y; - const int out_c_idx = out_w_c_idx / shape.z; - const int out_w_idx = out_w_c_idx % shape.z; - - const int offset = (((out_b_idx * shape.w + out_c_idx) * shape.y + out_h_idx) * shape.z + out_w_idx) * 4; + const int out_c_idx = y % channelBlock; + const int out_b_idx = y / channelBlock; + const int offset = ((out_b_idx + out_c_idx * batch) * inside + x) * 4; COMPUTE_FLOAT4 in_value = CONVERT_COMPUTE_FLOAT4(vload4(0, input+offset)); COMPUTE_FLOAT4 scale_value = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, scale)); -#ifdef BIAS + #ifdef BIAS COMPUTE_FLOAT4 bias_value = CONVERT_COMPUTE_FLOAT4(vload4(out_c_idx, bias)); COMPUTE_FLOAT4 out_value = in_value * scale_value + bias_value; -#else + #else COMPUTE_FLOAT4 out_value = in_value * scale_value; -#endif + #endif vstore4(CONVERT_FLOAT4(out_value), 0, output+offset); } diff --git a/source/backend/opencl/execution/cl/self_attention_buf.cl b/source/backend/opencl/execution/cl/self_attention_buf.cl index 8dc4f8b78..2b6cf9d9d 100644 --- a/source/backend/opencl/execution/cl/self_attention_buf.cl +++ b/source/backend/opencl/execution/cl/self_attention_buf.cl @@ -53,6 +53,7 @@ __kernel void split_transpose_qkv(GLOBAL_SIZE_3_DIMS __private const int seq_len, __private const int head_num, __private const int head_dim, + __private const int batch, __private const int seq_index ) { const int sl = get_global_id(0); // seqLen_4 @@ -80,8 +81,8 @@ __kernel void split_transpose_qkv(GLOBAL_SIZE_3_DIMS return; } - const int offset_inp = (((b * seq_len_4 + seq_index * seq_len_piece / 4 + sl) * head_num + hn) * 3 * head_dim + 4 * hd) * 4; - + const int offset_inp = ((((seq_index * seq_len_piece / 4 + sl) * batch + b) * head_num + hn) * 3 * head_dim + 4 * hd) * 4; + if(sl * 4 < seq_len_piece) { FLOAT4 temp_0 = vload4(0, input + offset_inp); FLOAT4 temp_1 = vload4(0, input + offset_inp + 4); @@ -125,7 +126,8 @@ __kernel void split_transpose_qkv(GLOBAL_SIZE_3_DIMS } - const int offset_inp = (((b * seq_len_4 + sl) * head_num + hn) * 3 * head_dim + 4 * hd) * 4; + const int offset_inp = (((sl * batch + b) * head_num + hn) * 3 * head_dim + 4 * hd) * 4; + if(sl * 4 < seq_len_piece) { FLOAT4 temp_0 = vload4(0, input + offset_inp); @@ -238,7 +240,7 @@ __kernel void softmax_inside(GLOBAL_SIZE_3_DIMS const int out_offset = (outside * shape.z + 0) * shape.y + axis; #endif /*Compute Result */ - for (int i=lid; i inside_len){ + for(int i = lid + inside_len; i < shape.z; i+=SOFTMAX_LOCAL_SIZE){ + #ifdef OUTPUT_TRANSPOSE + output[out_offset+ i*shape.y] = (FLOAT)0; + #else + output[offset+ i] = (FLOAT)0; + #endif + } + } } // [N X Y4 4] -> [N Y X] -__kernel void trans_3d_buf(__global const FLOAT* input, +__kernel void trans_3d_buf(GLOBAL_SIZE_3_DIMS + __global const FLOAT* input, __global FLOAT* output, __private const int batch, __private const int width, __private const int height ) { int b = get_global_id(2); - - const int w = get_global_id(0) << 3; - const int h = get_global_id(1) << 3; + int w = get_global_id(0); + int h = get_global_id(1); + DEAL_NON_UNIFORM_DIM3(w, h, b); + + w = w << 3; + h = h << 3; const int inp_offset = (b * width + w) * height + h; const int out_offset = (b * height + h) * width + w; @@ -290,6 +305,7 @@ __kernel void 
clip_transpose_qkv(GLOBAL_SIZE_3_DIMS __private const int seq_len_piece, __private const int head_num, __private const int head_dim, + __private const int batch, __private const int seq_index ) { @@ -311,8 +327,8 @@ __kernel void clip_transpose_qkv(GLOBAL_SIZE_3_DIMS const int offset_inp = ((b * head_num + hn) * head_dim_pack + 4 * hd) * seq_len_pack + 4 * sl; - const int offset_out = (((b * seq_len_4 + seq_index * seq_len_piece / 4 + sl) * head_num + hn) * head_dim + 4 * hd) * 4; - + const int offset_out = ((((seq_index * seq_len_piece / 4 + sl) * batch + b) * head_num + hn) * head_dim + 4 * hd) * 4; + // Q FLOAT4 temp_0 = vload4(0, input + offset_inp); FLOAT4 temp_1 = vload4(0, input + offset_inp + seq_len_pack); diff --git a/source/backend/opencl/execution/cl/softmax_buf.cl b/source/backend/opencl/execution/cl/softmax_buf.cl index 52dd91c61..fa30bf5e2 100644 --- a/source/backend/opencl/execution/cl/softmax_buf.cl +++ b/source/backend/opencl/execution/cl/softmax_buf.cl @@ -12,173 +12,120 @@ } -__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS +__kernel void softmax_in1_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT *input, __global FLOAT *output, - __private const int remain_channels, - __private const int4 shape) {//NCHW + __private const int inside, + __private const int outside, + __private const int dim) { const int x = get_global_id(0); - const int w = get_global_id(1); - const int bh = get_global_id(2); - DEAL_NON_UNIFORM_DIM3(x, w, bh); + const int y = get_global_id(1); // inside = 1 + const int z = get_global_id(2); // outside + DEAL_NON_UNIFORM_DIM3(x, y, z); - const int batch_idx = bh / shape.z; - const int height_idx = bh % shape.z; - const int offset = (((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+w)*4; + const int offset = z * dim + y; + const int dim4 = (dim + 3) / 4; + const int loop_end = max(0, dim4 - 1); #if SOFTMAX_LOCAL_SIZE >= 4 int lid = get_local_id(0); - COMPUTE_FLOAT4 local sum[SOFTMAX_LOCAL_SIZE]; + COMPUTE_FLOAT local sum[SOFTMAX_LOCAL_SIZE]; + // compute maxvalue COMPUTE_FLOAT4 maxValue = (COMPUTE_FLOAT4)-FLT_MAX; - for (int i = lid; i < shape.y - 1; i+=SOFTMAX_LOCAL_SIZE) { - maxValue = fmax(maxValue, CONVERT_COMPUTE_FLOAT4(vload4(i*shape.z*shape.w, input+offset))); + for (int i = lid; i < loop_end; i+=SOFTMAX_LOCAL_SIZE) { + maxValue = fmax(maxValue, CONVERT_COMPUTE_FLOAT4(vload4(i, input+offset))); } - sum[lid] = maxValue; + sum[lid] = fmax(fmax(fmax(maxValue.x, maxValue.y), maxValue.z), maxValue.w); barrier(CLK_LOCAL_MEM_FENCE); for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){ if (lid < i) sum[lid] = fmax(sum[lid], sum[lid + i]); barrier(CLK_LOCAL_MEM_FENCE); } - maxValue = sum[0]; - - maxValue.x = fmax(maxValue.x, maxValue.y); - maxValue.x = fmax(maxValue.x, maxValue.z); - maxValue.x = fmax(maxValue.x, maxValue.w); - - COMPUTE_FLOAT4 input_data = CONVERT_COMPUTE_FLOAT4(vload4((shape.y - 1) *shape.z*shape.w, input+offset)); - if (remain_channels == 0) { - maxValue.x = fmax(maxValue.x, input_data.x); - maxValue.x = fmax(maxValue.x, input_data.y); - maxValue.x = fmax(maxValue.x, input_data.z); - maxValue.x = fmax(maxValue.x, input_data.w); - } else if (remain_channels == 1) { - maxValue.x = fmax(maxValue.x, input_data.z); - maxValue.x = fmax(maxValue.x, input_data.y); - maxValue.x = fmax(maxValue.x, input_data.x); - } else if (remain_channels == 2) { - maxValue.x = fmax(maxValue.x, input_data.y); - maxValue.x = fmax(maxValue.x, input_data.x); - } else if (remain_channels == 3) { - maxValue.x = fmax(maxValue.x, input_data.x); + maxValue.x = sum[0]; + 
for(int i = loop_end << 2; i < dim; ++i){ + maxValue.x = fmax(maxValue.x, (COMPUTE_FLOAT)(input[offset+i])); } + // compute sumvalue COMPUTE_FLOAT4 sumValue = (COMPUTE_FLOAT4)0; - for (int i = lid; i < shape.y - 1; i+=SOFTMAX_LOCAL_SIZE) { - sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(i*shape.z*shape.w, input+offset)) - (COMPUTE_FLOAT4)maxValue.x); + for (int i = lid; i < loop_end; i+=SOFTMAX_LOCAL_SIZE) { + sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(i, input+offset)) - (COMPUTE_FLOAT4)maxValue.x); } - sum[lid] = sumValue; + sum[lid] = sumValue.x + sumValue.y + sumValue.z + sumValue.w; barrier(CLK_LOCAL_MEM_FENCE); for(int i = SOFTMAX_LOCAL_SIZE/2; i > 0; i /= 2){ if (lid < i) sum[lid] = sum[lid] + sum[lid + i]; barrier(CLK_LOCAL_MEM_FENCE); } - sumValue = sum[0]; - sumValue.x = sumValue.x + sumValue.y + sumValue.z + sumValue.w; - + sumValue.x = sum[0]; + for(int i = loop_end << 2; i < dim; ++i){ + sumValue.x += exp((COMPUTE_FLOAT)(input[offset+i]) - maxValue.x); + } - input_data -= maxValue.x; - if (remain_channels == 0) { - sumValue.x += exp(input_data.w); - sumValue.x += exp(input_data.z); - sumValue.x += exp(input_data.y); - sumValue.x += exp(input_data.x); - } else if (remain_channels == 1) { - sumValue.x += exp(input_data.z); - sumValue.x += exp(input_data.y); - sumValue.x += exp(input_data.x); - } else if (remain_channels == 2) { - sumValue.x += exp(input_data.y); - sumValue.x += exp(input_data.x); - } else if (remain_channels == 3) { - sumValue.x += exp(input_data.x); + // store result + for(int i = lid; i < loop_end; i+=SOFTMAX_LOCAL_SIZE){ + vstore4(CONVERT_FLOAT4(exp(CONVERT_COMPUTE_FLOAT4(vload4(i, input+offset)) - (COMPUTE_FLOAT4)maxValue.x) / (COMPUTE_FLOAT4)sumValue.x), 0, output + offset + i * 4); } - for(int i = lid; i < shape.y; i+=SOFTMAX_LOCAL_SIZE){ - COMPUTE_FLOAT4 value = exp(CONVERT_COMPUTE_FLOAT4(vload4(i*shape.z*shape.w, input+offset)) - maxValue.x) / sumValue.x; - vstore4(CONVERT_FLOAT4(value), i*shape.z*shape.w, output+offset); + for(int i = loop_end << 2; i < dim; ++i){ + output[offset + i] = (FLOAT)exp((COMPUTE_FLOAT)(input[offset + i]) - maxValue.x) / sumValue.x; } #else + // compute maxvalue COMPUTE_FLOAT4 maxValue = (COMPUTE_FLOAT4)-FLT_MAX; - for (int i = 0; i < shape.y - 1; i++) { - maxValue = fmax(maxValue, CONVERT_COMPUTE_FLOAT4(vload4(i*shape.z*shape.w, input+offset))); + for (int i = 0; i < loop_end; i++) { + maxValue = fmax(maxValue, CONVERT_COMPUTE_FLOAT4(vload4(i, input+offset))); } - - maxValue.x = fmax(maxValue.x, maxValue.y); - maxValue.x = fmax(maxValue.x, maxValue.z); - maxValue.x = fmax(maxValue.x, maxValue.w); - - COMPUTE_FLOAT4 input_data = CONVERT_COMPUTE_FLOAT4(vload4((shape.y - 1) *shape.z*shape.w, input+offset)); - if (remain_channels == 0) { - maxValue.x = fmax(maxValue.x, input_data.x); - maxValue.x = fmax(maxValue.x, input_data.y); - maxValue.x = fmax(maxValue.x, input_data.z); - maxValue.x = fmax(maxValue.x, input_data.w); - } else if (remain_channels == 1) { - maxValue.x = fmax(maxValue.x, input_data.z); - maxValue.x = fmax(maxValue.x, input_data.y); - maxValue.x = fmax(maxValue.x, input_data.x); - } else if (remain_channels == 2) { - maxValue.x = fmax(maxValue.x, input_data.y); - maxValue.x = fmax(maxValue.x, input_data.x); - } else if (remain_channels == 3) { - maxValue.x = fmax(maxValue.x, input_data.x); + maxValue.x = fmax(fmax(fmax(maxValue.x, maxValue.y), maxValue.z), maxValue.w); + for(int i = loop_end << 2; i < dim; ++i){ + maxValue.x = fmax(maxValue.x, (COMPUTE_FLOAT)(input[offset+i])); } - + + // compute sumvalue 
COMPUTE_FLOAT4 sumValue = (COMPUTE_FLOAT4)0; - for (int i = 0; i < shape.y - 1; i++) { - sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(i*shape.z*shape.w, input+offset)) - (COMPUTE_FLOAT4)maxValue.x); + for (int i = 0; i < loop_end; i++) { + sumValue += exp(CONVERT_COMPUTE_FLOAT4(vload4(i, input+offset)) - (COMPUTE_FLOAT4)maxValue.x); } sumValue.x = sumValue.x + sumValue.y + sumValue.z + sumValue.w; - input_data -= maxValue.x; - if (remain_channels == 0) { - sumValue.x += exp(input_data.w); - sumValue.x += exp(input_data.z); - sumValue.x += exp(input_data.y); - sumValue.x += exp(input_data.x); - } else if (remain_channels == 1) { - sumValue.x += exp(input_data.z); - sumValue.x += exp(input_data.y); - sumValue.x += exp(input_data.x); - } else if (remain_channels == 2) { - sumValue.x += exp(input_data.y); - sumValue.x += exp(input_data.x); - } else if (remain_channels == 3) { - sumValue.x += exp(input_data.x); + for(int i = loop_end << 2; i < dim; ++i){ + sumValue.x += exp((COMPUTE_FLOAT)(input[offset+i]) - maxValue.x); + } + + // store result + for(int i = 0; i < loop_end; i++){ + vstore4(CONVERT_FLOAT4(exp(CONVERT_COMPUTE_FLOAT4(vload4(i, input+offset)) - (COMPUTE_FLOAT4)maxValue.x) / (COMPUTE_FLOAT4)sumValue.x), 0, output + offset + i * 4); } - for(int i = 0; i < shape.y; i++){ - COMPUTE_FLOAT4 value = exp(CONVERT_COMPUTE_FLOAT4(vload4(i*shape.z*shape.w, input+offset)) - maxValue.x) / sumValue.x; - vstore4(CONVERT_FLOAT4(value), i*shape.z*shape.w, output+offset); + for(int i = loop_end << 2; i < dim; ++i){ + output[offset + i] = (FLOAT)exp((COMPUTE_FLOAT)(input[offset + i]) - maxValue.x) / sumValue.x; } #endif } +__kernel void softmax_buf(GLOBAL_SIZE_3_DIMS + __global const FLOAT *input, + __global FLOAT *output, + __private const int inside, + __private const int outside, + __private const int dim) { -__kernel void softmax_height(GLOBAL_SIZE_3_DIMS - __global const FLOAT *input, - __global FLOAT *output, - __private const int remain_channels, - __private const int4 shape // NCHW - ) { const int x = get_global_id(0); - const int wc = get_global_id(1); - const int b = get_global_id(2); - DEAL_NON_UNIFORM_DIM3(x, wc, b); + const int y = get_global_id(1); // inside + const int z = get_global_id(2); // outside + DEAL_NON_UNIFORM_DIM3(x, y, z); - const int c = wc / shape.w; - const int w = wc % shape.w; - const int offset = (((b*shape.y+c)*shape.z+0)*shape.w+w)*4; + const int offset = z * dim * inside + y; #if SOFTMAX_LOCAL_SIZE >= 4 int lid = get_local_id(0); - COMPUTE_FLOAT4 local sum[SOFTMAX_LOCAL_SIZE]; - - /*Compute Max */ - COMPUTE_FLOAT4 maxValue = (COMPUTE_FLOAT4)(-FLT_MAX); - for (int i=lid; i 0; i /= 2){ @@ -187,11 +134,10 @@ __kernel void softmax_height(GLOBAL_SIZE_3_DIMS barrier(CLK_LOCAL_MEM_FENCE); } maxValue = sum[0]; - - /*Compute Exp Sum*/ - COMPUTE_FLOAT4 sumValue = (COMPUTE_FLOAT4)0; - for (int i=lid; i= 4 int lid = get_local_id(0); COMPUTE_FLOAT4 local sum[SOFTMAX_LOCAL_SIZE]; - - /*Compute Max */ - COMPUTE_FLOAT4 maxValue = (COMPUTE_FLOAT4)(-FLT_MAX); - for (int i=lid; i 0; i /= 2){ @@ -259,11 +196,10 @@ __kernel void softmax_width(GLOBAL_SIZE_3_DIMS barrier(CLK_LOCAL_MEM_FENCE); } maxValue = sum[0]; - - /*Compute Exp Sum*/ + COMPUTE_FLOAT4 sumValue = (COMPUTE_FLOAT4)0; - for (int i=lid; i> 2; - const int area_4 = (shape.z + 3) >> 2; - const int in_offset = ((b * channel_4 + c_4) * area_4 * 2 + hw_4) * 16; - const int out_offset = ((b * channel_4 + c_4) * area_4 + hw_4) * 16; - - float16 valueL = convert_float16(vload16(0, input + in_offset)); - float16 valueR = 
convert_float16(vload16(area_4, input + in_offset)); - - #ifdef DOUBLE_INPUTS - float4 valueConstL = convert_float4(vload4(hw, input1)); - float4 valueConstR = convert_float4(vload4(area_4+hw, input1)); - valueL += (float16)((float4)valueConstL.x, (float4)valueConstL.y, (float4)valueConstL.z, (float4)valueConstL.w); - valueR += (float16)((float4)valueConstR.x, (float4)valueConstR.y, (float4)valueConstR.z, (float4)valueConstR.w); - #endif - float16 out = (erf(valueR * (float16)0.7071067932881648) + (float16)1.0) * valueR * (float16)0.5; - out *= valueL; - vstore16(CONVERT_FLOAT16(out), 0, output + out_offset); +#elif defined (WH_4) + + const int in_offset = bc * shape.z * 2 + h * 4; + const int out_offset = bc * shape.z + h * 4; + + float4 valueL = convert_float4(vload4(0, input + in_offset)); + float4 valueR = convert_float4(vload4(0, input + in_offset + shape.z)); + + #ifdef DOUBLE_INPUTS + float4 valueConstL = convert_float4(vload4(h, input1)); + float4 valueConstR = convert_float4(vload4(h, input1 + shape.z)); + valueL += valueConstL; + valueR += valueConstR; + #endif + float4 out = (erf(valueR * (float4)0.7071067932881648) + (float4)1.0) * valueR * (float4)0.5; + out *= valueL; + vstore4(CONVERT_FLOAT4(out), 0, output + out_offset); #else - const int hw = pos.z; - - const int channel_4 = (shape.y + 3) >> 2; - const int in_offset = ((b * channel_4 + c_4) * shape.z * 2 + hw) * 4; - const int out_offset = ((b * channel_4 + c_4) * shape.z + hw) * 4; - - float4 valueL = convert_float4(vload4(0, input + in_offset)); - float4 valueR = convert_float4(vload4(shape.z, input + in_offset)); - - #ifdef DOUBLE_INPUTS - float valueConstL = input1[hw]; - float valueConstR = input1[shape.z+hw]; - valueL += (float4)valueConstL; - valueR += (float4)valueConstR; - #endif - float4 out = (erf(valueR * (float4)0.7071067932881648) + (float4)1.0) * valueR * (float4)0.5; - out *= valueL; - vstore4(CONVERT_FLOAT4(out), 0, output + out_offset); + const int in_offset = bc * shape.z * 2 + h; + const int out_offset = bc * shape.z + h; + + float valueL = (float)input[in_offset]; + float valueR = (float)input[in_offset + shape.z]; + + #ifdef DOUBLE_INPUTS + float valueConstL = input1[h]; + float valueConstR = input1[shape.z+h]; + valueL += valueConstL; + valueR += valueConstR; + #endif + float out = (erf(valueR * 0.7071067932881648) + 1.0) * valueR * 0.5; + out *= valueL; + output[out_offset] = out; #endif } } diff --git a/source/backend/opencl/execution/cl/unary_buf.cl b/source/backend/opencl/execution/cl/unary_buf.cl index 67565b1b3..9a93d83ce 100644 --- a/source/backend/opencl/execution/cl/unary_buf.cl +++ b/source/backend/opencl/execution/cl/unary_buf.cl @@ -2,11 +2,11 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif -#define GLOBAL_SIZE_3_DIMS \ - __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, +#define GLOBAL_SIZE_2_DIMS \ + __private const int global_size_dim0, __private const int global_size_dim1, -#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \ - if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \ +#define DEAL_NON_UNIFORM_DIM2(input1, input2) \ + if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \ return; \ } inline float4 gelu(float4 in){ @@ -17,22 +17,35 @@ inline float4 gelu(float4 in){ return (1.0f + dst) * in * 0.5f; } -__kernel void unary_buf(GLOBAL_SIZE_3_DIMS +__kernel void unary_buf(GLOBAL_SIZE_2_DIMS __global const INPUT_TYPE *input, __global OUTPUT_TYPE 
*output, - __private const int height) { - const int channel_block_idx = get_global_id(0); - const int w = get_global_id(1); - const int hb = get_global_id(2); + __private const int size) { + const int x = get_global_id(0); + const int y = get_global_id(1); - DEAL_NON_UNIFORM_DIM3(channel_block_idx, w, hb); - - const int batch_idx = hb / height; - const int height_idx = hb % height; - - const int offset = (((batch_idx*global_size_dim0+channel_block_idx)*height+height_idx)*global_size_dim1+w) * 4; - float4 in = convert_float4(vload4(0, input+offset)); - float4 out = OPERATOR; - vstore4(CONVERT_OUTPUT4(out), 0, output+offset); + DEAL_NON_UNIFORM_DIM2(x, y); + const int offset = x << 2; +#ifdef PACK_LEAVE + if(offset + 3 >= size){ + int remain = size - offset; + float4 in; + float* in_ptr = (float*)∈ + for(int i = 0; i < remain; ++i){ + in_ptr[i] = (float)input[offset + i]; + } + float4 out = OPERATOR; + float* out_ptr = (float*)&out; + for(int i = 0; i < remain; ++i){ + output[offset + i] = (OUTPUT_TYPE)out_ptr[i]; + } + }else { +#endif + float4 in = convert_float4(vload4(0, input + offset)); + float4 out = OPERATOR; + vstore4(CONVERT_OUTPUT4(out), 0, output + offset); +#ifdef PACK_LEAVE + } +#endif } diff --git a/source/backend/opencl/execution/cl/unary_subgroup_buf.cl b/source/backend/opencl/execution/cl/unary_subgroup_buf.cl index d2d8b9528..ffdc8f8f3 100644 --- a/source/backend/opencl/execution/cl/unary_subgroup_buf.cl +++ b/source/backend/opencl/execution/cl/unary_subgroup_buf.cl @@ -23,6 +23,7 @@ __kernel void unary_buf_c4_c4(GLOBAL_SIZE_3_DIMS __private const int width, __private const int height, __private const int channel, + __private const int batch, __private const int input_pad_left, __private const int input_pad_right, __private const int output_pad_left, __private const int output_pad_right) { const int channel_block_idx = get_global_id(0); @@ -33,9 +34,8 @@ __kernel void unary_buf_c4_c4(GLOBAL_SIZE_3_DIMS const int batch_idx = hb / height; const int height_idx = hb % height; - const int channel4 = (channel + 3) / 4; - const int offset = (((batch_idx*channel4+channel_block_idx)*height+height_idx)*width+w) * 4; + const int offset = (((batch_idx+channel_block_idx*batch)*height+height_idx)*width+w) * 4; float4 in = convert_float4(vload4(0, input+offset)); float4 out = OPERATOR; vstore4(CONVERT_OUTPUT4(out), 0, output+offset); @@ -47,6 +47,7 @@ __kernel void unary_buf_c4_c16(GLOBAL_SIZE_3_DIMS __private const int width, __private const int height, __private const int channel, + __private const int batch, __private const int input_pad_left, __private const int input_pad_right, __private const int output_pad_left, __private const int output_pad_right) { const int channel_block_idx = get_global_id(0); @@ -58,11 +59,10 @@ __kernel void unary_buf_c4_c16(GLOBAL_SIZE_3_DIMS const int batch_idx = hb / height; const int height_idx = hb % height; const int dst_width = output_pad_left+width+output_pad_right; - const int channel4 = (channel + 3) / 4; const int channel16 = (channel + 15) / 16; const int channe_out_idx = channel_block_idx >> 2; - const int offset = (((batch_idx*channel4+channel_block_idx)*height+height_idx)*width+w) * 4; + const int offset = (((batch_idx+channel_block_idx*batch)*height+height_idx)*width+w) * 4; const int dst_offset = (((batch_idx*channel16+channe_out_idx)*height+height_idx)*dst_width+w+output_pad_left) * 16 + (channel_block_idx % 4) * 4; float4 in = convert_float4(vload4(0, input+offset)); float4 out = OPERATOR; @@ -86,6 +86,7 @@ __kernel void 
unary_buf_c16_c16(GLOBAL_SIZE_3_DIMS __private const int width, __private const int height, __private const int channel, + __private const int batch, __private const int input_pad_left, __private const int input_pad_right, __private const int output_pad_left, __private const int output_pad_right) { const int channel_idx = get_group_id(0); @@ -132,6 +133,7 @@ __kernel void unary_buf_c16_c4(GLOBAL_SIZE_3_DIMS __private const int width, __private const int height, __private const int channel, + __private const int batch, __private const int input_pad_left, __private const int input_pad_right, __private const int output_pad_left, __private const int output_pad_right) { const int channel_idx = get_group_id(0); @@ -142,12 +144,11 @@ __kernel void unary_buf_c16_c4(GLOBAL_SIZE_3_DIMS const int batch_idx = hb / height; const int height_idx = hb % height; const int src_width = width + input_pad_left + input_pad_right; - const int channel4 = (channel + 3) / 4; const int channel16 = (channel + 15) / 16; const int src_offset = (((batch_idx*channel16+channel_idx)*height+height_idx)*src_width+w+input_pad_left) * 16; - const int dst_offset = (((batch_idx*channel4+(channel_idx<<2))*height+height_idx)*width+w) * 4; + const int dst_offset = (((batch_idx+(channel_idx<<2)*batch)*height+height_idx)*width+w) * 4; const int height_width = height * width * 4; float4 in = convert_float4(AS_INPUT_DATA4(INTEL_SUB_GROUP_READ4((__global INTEL_DATA*)(input + src_offset)))); diff --git a/source/backend/opencl/execution/cl/winogradTransform_buf.cl b/source/backend/opencl/execution/cl/winogradTransform_buf.cl index 0caf484b0..87424d940 100644 --- a/source/backend/opencl/execution/cl/winogradTransform_buf.cl +++ b/source/backend/opencl/execution/cl/winogradTransform_buf.cl @@ -97,6 +97,7 @@ __kernel void winoTransSrcBuf2_3_1(GLOBAL_SIZE_DIM2 __private const int srcWidth, // 6 __private const int srcHeight, __private const int srcChannelC4, __private const int dstHeightPad, __private const int srcChannelPad, + __private const int batch, __private const int batchOffset) { int2 pos = (int2)(get_global_id(0), get_global_id(1)); UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); @@ -133,7 +134,7 @@ __kernel void winoTransSrcBuf2_3_1(GLOBAL_SIZE_DIM2 FLOAT4 S23; FLOAT4 S33; - int inp_offset = (((batchIndex * srcChannelC4 + srcZ) * srcHeight + syStart) * srcWidth + sxStart) * 4; + int inp_offset = (((batchIndex + srcZ * batch) * srcHeight + syStart) * srcWidth + sxStart) * 4; { int sx = 0 + sxStart; int sy = 0 + syStart; @@ -395,6 +396,7 @@ __kernel void winoTransDstBuf2_3_1(GLOBAL_SIZE_DIM2 __private const int dstChannelC4, __private const int srcWidthPad, __private const int dstChannelPad, + __private const int batch, __private const int batchOffset) { int2 pos = (int2)(get_global_id(0), get_global_id(1)); UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); @@ -447,7 +449,7 @@ __kernel void winoTransDstBuf2_3_1(GLOBAL_SIZE_DIM2 //NC4HW4 [batch, dstChannelC4, dstHeight, dstWidth] //index: [batchIndex, oz, oyStart, oxStart] - int out_offset = (((batchIndex * dstChannelC4+ oz) * dstHeight + oyStart) * dstWidth + oxStart)*4; + int out_offset = (((batchIndex + oz * batch) * dstHeight + oyStart) * dstWidth + oxStart)*4; { int ox = oxStart + 0; int oy = oyStart + 0; diff --git a/source/backend/opencl/execution/cl/winogradTransform_subgroup_buf.cl b/source/backend/opencl/execution/cl/winogradTransform_subgroup_buf.cl index 4a7d903b3..05833ba41 100644 --- a/source/backend/opencl/execution/cl/winogradTransform_subgroup_buf.cl +++ 
b/source/backend/opencl/execution/cl/winogradTransform_subgroup_buf.cl @@ -20,6 +20,7 @@ __kernel void winoTransSrcBuf2_3_1_c16_c16(GLOBAL_SIZE_DIM2 __private const int srcWidth, // 6 __private const int srcHeight, __private const int srcChannelC4, __private const int srcChannelC16, __private const int dstHeight, __private const int batchOffset, + __private const int batch, __private const int input_pad_left, __private const int input_pad_right) { int2 pos = (int2)(get_global_id(0), get_global_id(1)); UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); @@ -101,6 +102,7 @@ __kernel void winoTransDstBuf2_3_1_c16_c16(GLOBAL_SIZE_DIM2 __private const int dstHeight, __private const int dstChannelC4,__private const int dstChannelC16,__private const int srcWidth, __private const int batchOffset, + __private const int batch, __private const int output_pad_left, __private const int output_pad_right) { int2 pos = (int2)(get_global_id(0), get_global_id(1)); UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); @@ -225,6 +227,7 @@ __kernel void winoTransSrcBuf2_3_1_c4_c16(GLOBAL_SIZE_DIM2 __private const int srcWidth, // 6 __private const int srcHeight, __private const int srcChannelC4, __private const int srcChannelC16, __private const int dstHeight, __private const int batchOffset, + __private const int batch, __private const int input_pad_left, __private const int input_pad_right) { int2 pos = (int2)(get_global_id(0), get_global_id(1)); UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); @@ -253,7 +256,7 @@ __kernel void winoTransSrcBuf2_3_1_c4_c16(GLOBAL_SIZE_DIM2 FLOAT4 S23; FLOAT4 S33; - int inp_offset = (((batchOffset * srcChannelC4 + pos.y) * srcHeight + syStart) * srcWidth + sxStart) * 4; + int inp_offset = (((batchOffset + pos.y * batch) * srcHeight + syStart) * srcWidth + sxStart) * 4; { int sx = 0 + sxStart; int sy = 0 + syStart; @@ -417,6 +420,7 @@ __kernel void winoTransDstBuf2_3_1_c16_c4(GLOBAL_SIZE_DIM2 __private const int dstHeight, __private const int dstChannelC4,__private const int dstChannelC16,__private const int srcWidth, __private const int batchOffset, + __private const int batch, __private const int output_pad_left, __private const int output_pad_right) { int2 pos = (int2)(get_global_id(0), get_global_id(1)); UNIFORM_BOUNDRY_CHECK(pos.x, pos.y); @@ -463,7 +467,7 @@ __kernel void winoTransDstBuf2_3_1_c16_c4(GLOBAL_SIZE_DIM2 //NC4HW4 [batch, dstChannelC4, dstHeight, dstWidth] //index: [batchOffset, pos.y, oyStart, oxStart] - int out_offset = (((batchOffset * dstChannelC4+ pos.y) * dstHeight + oyStart) * dstWidth + oxStart)*4; + int out_offset = (((batchOffset+ pos.y * batch) * dstHeight + oyStart) * dstWidth + oxStart)*4; { int ox = oxStart + 0; int oy = oyStart + 0; diff --git a/source/backend/opencl/execution/image/ConvExecution.cpp b/source/backend/opencl/execution/image/ConvExecution.cpp index d5315ffee..d2f6d288a 100644 --- a/source/backend/opencl/execution/image/ConvExecution.cpp +++ b/source/backend/opencl/execution/image/ConvExecution.cpp @@ -491,7 +491,7 @@ class ConvolutionCreator : public OpenCLBackend::Creator { std::vector inputShape = tensorShapeFormat(inputs[0]); const int inputChannels = inputShape.at(3); #if defined(MNN_LOW_MEMORY) && not defined(MNN_OPENCL_BUFFER_CLOSED) - { + if (static_cast(backend)->getMemory() == BackendConfig::Memory_Low){ auto conv2dParams = op->main_as_Convolution2D(); if (conv2dParams->quanParameter() != nullptr) { if (((conv2dParams->quanParameter()->type() == 4) || diff --git a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp 
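The ConvolutionCreator change in ConvExecution.cpp above means the OpenCL image path now takes the quantized low-memory branch only when the backend was actually configured for low memory, not merely because MNN_LOW_MEMORY was compiled in. A hedged usage sketch with the standard session configuration (paths and helper name are illustrative):

    #include <MNN/Interpreter.hpp>

    // Minimal sketch: request Memory_Low so the gated MNN_LOW_MEMORY branch
    // in ConvolutionCreator is selected on the OpenCL backend.
    void createLowMemorySession(MNN::Interpreter* net) {
        MNN::ScheduleConfig config;
        config.type = MNN_FORWARD_OPENCL;
        MNN::BackendConfig backendConfig;
        backendConfig.memory = MNN::BackendConfig::Memory_Low; // enables the gated branch
        config.backendConfig = &backendConfig;
        net->createSession(config);
    }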
b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp index f40f3d644..717bab14a 100644 --- a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp @@ -86,9 +86,6 @@ bool ConvLowMemoryExecution::convertToQuantWeight1x1Buffer(cl::Buffer input, int // int4 case buildOptions.emplace("-DUSE_LOW_BIT_WEIGHT_INT4"); } else {/* More types to be supported. */} - if(mResource->mInputChannel % icPack != 0){ - buildOptions.emplace("-DCHANNEL_LEAVE"); - } mBufferToConv1x1Kernel = runtime->buildKernelWithCache("buffer_convert_quant", kernelName, buildOptions); auto kernel = mBufferToConv1x1Kernel->get(); @@ -495,6 +492,9 @@ ConvLowMemoryExecution::ConvLowMemoryExecution(const std::vector &inpu setGeneralWeightLowMemory(mFilterDataPtr, quanCommon); } // Create Kernel + if (mResource->mStrides[0] == 1 && mResource->mStrides[1] == 1 && mResource->mDilations[0] == 1 && mResource->mDilations[1] == 1) { + mResource->mBuildOptions.emplace("-DMNN_CONV_S1D1"); + } mResource->mBuildOptions.emplace("-DBIAS"); if (conv2dCommonParams->relu()) { mResource->mBuildOptions.emplace("-DRELU"); diff --git a/source/backend/opengl/GLBackend.cpp b/source/backend/opengl/GLBackend.cpp index d0d9ba2c7..c8c460407 100644 --- a/source/backend/opengl/GLBackend.cpp +++ b/source/backend/opengl/GLBackend.cpp @@ -439,7 +439,7 @@ bool GLBackend::isCreateError() const { } -Backend* GLRuntime::onCreate(const BackendConfig* config) const { +Backend* GLRuntime::onCreate(const BackendConfig* config, Backend* origin) const { BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal; BackendConfig::PowerMode power = BackendConfig::Power_Normal; if (nullptr != mInfo.user) { @@ -477,7 +477,7 @@ class GLRuntimeCreator : public RuntimeCreator { public: virtual Runtime *onCreate(const Backend::Info &info) const override { auto rt = new GLRuntime(info); - auto bn = (GLBackend*)(rt->onCreate(nullptr)); + auto bn = (GLBackend*)(rt->onCreate(nullptr, nullptr)); if (bn->isCreateError()) { delete bn; delete rt; diff --git a/source/backend/opengl/GLBackend.hpp b/source/backend/opengl/GLBackend.hpp index b36140258..2c0307faa 100644 --- a/source/backend/opengl/GLBackend.hpp +++ b/source/backend/opengl/GLBackend.hpp @@ -35,7 +35,7 @@ class GLRuntime : public Runtime { @brief create backend @return created backend */ - virtual Backend* onCreate(const BackendConfig* config) const override; + virtual Backend* onCreate(const BackendConfig* config, Backend* origin) const override; /** @brief clear unuseful resource diff --git a/source/backend/tensorrt/backend/TRTBackend.cpp b/source/backend/tensorrt/backend/TRTBackend.cpp index 49d954b10..66fde8932 100755 --- a/source/backend/tensorrt/backend/TRTBackend.cpp +++ b/source/backend/tensorrt/backend/TRTBackend.cpp @@ -54,7 +54,7 @@ TRTRuntime::TRTRuntime(const Backend::Info& info) { TRTRuntime::~TRTRuntime() { } -Backend* TRTRuntime::onCreate(const BackendConfig* config) const { +Backend* TRTRuntime::onCreate(const BackendConfig* config, Backend* origin) const { return new TRTBackend(this); } diff --git a/source/backend/tensorrt/backend/TRTBackend.hpp b/source/backend/tensorrt/backend/TRTBackend.hpp index c7e14fa6c..adde390c2 100644 --- a/source/backend/tensorrt/backend/TRTBackend.hpp +++ b/source/backend/tensorrt/backend/TRTBackend.hpp @@ -34,7 +34,7 @@ class TRTRuntime : public Runtime { TRTRuntime(const Backend::Info& info); virtual ~TRTRuntime(); - virtual Backend* onCreate(const BackendConfig* config) 
const override; + virtual Backend* onCreate(const BackendConfig* config, Backend* origin) const override; virtual void onGabageCollect(int level) override; // If buffer is not nullptr, try copy cache, else delete cache virtual bool onSetCache(const void* buffer, size_t size) override { diff --git a/source/backend/vulkan/component/VulkanPipeline.cpp b/source/backend/vulkan/component/VulkanPipeline.cpp index e0da6bcdd..5b26ca094 100644 --- a/source/backend/vulkan/component/VulkanPipeline.cpp +++ b/source/backend/vulkan/component/VulkanPipeline.cpp @@ -128,7 +128,7 @@ VulkanLayout::DescriptorSet* VulkanPipeline::createSet() const { } void VulkanPipeline::changePipeline(const std::vector& localSize) const{ - VkPipeline pipeline = VK_NULL_HANDLE; + mDevice.destroyPipeline(mPipeline); /*for localSize_x_id = 0,localSize_y_id = 1,localSize_z_id = 2*/ std::vector specializationMapEntry; /*localSize data description*/ std::shared_ptr specializationInfo = std::make_shared(); @@ -145,11 +145,10 @@ void VulkanPipeline::changePipeline(const std::vector& localSize) cons specializationInfo->mapEntryCount = specializationMapEntry.size(); } - auto res = mDevice.createComputePipeline(pipeline, mShader->get(), mLayout->get(), mCache->get(), specializationInfo.get()); + auto res = mDevice.createComputePipeline(mPipeline, mShader->get(), mLayout->get(), mCache->get(), specializationInfo.get()); if (VK_SUCCESS != res) { FUNC_PRINT(1); } - mPipeline = pipeline; } VulkanLayout::DescriptorSet* VulkanLayout::createSet() const { diff --git a/source/backend/vulkan/runtime/VulkanRuntime.cpp b/source/backend/vulkan/runtime/VulkanRuntime.cpp index 795c24f99..191158113 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.cpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.cpp @@ -165,7 +165,7 @@ void VulkanRuntime::onGabageCollect(int level) { mPipelineFactory->reset(); } -Backend* VulkanRuntime::onCreate(const BackendConfig* config) const { +Backend* VulkanRuntime::onCreate(const BackendConfig* config, Backend* origin) const { // FIXME: Use config return new VulkanBackend(this, mInfo); } diff --git a/source/backend/vulkan/runtime/VulkanRuntime.hpp b/source/backend/vulkan/runtime/VulkanRuntime.hpp index c8dfa56ac..3c04c9808 100644 --- a/source/backend/vulkan/runtime/VulkanRuntime.hpp +++ b/source/backend/vulkan/runtime/VulkanRuntime.hpp @@ -26,7 +26,7 @@ class VulkanRuntime : public Runtime { public: virtual ~ VulkanRuntime(); - virtual Backend* onCreate(const BackendConfig* config) const override; + virtual Backend* onCreate(const BackendConfig* config, Backend* origin) const override; enum GPUType { ADRENO = 0, MALI = 1, OTHER = 2 }; virtual void onGabageCollect(int level) override; virtual float onGetMemoryInMB() override; diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp index 0d199bd90..2e0b2548b 100644 --- a/source/core/Backend.hpp +++ b/source/core/Backend.hpp @@ -34,11 +34,12 @@ struct RuntimeHint { int cpuDecreaseRate = 50; int dynamicQuantOption = 0; - // 0: Do not quantize kvcache, just store float - // 1: Only quantize key cache, use int8 asymmetric quantization - // 2: Only quantize value cache, use fp8 quantization - // 3: quantize both key and value cache as described above - int kvcacheQuantOption = 0; + // 0: Do not quantize + // 1: Only quantize key, use int8 asymmetric quantization + // 2: Only quantize value, use fp8 quantization + // 3: quantize both key and value + // 4: quantize query, key and value, and use gemm int8 kernel to compute K*V + int qkvQuantOption = 0; // the kvcache 
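Note the hint rename above: kvcacheQuantOption becomes qkvQuantOption and gains value 4 (quantize query, key and value and use an int8 GEMM for the attention). A hedged sketch of selecting a policy, assuming the usual Interpreter::setSessionHint entry point and the QKV_QUANT_OPTIONS enum added by this patch:

    #include <MNN/Interpreter.hpp>

    // Illustrative only. Value meanings follow the RuntimeHint comment above:
    // 0 = off, 1 = int8 key, 2 = fp8 value, 3 = key and value, 4 = int8 Q/K/V GEMM.
    void enableQKVQuant(MNN::Interpreter* net) {
        net->setSessionHint(MNN::Interpreter::QKV_QUANT_OPTIONS, 3);
    }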
size limit of each layer // if the size of kvcache in memory exceeds the limit @@ -48,6 +49,9 @@ struct RuntimeHint { // path of the kvcache directory std::string kvcacheDirPath = "/tmp"; + + std::string midMemoryPath; + std::string weightMemoryPath; }; /** abstract backend */ class Backend : public NonCopyable { @@ -267,7 +271,7 @@ class Runtime : public NonCopyable { @brief create backend @return created backend */ - virtual Backend* onCreate(const BackendConfig* config = nullptr) const = 0; + virtual Backend* onCreate(const BackendConfig* config = nullptr, Backend* origin = nullptr) const = 0; /** @brief reset runtime diff --git a/source/core/BufferAllocator.cpp b/source/core/BufferAllocator.cpp index 43104da80..676cf97fc 100644 --- a/source/core/BufferAllocator.cpp +++ b/source/core/BufferAllocator.cpp @@ -6,8 +6,10 @@ // Copyright © 2018, Alibaba Group Holding Limited // +#include #include "core/BufferAllocator.hpp" #include "core/Macro.h" +#include "MNNFileUtils.h" // #define DUMP_USAGE //#define MNN_DEBUG_MEMORY @@ -54,6 +56,62 @@ class DefaultAllocator : public BufferAllocator::Allocator { MNNMemoryFreeAlign(chunk.first); } }; +class MmapAllocator : public BufferAllocator::Allocator { +private: + std::map> mCache; + std::string mFileName; + std::string mPosfix; + int mAllocTimes = 0; + bool mRemove; +public: + MmapAllocator(const char* dirName, const char* posfix, bool autoRemove) { + if (nullptr != dirName) { + mFileName = dirName; + if (!MNNDirExist(dirName)) { + MNN_ERROR("%s not exist\n", dirName); + } + } + if (nullptr != posfix) { + mPosfix = posfix; + } + mRemove = autoRemove; + } + virtual ~ MmapAllocator() { + for (auto& iter : mCache) { + MNNUnmapFile(iter.first, std::get<1>(iter.second)); + MNNCloseFile(std::get<0>(iter.second)); + if (mRemove) { + MNNRemoveFile(std::get<2>(iter.second).c_str()); + } + } + } + virtual MemChunk onAlloc(size_t size, size_t align) { + MNN_ASSERT(size > 0); + std::string fileName = MNNFilePathConcat(mFileName, std::to_string(mAllocTimes) + "." 
+ mPosfix); + auto file = MNNCreateFile(fileName.c_str()); + size = UP_DIV(size, align) * align; + MNNSetFileSize(file, size); + void* ptr = MNNMmapFile(file, size); + mCache.insert(std::make_pair(ptr, std::make_tuple(file, size, fileName))); + mAllocTimes++; + return MemChunk(ptr, 0); + } + virtual void onRelease(MemChunk chunk) { + MNN_ASSERT(chunk.second == 0); + auto iter = mCache.find(chunk.first); + if (iter == mCache.end()) { + MNN_ASSERT(false); + MNN_ERROR("Invalid free for MMAPAllocator\n"); + return; + } + MNNUnmapFile(iter->first, std::get<1>(iter->second)); + MNNCloseFile(std::get<0>(iter->second)); + if (mRemove) { + MNNRemoveFile(std::get<2>(iter->second).c_str()); + } + mCache.erase(iter); + } +}; class RecurseAllocator : public BufferAllocator::Allocator { public: RecurseAllocator(BufferAllocator* parent) { @@ -72,14 +130,17 @@ class RecurseAllocator : public BufferAllocator::Allocator { BufferAllocator* mParent; }; -ErrorCode BufferAllocator::compute() { - return NO_ERROR; -} std::shared_ptr BufferAllocator::Allocator::createDefault() { std::shared_ptr _res; _res.reset(new DefaultAllocator); return _res; } +std::shared_ptr BufferAllocator::Allocator::createMmap(const char* dirName, const char* posfix, bool autoRemove) { + std::shared_ptr _res; + _res.reset(new MmapAllocator(dirName, posfix, autoRemove)); + return _res; +} + std::shared_ptr BufferAllocator::Allocator::createRecurse(BufferAllocator* parent) { std::shared_ptr _res; _res.reset(new RecurseAllocator(parent)); @@ -113,23 +174,48 @@ MemChunk EagerBufferAllocator::alloc(size_t size, bool separate, size_t align) { return MemChunk(pointer); } } + auto allocSize = size; + if (mMinAllocSize != 0) { + allocSize = ALIMAX(mMinAllocSize, size); + } // alloc otherwise - auto chunk = mAllocator->onAlloc(size, align); + auto chunk = mAllocator->onAlloc(allocSize, align); pointer.first = chunk.first; pointer.second = chunk.second; if (nullptr == pointer.first) { return chunk; } - mTotalSize += size; + mTotalSize += allocSize; // save node SharedPtr node(new Node); - node->size = size; + node->size = allocSize; node->pointer = pointer; - mUsedList[pointer] = node; node->outside = mAllocator.get(); MNN_ASSERT(pointer.second % align == 0); + if (allocSize > size) { + // Split + SharedPtr first(new Node); + first->parent = node; + first->size = size; + first->pointer = pointer; + mUsedList.insert(std::make_pair(pointer, first)); + node->useCount = 1; + + SharedPtr second(new Node); + second->parent = node; + second->size = allocSize - size; + second->pointer.first = pointer.first; + second->pointer.second = pointer.second + size; + if (nullptr != mCurrentFreeList) { + mCurrentFreeList->insert(std::make_pair(second->size, second)); + } else { + mFreeList.insert(std::make_pair(second->size, second)); + } + } else { + mUsedList[pointer] = node; + } #ifdef DUMP_USAGE MNN_PRINT("mTotalSize: %f\n", mTotalSize / 1024.0f / 1024.0f); #endif @@ -290,13 +376,40 @@ std::pair EagerBufferAllocator::getFromFreeList(FREELIST* list, s static void _CPUMemChunkApplyToTensor(uint8_t* ptr, size_t offset, Tensor* t) { t->buffer().host = ptr + offset; } +SingleBufferWithAllocator::~ SingleBufferWithAllocator() { + release(); +} +void SingleBufferWithAllocator::release() { + if (current.first != nullptr) { + root->onRelease(current); + current.first = nullptr; + current.second = 0; + currentSize = 0; + } +} + +ErrorCode SingleBufferWithAllocator::realloc(size_t size, size_t align) { + if (currentSize < size) { + if (nullptr != current.first) { + 
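The MmapAllocator above backs each allocation with a file in the configured directory via the MNNFileUtils helpers (create, resize, map, then unmap/close/remove on release). For readers unfamiliar with the pattern, an equivalent plain-POSIX sketch (not the MNN helpers, error handling trimmed):

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstddef>
    #include <string>

    // One file-backed allocation: create a file of the requested size and map it,
    // so the "memory" can be paged out to disk instead of living in anonymous RAM.
    void* allocFileBacked(const std::string& path, size_t size, int* fdOut) {
        int fd = ::open(path.c_str(), O_RDWR | O_CREAT, 0600);
        ::ftruncate(fd, (off_t)size);                               // MNNSetFileSize
        void* ptr = ::mmap(nullptr, size, PROT_READ | PROT_WRITE,
                           MAP_SHARED, fd, 0);                      // MNNMmapFile
        *fdOut = fd;
        return ptr;                                                 // MAP_FAILED on error
    }

    void releaseFileBacked(void* ptr, size_t size, int fd, const std::string& path, bool remove) {
        ::munmap(ptr, size);                                        // MNNUnmapFile
        ::close(fd);                                                // MNNCloseFile
        if (remove) {
            ::unlink(path.c_str());                                 // MNNRemoveFile
        }
    }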
root->onRelease(current); + } + current = root->onAlloc(size, align); + if (current.first == nullptr) { + return OUT_OF_MEMORY; + } + currentSize = size; + } + return NO_ERROR; +} + -DeferBufferAllocator::DeferBufferAllocator(std::shared_ptr parent, size_t align, MemChunkApplyToTensor func) : mAllocator(parent), mAlign(align) { +DeferBufferAllocator::DeferBufferAllocator(SingleBufferWithAllocator* root, size_t align, MemChunkApplyToTensor func) : mAlign(align) { if (nullptr == func) { mApplyFunction = _CPUMemChunkApplyToTensor; } else { mApplyFunction = func; } + mParent = root; } //------------------------------- DeferBufferAllocator -----------------------------------// @@ -371,10 +484,6 @@ void DeferBufferAllocator::release(bool allRelease) { } } -size_t DeferBufferAllocator::totalSize() const { - return mTotalSize; -} - void DeferBufferAllocator::barrierBegin() { MNN_ASSERT(!mBarrrier); mBarrrier = true; @@ -398,12 +507,8 @@ void DeferBufferAllocator::reset() { mTotalSize = 0; mChunks.clear(); mFreeList.clear(); - // mPtr.reset(nullptr); - if (mPtr.ptr()) { - mAllocator->onRelease(mPtr); - mPtr.first = nullptr; - mPtr.second = 0; - } + mPtr.first = nullptr; + mPtr.second = 0; mHead = nullptr; mTail = nullptr; mBarrrier = false; @@ -411,7 +516,7 @@ void DeferBufferAllocator::reset() { } ErrorCode DeferBufferAllocator::compute() { - if (mPtr.ptr()) { + if (mTotalSize > 0) { return NO_ERROR; } mTotalSize = 0; @@ -431,10 +536,28 @@ ErrorCode DeferBufferAllocator::compute() { mTotalSize += chunk->size; chunk = chunk->right; } - mPtr = mAllocator->onAlloc(mTotalSize, mAlign); - if (mPtr.ptr() == nullptr) { - return OUT_OF_MEMORY; + return apply(); +} +ErrorCode DeferBufferAllocator::apply() { + if (mFreeList.empty()) { + // Not alloc + return NO_ERROR; + } + auto& chunk = mParent->current; + bool needApply = false; + if (mParent->currentSize < mTotalSize) { + needApply = true; + auto code = mParent->realloc(mTotalSize, mAlign); + if (NO_ERROR != code) { + return code; + } + } else if (mPtr.first != chunk.first || mPtr.second != chunk.second) { + needApply = true; + } + if (!needApply) { + return NO_ERROR; } + mPtr = chunk; for (auto& chunk : mChunks) { chunk->base = mPtr.ptr(); for (auto t : chunk->tensors) { @@ -474,7 +597,7 @@ void DeferBufferAllocator::erase_node(MemNode* chunk) { } if (right) { right->left = nullptr; - mTail = right; + mHead = right; return; } mHead = mTail = nullptr; diff --git a/source/core/BufferAllocator.hpp b/source/core/BufferAllocator.hpp index 78e0b8ee7..b5c406a4f 100644 --- a/source/core/BufferAllocator.hpp +++ b/source/core/BufferAllocator.hpp @@ -85,6 +85,7 @@ class MNN_PUBLIC BufferAllocator : public NonCopyable { virtual MemChunk onAlloc(size_t size, size_t align) = 0; virtual void onRelease(MemChunk chunk) = 0; static std::shared_ptr createDefault(); + static std::shared_ptr createMmap(const char* dirName, const char* posfix, bool autoRemove = true); static std::shared_ptr createRecurse(BufferAllocator* parent); }; BufferAllocator() = default; @@ -92,13 +93,22 @@ class MNN_PUBLIC BufferAllocator : public NonCopyable { virtual MemChunk alloc(size_t size, bool separate = false, size_t align = 0) = 0; virtual bool free(MemChunk chunk) = 0; virtual void release(bool allRelease = true) = 0; - virtual size_t totalSize() const = 0; + size_t totalSize() const { + return mTotalSize; + } virtual void barrierBegin() {} virtual void barrierEnd() {} virtual void beginGroup() {} virtual void endGroup() {} virtual void reset() {} - virtual ErrorCode compute(); + virtual 
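With this change DeferBufferAllocator plans offsets into a buffer owned by a shared SingleBufferWithAllocator: compute() only sums the chunk sizes, and apply() re-points tensors at the shared base only when the backing buffer had to grow or its address changed. A reduced sketch of that reuse rule (illustrative names, not the MNN classes; the arena holds no data that must survive growth):

    #include <cstdlib>

    struct SharedArena {
        void*  ptr  = nullptr;
        size_t size = 0;
        bool grow(size_t need) {                 // true if the address may have changed
            if (need <= size) return false;
            std::free(ptr);
            ptr  = std::malloc(need);            // MNN routes this through its Allocator (malloc or mmap)
            size = need;
            return true;
        }
    };

    // Returns true when tensors must be re-pointed at arena.ptr + offset.
    bool applyPlan(SharedArena& arena, void*& cachedBase, size_t planned) {
        bool changed = arena.grow(planned);
        if (!changed && cachedBase == arena.ptr) {
            return false;                        // same buffer, same layout: nothing to do
        }
        cachedBase = arena.ptr;
        return true;
    }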
ErrorCode compute() { + return NO_ERROR; + } + virtual ErrorCode apply() { + return NO_ERROR; + } +protected: + size_t mTotalSize = 0; }; @@ -108,7 +118,7 @@ class MNN_PUBLIC EagerBufferAllocator : public BufferAllocator { * @brief init buffer allocator with pointer alignment. * @param align given pointer alignment. */ - EagerBufferAllocator(std::shared_ptr parent, size_t align = MNN_MEMORY_ALIGN_DEFAULT) : mAllocator(parent), mAlign(align) { + EagerBufferAllocator(std::shared_ptr parent, size_t align = MNN_MEMORY_ALIGN_DEFAULT, size_t minAllocSize = 0) : mAllocator(parent), mAlign(align), mMinAllocSize(minAllocSize) { // nothing to do } /** @@ -145,14 +155,6 @@ class MNN_PUBLIC EagerBufferAllocator : public BufferAllocator { */ void release(bool allRelease = true) override; - /** - * @brief query total size allocated indeed. - * @return total size allocated indeed. - */ - size_t totalSize() const override { - return mTotalSize; - } - /* For multi thread case, we must assume that the memory use by different thread don't conflict @@ -184,40 +186,47 @@ class MNN_PUBLIC EagerBufferAllocator : public BufferAllocator { std::map, SharedPtr> mUsedList; FREELIST mFreeList; - size_t mTotalSize = 0; FREELIST* mCurrentFreeList = nullptr; std::vector> mGroups; std::shared_ptr mAllocator; size_t mAlign; + size_t mMinAllocSize = 0; }; typedef void(*MemChunkApplyToTensor)(uint8_t* ptr, size_t offset, Tensor* tensor); +class MNN_PUBLIC SingleBufferWithAllocator { +public: + ~ SingleBufferWithAllocator(); + ErrorCode realloc(size_t size, size_t align); + void release(); + std::shared_ptr root; + MemChunk current; + size_t currentSize = 0; +}; class MNN_PUBLIC DeferBufferAllocator : public BufferAllocator { public: - DeferBufferAllocator(std::shared_ptr parent, size_t align = MNN_MEMORY_ALIGN_DEFAULT, MemChunkApplyToTensor func = nullptr); - ~DeferBufferAllocator() { - reset(); + DeferBufferAllocator(SingleBufferWithAllocator* parent, size_t align = MNN_MEMORY_ALIGN_DEFAULT, MemChunkApplyToTensor func = nullptr); + virtual ~DeferBufferAllocator() { + // Donothing } public: MemChunk alloc(size_t size, bool separate = false, size_t align = 0) override; bool free(MemChunk chunk) override; void release(bool allRelease = true) override; - size_t totalSize() const override; void barrierBegin() override; void barrierEnd() override; void beginGroup() override; void endGroup() override; void reset() override; ErrorCode compute() override; + ErrorCode apply() override; private: std::vector> mChunks; MemNode *mHead = nullptr, *mTail = nullptr; std::multiset mFreeList; // std::unique_ptr mPtr; MemChunk mPtr; - size_t mTotalSize = 0; - std::shared_ptr mAllocator; size_t mAlign; // barrier bool mBarrrier = false; @@ -231,6 +240,7 @@ class MNN_PUBLIC DeferBufferAllocator : public BufferAllocator { void eraseFree(MemNode* chunk); void visiChildren(MemNode* chunk); MemChunkApplyToTensor mApplyFunction; + SingleBufferWithAllocator* mParent; }; } // namespace MNN #endif diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp index 8c5596312..4a62fb4db 100644 --- a/source/core/OpCommonUtils.cpp +++ b/source/core/OpCommonUtils.cpp @@ -647,9 +647,13 @@ static bool _RebuildExternalOp(FileLoader* external, const MNN::Op* origin, flat external->offset(param->external[0] + param->external[1] + param->external[2]); } if (param->bias.empty() && param->external.size() > 3) { - param->bias.resize(param->external[3]/sizeof(float)); - external->read((char*)param->bias.data(), param->external[3]); - } + if 
(param->external[3] > 0) { + param->bias.resize(param->external[3]/sizeof(float)); + external->read((char*)param->bias.data(), param->external[3]); + } else { + param->bias.resize(param->common->outputCount); + } + } if (param->quanParameter->index.empty() && param->external.size() > 4) { param->quanParameter->index.resize(param->external[4]/sizeof(uint32_t)); external->read((char*)param->quanParameter->index.data(), param->external[4]); diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index 30266df05..0108aa5e6 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -269,6 +269,8 @@ ErrorCode Pipeline::encode(bool supportDebug, bool permitCodegen) { } } else { #ifndef MNN_BUILD_MINI + mBackend->onClearBuffer(); + mBackupBackend->onClearBuffer(); mContext.clear(); mContext.mNeedRelease = mGeometryNeedRelease; FileLoader l(mExternalFile.c_str()); @@ -897,6 +899,10 @@ ErrorCode Pipeline::fixResizeCache() { info.cacheBuffer.extras.clear(); } } + mInfo.first.cache.first->onResizeBegin(); + mInfo.first.cache.first->onResizeEnd(); + mInfo.first.cache.second->onResizeBegin(); + mInfo.first.cache.second->onResizeEnd(); auto res = mInfo.first.cache.first->onSelectDynamicAllocator(1, 2); res = res && mInfo.first.cache.second->onSelectDynamicAllocator(1, 2); if (!res) { @@ -1094,8 +1100,6 @@ ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) { /* Create Execution Begin */ auto& mBackend = mInfo.first.cache.first; auto& mBackupBackend = mInfo.first.cache.second; - mBackend->onClearBuffer(); - mBackupBackend->onClearBuffer(); // Check If we need a lone time for init if (mBackend->type() != MNN_FORWARD_CPU && mBackend->type() != MNN_FORWARD_CPU_EXTENSION && mTuneAttr.autoSetOpType) { Runtime::OpInfo dstInfo; @@ -1144,10 +1148,12 @@ ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) { } } /* Create Execution End */ - + mBackend->onClearBuffer(); + mBackupBackend->onClearBuffer(); _SetTensorBackend(mInfo, mAllocInput); // Insert Wrap If needed { + // Reset memory allocator for backend auto insertCode = _InsertCopy(mInfo, mCacheConstTensors, mWrapTensors, mAllocInput, forbidReplace); if (NO_ERROR != insertCode) { return insertCode; diff --git a/source/core/Session.cpp b/source/core/Session.cpp index a424898ba..48148ab28 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -18,10 +18,8 @@ #include "core/TensorUtils.hpp" #include "utils/InitNet.hpp" -using namespace std; - namespace MNN { -static void _createPipelineBackend(Schedule::PipelineInfo& iter, RuntimeInfo& runtime) { +void Session::createPipelineBackend(Schedule::PipelineInfo& iter, RuntimeInfo& runtime) { if (iter.first.cache.first != nullptr) { return; } @@ -41,7 +39,16 @@ static void _createPipelineBackend(Schedule::PipelineInfo& iter, RuntimeInfo& ru // We need create a new backend to do size compute / not support op compute BackendConfig defaultConfig; defaultConfig.flags = 4; - iter.first.cache.second.reset(cpuRuntime->onCreate(&defaultConfig)); + if (iter.first.info.user != nullptr) { + // Don't change default Precision + defaultConfig.memory = iter.first.info.user->memory; + defaultConfig.power = iter.first.info.user->power; + } + Backend* origin = nullptr; + if (cpuRuntime.get() == rt) { + origin = iter.first.cache.first.get(); + } + iter.first.cache.second.reset(cpuRuntime->onCreate(&defaultConfig, origin)); } } void Session::ModeGroup::setMode(Interpreter::SessionMode mode) { @@ -84,8 +91,8 @@ void Session::ModeGroup::setHint(Interpreter::HintMode 
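As the Backend.hpp and Session::createPipelineBackend changes in this patch show, Runtime::onCreate now takes an optional origin backend: when the backup backend comes from the same CPU runtime, it receives the already-created primary backend so resources can be shared. Out-of-tree runtimes only need to accept the extra parameter; a hedged, fragmentary sketch (class name illustrative, other required overrides omitted):

    #include <MNN/MNNForwardType.h>
    #include "core/Backend.hpp"

    class MyRuntime : public MNN::Runtime {
    public:
        virtual MNN::Backend* onCreate(const MNN::BackendConfig* config = nullptr,
                                       MNN::Backend* origin = nullptr) const override {
            // "origin" may be null, or may point to an existing backend (e.g. the CPU
            // backend used for shape inference) that the new backend can borrow from.
            (void)origin;                // this sketch ignores it, as the TRT/Vulkan runtimes above do
            return nullptr;              // real code returns a Backend built from config
        }
        virtual void onGabageCollect(int level) override { (void)level; }
    };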
mode, int hint) { case Interpreter::DYNAMIC_QUANT_OPTIONS: runtimeHint.dynamicQuantOption = hint; break; - case Interpreter::KVCACHE_QUANT_OPTIONS: - runtimeHint.kvcacheQuantOption = hint; + case Interpreter::QKV_QUANT_OPTIONS: + runtimeHint.qkvQuantOption = hint; break; case Interpreter::KVCACHE_SIZE_LIMIT: runtimeHint.kvcacheSizeLimit = hint; @@ -100,6 +107,12 @@ void Session::ModeGroup::setExternalPath(std::string path, int type) { case MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR: runtimeHint.kvcacheDirPath = path; break; + case MNN::Interpreter::EXTERNAL_FEATUREMAP_DIR: + runtimeHint.midMemoryPath = path; + break; + case MNN::Interpreter::EXTERNAL_WEIGHT_DIR: + runtimeHint.weightMemoryPath = path; + break; default: break; } @@ -114,7 +127,7 @@ Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeIn } mInfo = std::move(info); for (auto& iter : mInfo.pipelineInfo) { - _createPipelineBackend(iter, mRuntime); + createPipelineBackend(iter, mRuntime); Pipeline::TuningAttr attr; attr.maxTuningNumber = mode.maxTuningNumber; attr.autoSetOpType = mode.backendMode == Interpreter::Session_Backend_Auto; @@ -473,7 +486,7 @@ Session* Session::clone(RuntimeInfo&& runtime, std::shared_ptr sharedConst); + static void createPipelineBackend(Schedule::PipelineInfo& iter, RuntimeInfo& runtime); + public: /** * @brief infer. diff --git a/source/cv/ImageProcess.cpp b/source/cv/ImageProcess.cpp index 7d57a7200..d6592e1d8 100644 --- a/source/cv/ImageProcess.cpp +++ b/source/cv/ImageProcess.cpp @@ -28,7 +28,6 @@ #include "backend/cpu/x86_x64/cpu_id.h" #endif -#define CACHE_SIZE 256 namespace MNN { void registerBackend(); diff --git a/source/cv/ImageProcessUtils.cpp b/source/cv/ImageProcessUtils.cpp index c8662ed36..daf8ffd9d 100644 --- a/source/cv/ImageProcessUtils.cpp +++ b/source/cv/ImageProcessUtils.cpp @@ -28,7 +28,6 @@ #include "backend/cpu/x86_x64/cpu_id.h" #endif -#define CACHE_SIZE 256 namespace MNN { using namespace CV; #define CHECKFORMAT(src, dst, func) if (source == src && dest == dst) return func @@ -240,9 +239,14 @@ ErrorCode ImageProcessUtils::selectImageProcer(bool identity, bool hasBackend, b return NO_ERROR; } // Choose sampler. - mInside->mSampler = choose(mInside->config.sourceFormat, mInside->config.filterType, identity); - if (nullptr == mInside->mSampler) { - return INPUT_DATA_ERROR; + if (false == identity || mInside->config.sourceFormat == YUV_NV12 || mInside->config.sourceFormat == YUV_NV21 || mInside->config.sourceFormat == YUV_I420) { + mInside->mSampler = choose(mInside->config.sourceFormat, mInside->config.filterType, identity); + if (nullptr == mInside->mSampler) { + MNN_ERROR("Do not support resize convert.\n"); + return INPUT_DATA_ERROR; + } + } else { + mInside->mSampler = nullptr; } // Choose blitter. if ((ImageFormatType)mInside->config.sourceFormat != (ImageFormatType)mInside->config.destFormat) { @@ -366,11 +370,17 @@ static std::pair _computeClip(CV::Point* points, int iw, int ih, const return std::make_pair(sta, end); } +static inline float __clamp(float v, float minV, float maxV) { + return std::max(std::min(v, maxV), minV); +} + ErrorCode ImageProcessUtils::transformImage(const uint8_t* source, uint8_t* dst, uint8_t* samplerDest, uint8_t* blitDest, int tileCount, int destBytes, const int32_t* regions) { CV::Point points[2]; if (mInside->mStride == 0) { mInside->mStride = mInside->iw * mInside->ic; } + float xMax = mInside->iw - 1; + float yMax = mInside->ih - 1; for (int i = 0; i < mInside->oh; ++i) { int dy = mInside->mDraw ? 
regions[3 * i] : i; auto dstY = (uint8_t*)dst + dy * destBytes * mInside->ow * mInside->oc; @@ -390,7 +400,9 @@ ErrorCode ImageProcessUtils::transformImage(const uint8_t* source, uint8_t* dst, samplerDest = blitDest; } + const uint8_t* blitSrc = samplerDest; // For draw // Sample + const uint8_t* sourcePos = nullptr; // for sampler is null. if (!mInside->mDraw) { // Compute position points[0].fX = xStart; @@ -432,16 +444,28 @@ ErrorCode ImageProcessUtils::transformImage(const uint8_t* source, uint8_t* dst, } points[1].fX = (deltaX) / (float)(count); points[1].fY = (deltaY) / (float)(count); - - mInside->mSampler(source, samplerDest, points, sta, end - sta, count, mInside->iw, mInside->ih, mInside->mStride); + + if (mInside->mSampler) { + mInside->mSampler(source, samplerDest, points, sta, end - sta, count, mInside->iw, mInside->ih, mInside->mStride); + blitSrc = samplerDest; + } else { + int y = (int)roundf(__clamp(points[0].fY, 0, yMax)); + int x = (int)roundf(__clamp(points[0].fX, 0, xMax)); + sourcePos = source + (y * mInside->mStride + mInside->ic* x); + blitSrc = sourcePos; // update blitSrc when not draw. + } } // Convert format if (mInside->mBlitter) { - mInside->mBlitter(samplerDest, blitDest, count); + mInside->mBlitter(blitSrc, blitDest, count); } // Turn float if (mInside->mBlitFloat) { - mInside->mBlitFloat(blitDest, (float*)dstStart, mInside->config.mean, mInside->config.normal, count); + if (mInside->mSampler) { + mInside->mBlitFloat(blitDest, (float*)dstStart, mInside->config.mean, mInside->config.normal, count); + } else { + mInside->mBlitFloat(sourcePos, (float*)dstStart, mInside->config.mean, mInside->config.normal, count); + } } } } @@ -493,10 +517,10 @@ static CV::ImageFormat _correctImageFormat(int outputBpp, halide_type_t type, CV } ErrorCode ImageProcessUtils::execFunc(const uint8_t *source, int stride, void *dest) { - uint8_t sampleDest[4 * 256]; - uint8_t blitDest[4 * 256]; + uint8_t sampleDest[4 * CACHE_SIZE]; + uint8_t blitDest[4 * CACHE_SIZE]; int destBytes = mInside->mDtype.bytes(); - int tileCount = UP_DIV(mInside->ow, 256); + int tileCount = UP_DIV(mInside->ow, CACHE_SIZE); if (mInside->mDraw) { tileCount = 1; } @@ -512,7 +536,7 @@ void ImageProcessUtils::setDraw() { } void ImageProcessUtils::draw(uint8_t* img, int w, int h, int c, const int* regions, int num, uint8_t* color) { - uint8_t blitDest[4 * 256]; + uint8_t blitDest[4 * CACHE_SIZE]; int destBytes = mInside->mDtype.bytes(); mInside->oh = num; transformImage(img, img, color, blitDest, 1, destBytes, regions); diff --git a/source/cv/ImageProcessUtils.hpp b/source/cv/ImageProcessUtils.hpp index e8e901bd3..baced1bc8 100644 --- a/source/cv/ImageProcessUtils.hpp +++ b/source/cv/ImageProcessUtils.hpp @@ -15,6 +15,7 @@ #include "backend/cpu/compute/CommonOptFunction.h" +#define CACHE_SIZE 512 namespace MNN { typedef void (*BLITTER)(const unsigned char* source, unsigned char* dest, size_t count); typedef void (*BLIT_FLOAT)(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count); diff --git a/source/geometry/GeometryComputerUtils.cpp b/source/geometry/GeometryComputerUtils.cpp index 207d29e5d..f2aae2a6d 100644 --- a/source/geometry/GeometryComputerUtils.cpp +++ b/source/geometry/GeometryComputerUtils.cpp @@ -265,14 +265,20 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( auto& c = *cp; std::shared_ptr tmpStorge; if (nullptr == c.execution) { - auto exe = OpCommonUtils::createExecutionWithExternal(backupBackend.get(), c.inputs, c.outputs, c.op, 
external, tmpStorge); - c.execution.reset(exe); + auto opIter = info.executionCache.find(c.op); + if (opIter != info.executionCache.end()) { + c.execution = opIter->second; + } else { + auto exe = OpCommonUtils::createExecutionWithExternal(backupBackend.get(), c.inputs, c.outputs, c.op, external, tmpStorge); + c.execution.reset(exe); + } } auto exe = c.execution; if (nullptr == exe.get()) { MNN_ERROR("Const Folder Error for %s\n", info.op->name()->c_str()); return NO_EXECUTION; } + backupBackend->onResizeBegin(); for (auto t : c.outputs) { auto des = TensorUtils::getDescribeOrigin(t); TensorUtils::setLinearLayout(t); @@ -282,7 +288,6 @@ ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform( } des->setBackend(backupBackend.get()); } - backupBackend->onResizeBegin(); auto code = exe->onResize(c.inputs, c.outputs); if (NO_ERROR != code) { return NOT_SUPPORT; diff --git a/test.sh b/test.sh index 81ef7c647..52bd6c6d3 100755 --- a/test.sh +++ b/test.sh @@ -175,6 +175,7 @@ android_static_build() { -DMNN_OPENCL=true \ -DMNN_SUPPORT_BF16=true \ -DMNN_OPENCL=true -DMNN_ARM82=true \ + -DMNN_SUPPORT_TRANSFORMER_FUSE=ON \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. $1 $2 $3 make -j16 android_build_wrong=$[$? > 0] @@ -205,7 +206,8 @@ android_static_build() { -DMNN_OPENCL=true \ -DMNN_BUILD_MINI=true \ -DMNN_SUPPORT_BF16=true \ - -DMNN_OPENCL=true\ + -DMNN_OPENCL=true \ + -DMNN_SUPPORT_TRANSFORMER_FUSE=ON \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. make -j16 android_build_wrong=$[$? > 0] @@ -249,6 +251,7 @@ linux_build() { -DMNN_BUILD_OPENCV=ON \ -DMNN_LOW_MEMORY=ON \ -DMNN_IMGCODECS=ON \ + -DMNN_SUPPORT_TRANSFORMER_FUSE=ON \ -DMNN_ENABLE_COVERAGE=$COVERAGE make -j16 @@ -477,33 +480,34 @@ coverage_report() { # # ############################################################################################# android_unit_test() { - adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out all 0 0 1 $1" + memory_mode=$2 + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out all 0 0 1 $1 $memory_mode" if [ $? -ne 0 ]; then echo '### Android单元测试失败,测试终止!' failed fi - adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op 0 0 4 multi$1" + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op 0 0 4 multi$1 $memory_mode" if [ $? -ne 0 ]; then echo '### Android单元测试多线程失败,测试终止!' failed fi - adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/convolution 0 2 4 fp16multi$1" + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/convolution 0 2 4 fp16multi$1 $memory_mode" if [ $? -ne 0 ]; then echo '### Android单元测试卷积FP16多线程失败,测试终止!' failed fi - adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/col2im 0 2 4 fp16col2im$1" + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/col2im 0 2 4 fp16col2im$1 $memory_mode" if [ $? -ne 0 ]; then echo '### Android单元测试FP16-col2im多线程失败,测试终止!' failed fi - adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/R 0 2 4 fp16roipooling$1" + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/R 0 2 4 fp16roipooling$1 $memory_mode" if [ $? -ne 0 ]; then echo '### Android单元测试FP16-roipooling多线程失败,测试终止!' 
failed fi if [ "$OPENCL_CHANGE" ]; then - adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op 3 1 4 $1" + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op 3 1 4 $1 $memory_mode" if [ $? -ne 0 ]; then echo '### Android单元测试OpenCL失败,测试终止!' failed @@ -592,25 +596,58 @@ android_model_test() { fi fi } -android_unit_test_low_memory() { +android_unit_test_low_memory_armv8() { adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 1 1 $1 2" if [ $? -ne 0 ]; then - echo '### Android 64位Low Memory, precision=1 单元测试失败,测试终止!' + echo '### Android 64位Low Memory,动态量化, precision=1, thread=1 单元测试失败,测试终止!' failed fi adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 2 1 $1 2" if [ $? -ne 0 ]; then - echo '### Android 64位Low Memory, precision=2 单元测试失败,测试终止!' + echo '### Android 64位Low Memory,动态量化, precision=2, thread=1 单元测试失败,测试终止!' + failed + fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 1 4 $1 2" + if [ $? -ne 0 ]; then + echo '### Android 64位Low Memory,动态量化, precision=1, thread=4 单元测试失败,测试终止!' + failed + fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 2 4 $1 2" + if [ $? -ne 0 ]; then + echo '### Android 64位Low Memory,动态量化, precision=2, thread=4 单元测试失败,测试终止!' failed fi adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 1 1 $1" if [ $? -ne 0 ]; then - echo '### Android 64位 权值量化调用1x1Strassen, precision=1 单元测试失败,测试终止!' + echo '### Android 64位Low Memory 权重反量化, precision=1 单元测试失败,测试终止!' failed fi adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 2 1 $1" if [ $? -ne 0 ]; then - echo '### Android 64位 权值量化调用1x1Strassen, precision=2 单元测试失败,测试终止!' + echo '### Android 64位Low Memory 权重反量化, precision=2 单元测试失败,测试终止!' + failed + fi +} + +android_unit_test_low_memory_armv7() { + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 1 1 $1 2" + if [ $? -ne 0 ]; then + echo '### Android 32位Low Memory,动态量化, precision=1, thread=1 单元测试失败,测试终止!' + failed + fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 2 1 $1 2" + if [ $? -ne 0 ]; then + echo '### Android 32位Low Memory,动态量化, precision=2, thread=1 单元测试失败,测试终止!' + failed + fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 1 4 $1 2" + if [ $? -ne 0 ]; then + echo '### Android 32位Low Memory,动态量化, precision=1, thread=4 单元测试失败,测试终止!' + failed + fi + adb shell "cd /data/local/tmp/MNN&&export LD_LIBRARY_PATH=.&&./run_test.out op/lowMemory 0 2 4 $1 2" + if [ $? -ne 0 ]; then + echo '### Android 32位Low Memory,动态量化, precision=2, thread=4 单元测试失败,测试终止!' failed fi } @@ -620,7 +657,7 @@ android_test() { # 1. build Android32 mkdir build_32 pushd build_32 - ../build_32.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_OPENCL=true + ../build_32.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_OPENCL=true -DMNN_LOW_MEMORY=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON android32_build_wrong=$[$? > 0] mnn32_size=$(ls -lh libMNN.so | awk '{print $5}') expr32_size=$(ls -lh libMNN_Express.so | awk '{print $5}') @@ -631,14 +668,15 @@ android_test() { failed fi ../updateTest.sh - android_unit_test 32 + android_unit_test 32bit 1 + android_unit_test_low_memory_armv7 32bit android_model_test 32 popd # 3. 
build Android64 mkdir build_64 pushd build_64 - ../build_64.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_ARM82=true -DMNN_OPENCL=true -DMNN_LOW_MEMORY=true + ../build_64.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_ARM82=true -DMNN_OPENCL=true -DMNN_LOW_MEMORY=true -DMNN_SUPPORT_TRANSFORMER_FUSE=ON android64_build_wrong=$[$? > 0] mnn64_size=$(ls -lh libMNN.so | awk '{print $5}') expr64_size=$(ls -lh libMNN_Express.so | awk '{print $5}') @@ -651,8 +689,8 @@ android_test() { # 4. test Android64 ../updateTest.sh - android_unit_test 64 - android_unit_test_low_memory 64 + android_unit_test 64 0 + android_unit_test_low_memory_armv8 64 android_model_test 64 popd diff --git a/test/MNNTestSuite.cpp b/test/MNNTestSuite.cpp index f37f1c038..804b52543 100644 --- a/test/MNNTestSuite.cpp +++ b/test/MNNTestSuite.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include "MNNTestSuite.h" MNNTestSuite* MNNTestSuite::gInstance = NULL; @@ -39,7 +40,7 @@ static void printTestResult(int wrong, int right, const char* flag) { int MNNTestSuite::run(const char* key, int precision, const char* flag) { if (key == NULL || strlen(key) == 0) return 0; - std::map runTimes; + std::vector> runTimes; auto suite = MNNTestSuite::get(); std::string prefix = key; std::vector wrongs; @@ -51,12 +52,15 @@ int MNNTestSuite::run(const char* key, int precision, const char* flag) { MNN_PRINT("\trunning %s.\n", test->name.c_str()); MNN::Timer _t; auto res = test->run(precision); - runTimes.insert(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); + runTimes.emplace_back(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); if (!res) { wrongs.emplace_back(test->name); } } } + std::sort(runTimes.begin(), runTimes.end(), [](const std::pair& left, const std::pair& right) { + return left.second < right.second; + }); for (auto& iter : runTimes) { MNN_PRINT("%s cost time: %.3f ms\n", iter.first.c_str(), iter.second); } @@ -73,7 +77,7 @@ int MNNTestSuite::run(const char* key, int precision, const char* flag) { int MNNTestSuite::runAll(int precision, const char* flag) { auto suite = MNNTestSuite::get(); std::vector wrongs; - std::map runTimes; + std::vector> runTimes; for (int i = 0; i < suite->mTests.size(); ++i) { MNNTestCase* test = suite->mTests[i]; if (test->name.find("speed") != std::string::npos) { @@ -87,11 +91,14 @@ int MNNTestSuite::runAll(int precision, const char* flag) { MNN_PRINT("\trunning %s.\n", test->name.c_str()); MNN::Timer _t; auto res = test->run(precision); - runTimes.insert(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); + runTimes.emplace_back(std::make_pair(test->name, _t.durationInUs() / 1000.0f)); if (!res) { wrongs.emplace_back(test->name); } } + std::sort(runTimes.begin(), runTimes.end(), [](const std::pair& left, const std::pair& right) { + return left.second < right.second; + }); for (auto& iter : runTimes) { MNN_PRINT("%s cost time: %.3f ms\n", iter.first.c_str(), iter.second); } diff --git a/test/core/BufferAllocatorTest.cpp b/test/core/BufferAllocatorTest.cpp index 968d83646..423b5e011 100644 --- a/test/core/BufferAllocatorTest.cpp +++ b/test/core/BufferAllocatorTest.cpp @@ -30,7 +30,9 @@ class BufferAllocatorTest : public MNNTestCase { printf("BufferAllocator total size : %lu B, %f M\n", allocator.totalSize(), allocator.totalSize() / 1024.f / 1024.f); } static void defer_allocator_test(const std::vector& seqs) { - DeferBufferAllocator allocator(BufferAllocator::Allocator::createDefault()); + SingleBufferWithAllocator root; + root.root 
= BufferAllocator::Allocator::createDefault(); + DeferBufferAllocator allocator(&root); std::vector allocs; int usage_num = 0; for (int i = 0; i < seqs.size(); i++) { diff --git a/test/cv/ImageProcessTest.cpp b/test/cv/ImageProcessTest.cpp index 7f689fac5..959b7d5a6 100644 --- a/test/cv/ImageProcessTest.cpp +++ b/test/cv/ImageProcessTest.cpp @@ -11,9 +11,12 @@ #include #include #include "MNNTestSuite.h" +#include +#include using namespace MNN; using namespace MNN::CV; +using namespace MNN::Express; static std::vector genSourceData(int h, int w, int bpp) { std::vector source(h * w * bpp); @@ -148,7 +151,7 @@ class ImageProcessGrayToGrayBilinearTransformTest : public MNNTestCase { ImageProcess::Config config; config.sourceFormat = GRAY; config.destFormat = GRAY; - config.filterType = BILINEAR; + config.filterType = MNN::CV::Filter::BILINEAR; config.wrap = CLAMP_TO_EDGE; std::shared_ptr process(ImageProcess::create(config)); @@ -189,7 +192,7 @@ class ImageProcessGrayToGrayNearestTransformTest : public MNNTestCase { ImageProcess::Config config; config.sourceFormat = GRAY; config.destFormat = GRAY; - config.filterType = NEAREST; + config.filterType = MNN::CV::Filter::NEAREST; config.wrap = ZERO; std::shared_ptr process(ImageProcess::create(config)); @@ -444,7 +447,7 @@ class ImageProcessRGBAToGrayBilinearTransformTest : public MNNTestCase { ImageProcess::Config config; config.sourceFormat = RGBA; config.destFormat = GRAY; - config.filterType = BILINEAR; + config.filterType = MNN::CV::Filter::BILINEAR; config.wrap = CLAMP_TO_EDGE; std::shared_ptr process(ImageProcess::create(config)); @@ -483,7 +486,7 @@ class ImageProcessRGBAToGrayNearestTransformTest : public MNNTestCase { ImageProcess::Config config; config.sourceFormat = RGBA; config.destFormat = GRAY; - config.filterType = NEAREST; + config.filterType = MNN::CV::Filter::NEAREST; config.wrap = CLAMP_TO_EDGE; std::shared_ptr process(ImageProcess::create(config)); @@ -772,7 +775,7 @@ class ImageProcessColorResizeTest: public MNNTestCase { // Test: first color then resize and first resize then color, these two results are same. 
virtual ~ImageProcessColorResizeTest() = default; virtual bool run(int precison) { - std::vector filters(NEAREST, BILINEAR); + std::vector filters = {MNN::CV::Filter::NEAREST, MNN::CV::Filter::BILINEAR}; for (int iw = 2; iw < 200; iw += 17) { for (int ih = 7; ih < 200; ih += 19) { for (int ow = 2; ow < 200; ow += 17) { @@ -802,5 +805,472 @@ class ImageProcessColorResizeTest: public MNNTestCase { return true; } }; -MNNTestSuiteRegister(ImageProcessColorResizeTest, "cv/image_process/color_resize_test"); +// MNNTestSuiteRegister(ImageProcessColorResizeTest, "cv/image_process/color_resize_test"); +static int format2Channel(CV::ImageFormat format) { + switch (format) { + case CV::RGB: + case CV::BGR: + case CV::YCrCb: + case CV::YUV: + case CV::HSV: + case CV::XYZ: + case CV::YUV_NV21: + case CV::YUV_NV12: + case CV::YUV_I420: + return 3; + case CV::BGR555: + case CV::BGR565: + return 2; + case CV::GRAY: + return 1; + case CV::RGBA: + case CV::BGRA: + return 4; + default: + return 3; + } +} + +static VARP cvtImpl(VARP src, ImageFormat srcformat, ImageFormat dstformat,int h, int w) { + int oc = format2Channel(dstformat); + auto type = halide_type_of(); + auto dest = Tensor::create({1, h, w, oc}, type); + std::unique_ptr process(CV::ImageProcess::create(srcformat, dstformat)); + process->convert(src->readMap(), w, h, 0, dest); + auto res = Express::Variable::create(Express::Expr::create(dest, true), 0); + return _Squeeze(res, {0}); +} + +static void getVARPSize(VARP var, int* height, int* width, int* channel) { + auto info = var->getInfo(); + auto dims = info->dim; + int num = dims.size(); + if (num < 2) return; + if (num == 2) { + *height = dims[0]; + *width = dims[1]; + *channel = 1; + } else if (num == 3) { + *height = dims[0]; + *width = dims[1]; + *channel = dims[2]; + } else if (info->order == NHWC) { + *channel = dims[num - 1]; + *width = dims[num - 2]; + *height = dims[num - 3]; + } else { // NCHW + *width = dims[num - 1]; + *height = dims[num - 2]; + *channel = dims[num - 3]; + } +} + +static VARP cvtColor(VARP src, ImageFormat srcformat, ImageFormat dstformat) { + int h, w, c; + getVARPSize(src, &h, &w, &c); + return cvtImpl(src, srcformat, dstformat, h, w); +} + +class ImageProcessSpeed: public MNNTestCase { + virtual ~ImageProcessSpeed() = default; + virtual bool run(int precison) { + int LOOP = 10000; + int warmup = 2; + int ih = 240, iw = 240; + { + int ic = 4; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGBA, BGR); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGBA, BGR); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGBA->BGR: cost time=%.3f ms\n", duration); + } + { + int ic = 4; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGBA, BGRA); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGBA, BGRA); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGBA->BGRA: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto 
inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, BGR); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, BGR); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->BGR: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, RGBA); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, RGBA); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->RGBA: cost time=%.3f ms\n", duration); + } + + { + int ic = 4; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, BGRA, BGR); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, BGRA, BGR); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("BRGA->BGR: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, GRAY); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, GRAY); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->GRAY: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, BGR, GRAY); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, BGR, GRAY); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("BGR->GRAY: cost time=%.3f ms\n", duration); + } + + { + int ic = 4; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, BGRA, GRAY); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, BGRA, GRAY); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("BGRA->GRAY: cost time=%.3f ms\n", duration); + } + + { + int ic = 4; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGBA, GRAY); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGBA, GRAY); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGBA->GRAY: cost time=%.3f ms\n", duration); + } + + { + int ic = 1; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = 
_Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, GRAY, RGBA); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, GRAY, RGBA); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("GRAY->RGBA: cost time=%.3f ms\n", duration); + } + + { + int ic = 1; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, GRAY, RGB); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, GRAY, RGB); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("GRAY->RGB: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, YUV); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, YUV); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->YUV: cost time=%.3f ms\n", duration); + } + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, XYZ); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, XYZ); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->XYZ: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, HSV); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, HSV); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->HSV: cost time=%.3f ms\n", duration); + } + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, BGR555); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, BGR555); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->BGR555: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, BGR, BGR555); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, BGR, BGR555); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("BGR->BGR555: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto 
srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, BGR, BGR565); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, BGR, BGR565); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("BGR->BGR565: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, RGB, BGR565); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, RGB, BGR565); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("RGB->BGR565: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, YUV_NV21, RGB); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, YUV_NV21, RGB); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("YUV_NV21->RGB: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, YUV_NV21, BGR); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, YUV_NV21, BGR); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("YUV_NV21->BGR: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, YUV_NV21, BGRA); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, YUV_NV21, BGRA); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("YUV_NV21->BGRA: cost time=%.3f ms\n", duration); + } + + { + int ic = 3; + auto srcvec = genSourceData(ih, iw, ic); + auto srcVar = _Input({ih, iw, ic}, NHWC, halide_type_of()); + auto inputPtr = srcVar->writeMap(); + memcpy(inputPtr, srcvec.data(), srcVar->getInfo()->size * sizeof(uint8_t)); + + for (int i = 0; i < warmup; ++i) { + cvtColor(srcVar, YUV_NV21, RGBA); + } + Timer l_; + for (int i = 0; i < LOOP; ++i) { + cvtColor(srcVar, YUV_NV21, RGBA); + } + auto duration = (float)l_.durationInUs() / 1000.f / LOOP; + printf("YUV_NV21->RGBA: cost time=%.3f ms\n", duration); + } + return true; + } +}; +// MNNTestSuiteRegister(ImageProcessSpeed, "cv/image_process/speed"); diff --git a/test/expr/ModuleTest.cpp b/test/expr/ModuleTest.cpp index 56664bfe7..233711fda 100644 --- a/test/expr/ModuleTest.cpp +++ b/test/expr/ModuleTest.cpp @@ -33,7 +33,7 @@ static VARP convBlock(VARP x, INTS channels, int stride) { static VARP convBlocTemp(VARP x, INTS channels, int stride) { int inputChannel = channels[0], 
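Each ImageProcessSpeed case above repeats the same pattern: fill a uint8 NHWC source, warm up twice, then time MNN::CV::ImageProcess::convert through the cvtColor helper and report the mean per-call latency. A condensed standalone sketch of that measurement, assuming an MNN build with the CV image-process module and MNN::Timer from MNN/AutoTime.hpp (an illustration only, not code added by this patch):

```
#include <MNN/ImageProcess.hpp>
#include <MNN/Tensor.hpp>
#include <MNN/AutoTime.hpp>
#include <cstdio>
#include <memory>
#include <vector>

int main() {
    const int h = 240, w = 240, loop = 10000;
    std::vector<uint8_t> rgba(h * w * 4, 127);                  // dummy RGBA source image
    // Destination tensor in NHWC layout, 3 channels for the BGR result.
    std::unique_ptr<MNN::Tensor> dst(MNN::Tensor::create<uint8_t>({1, h, w, 3}));
    std::unique_ptr<MNN::CV::ImageProcess> proc(
        MNN::CV::ImageProcess::create(MNN::CV::RGBA, MNN::CV::BGR));
    for (int i = 0; i < 2; ++i) {                               // warm up
        proc->convert(rgba.data(), w, h, 0, dst.get());
    }
    MNN::Timer timer;
    for (int i = 0; i < loop; ++i) {
        proc->convert(rgba.data(), w, h, 0, dst.get());         // conversion under test
    }
    printf("RGBA->BGR: %.3f ms per convert\n",
           (float)timer.durationInUs() / 1000.0f / loop);
    return 0;
}
```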
outputChannel = channels[1]; int group = inputChannel; - x = _Conv(0.002f, 1.0f, x, {inputChannel, inputChannel}, {3, 3}, SAME, {stride, stride}, {1, 1}); + x = _Conv(0.002f, 1.0f, x, {inputChannel, inputChannel}, {3, 3}, SAME, {stride, stride}, {1, 1}, inputChannel); x = _Conv(0.05f, -2.0f, x, {inputChannel, outputChannel}, {1, 1}, SAME, {1, 1}, {1, 1}, 1); return x; } @@ -1190,3 +1190,144 @@ class WinogradMemoryTest : public MNNTestCase { } }; MNNTestSuiteRegister(WinogradMemoryTest, "expr/WinogradMemoryTest"); + + +class SequenceMemoryTest : public MNNTestCase { +public: + virtual bool run(int precision) { + auto res = _run(precision, false); + if (!res) { + FUNC_PRINT(1); + return false; + } + return _run(precision, true); + } + virtual bool _run(int precision, bool shapeMultable) { + BackendConfig bnConfig; + auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1); + ExecutorScope scope(exe); + Module::Config config; + config.shapeMutable = shapeMultable; + config.rearrange = true; + std::vector buffer; + { + // Make Buffer + auto x0 = _Input({1, 3, -1, -1}, NCHW, halide_type_of()); + x0->setName("x0"); + auto y0 = _mobileNetV1Expr(_Convert(x0, NC4HW4), false); + y0->setName("y0"); + buffer = Variable::save({y0}); + } + auto rtInfo = Express::ExecutorScope::Current()->getRuntime(); + auto rt = rtInfo.first.begin()->second; + MNN::ScheduleConfig sconfig; + std::vector sconfigs = {sconfig}; + std::shared_ptr rtMgr(Executor::RuntimeManager::createRuntimeManager(sconfigs)); + rtMgr->setMode(Interpreter::Session_Memory_Collect); + std::shared_ptr m0(Module::load({"x0"}, {"y0"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + std::shared_ptr m1(Module::load({"x0"}, {"y0"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + float memoryInit = 0.0f; + rtMgr->getInfo(Interpreter::MEMORY, &memoryInit); + FUNC_PRINT_ALL(memoryInit, f); + auto x = _Input({1, 3, 96, 96}, NCHW, halide_type_of()); + x->writeMap(); + x->unMap(); + auto x1 = _Input({1, 3, 97, 97}, NCHW, halide_type_of()); + x1->writeMap(); + x1->unMap(); + auto x2 = _Input({1, 3, 95, 95}, NCHW, halide_type_of()); + x2->writeMap(); + x2->unMap(); + float memoryCurrent = 0.0f; + auto compute = [&](){ + m0->onForward({x}); + rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent); + auto dynamic0 = memoryCurrent - memoryInit; + FUNC_PRINT_ALL(dynamic0, f); + m1->onForward({x1}); + rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent); + auto dynamic1 = memoryCurrent - memoryInit; + + FUNC_PRINT_ALL(dynamic1, f); + m1->onForward({x2}); + rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent); + auto dynamic2 = memoryCurrent - memoryInit; + FUNC_PRINT_ALL(dynamic2, f); + + if (dynamic1 > dynamic0 * 1.1f || dynamic2 > dynamic1) { + MNN_ERROR("Dynamic Memory reuse error\n"); + return false; + } + return true; + }; + bool res = compute(); + if (!res) { + return false; + } + exe->gc(MNN::Express::Executor::FULL); + rtMgr->getInfo(Interpreter::MEMORY, &memoryCurrent); + auto dynamic3 = memoryCurrent - memoryInit; + FUNC_PRINT_ALL(dynamic3, f); + if (dynamic3 > 0.2) { + MNN_ERROR("Dynamic Memory GC error\n"); + return false; + } + res = compute(); + if (!res) { + return false; + } + return true; + } +}; +MNNTestSuiteRegister(SequenceMemoryTest, "expr/SequenceMemoryTest"); + +class PrearrangeTest : public MNNTestCase { +public: + virtual bool run(int precision) { + // Make Model include convolution in shape compute and content compute + auto x = _Input({1, 3, 24, 24}, 
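SequenceMemoryTest above leans on two pieces of the runtime API: Interpreter::Session_Memory_Collect makes the shared RuntimeManager account for every module loaded through it, and getInfo(Interpreter::MEMORY, ...) reads the current figure in MB, so the test can check that a smaller input reuses the existing pool and that Executor::gc(FULL) releases it. A minimal query sketch, assuming the includes and using-directives already present in ModuleTest.cpp plus a serialized model in `buffer` (input "x0", output "y0") and a prepared input `x`; the helper name is hypothetical:

```
static float measureDynamicMemory(const std::vector<char>& buffer, VARP x) {
    MNN::ScheduleConfig sconfig;
    std::vector<MNN::ScheduleConfig> sconfigs = {sconfig};
    std::shared_ptr<Executor::RuntimeManager> rtMgr(
        Executor::RuntimeManager::createRuntimeManager(sconfigs));
    rtMgr->setMode(MNN::Interpreter::Session_Memory_Collect);   // turn on memory accounting

    Module::Config config;
    config.rearrange = true;
    std::shared_ptr<Module> m(Module::load({"x0"}, {"y0"},
        (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy);

    float before = 0.0f, after = 0.0f;
    rtMgr->getInfo(MNN::Interpreter::MEMORY, &before);          // MB before inference
    m->onForward({x});
    rtMgr->getInfo(MNN::Interpreter::MEMORY, &after);           // MB with dynamic buffers allocated
    return after - before;
}
```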
NCHW, halide_type_of()); + x->setName("x"); + auto xs = _Convert(_Reshape(_Cast(_Shape(x, NCHW)), {1, 1, 2, 2}), NC4HW4); + xs = _Convert(_Conv(1.0f, 0.0f, xs, {1, 1}, {2, 2}), NCHW); + auto y = _Conv(0.1f, 0.0f, _Convert(x, NC4HW4), {3, 1}, {3, 3}); + y = _Convert(y, NCHW); + y = _ReduceMean(y); + y = y * _Reciprocal(xs); + auto info = y->getInfo(); + y->setName("y"); + auto buffer = Variable::save({y}); + MNN::ScheduleConfig sconfig; + BackendConfig bnConfig; + bnConfig.precision = MNN::BackendConfig::Precision_Low; + sconfig.backendConfig = &bnConfig; + auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 4); + ExecutorScope scope(exe); + std::vector sconfigs = {sconfig}; + std::shared_ptr rtMgr(Executor::RuntimeManager::createRuntimeManager(sconfigs)); + rtMgr->setMode(Interpreter::Session_Memory_Collect); + Module::Config config; + config.rearrange = false; + std::shared_ptr m0(Module::load({"x"}, {"y"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + config.rearrange = true; + std::shared_ptr m1(Module::load({"x"}, {"y"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + auto size = x->getInfo()->size; + auto xPtr = x->writeMap(); + for (int v=0; vonForward({x})[0]->readMap()[0]; + auto y1 = m1->onForward({x})[0]->readMap()[0]; + if (fabsf(y0 - y1) > 0.000001f) { + return false; + } + rtMgr->setExternalPath(".", Interpreter::EXTERNAL_FEATUREMAP_DIR); + std::shared_ptr m2(Module::load({"x"}, {"y"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + auto y2 = m2->onForward({x})[0]->readMap()[0]; + if (fabsf(y0 - y2) > 0.000001f) { + return false; + } + return true; + } +}; +MNNTestSuiteRegister(PrearrangeTest, "expr/PrearrangeTest"); + diff --git a/test/op/AttentionTest.cpp b/test/op/AttentionTest.cpp new file mode 100644 index 000000000..402391952 --- /dev/null +++ b/test/op/AttentionTest.cpp @@ -0,0 +1,241 @@ +// +// AttentionTest.cpp +// MNNTests +// +// Created by MNN on 2024/07/23. +// Copyright © 2018, Alibaba Group Holding Limited +// +#ifdef MNN_SUPPORT_TRANSFORMER_FUSE +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" +#include +#include + +using namespace MNN::Express; + +int NumHead = 16; +int KvNumHead = 2; +int HeadDim = 128; +const float diff_threshold = 0.001; +const float diff_percent_threshold = 0.1; + +static std::vector< std::vector< std::vector > > generateRandTensor(int C, int H, int W) { + std::vector< std::vector< std::vector > > a; + a.resize(C); + for (int i = 0; i < C; i++) { + a[i].resize(H); + for (int j = 0; j < H; j++) { + a[i][j].resize(W); + for (int k = 0; k < W; k++) { + a[i][j][k] = (float)rand() / (float)RAND_MAX * 10.0 * (rand() % 2 ? 
1 : -1); + } + } + } + return a; +} + +VARP vector_to_var(std::vector< std::vector< std::vector > > & a) { + int C = a.size(); + int H = a[0].size(); + int W = a[0][0].size(); + VARP var = _Input({1, C, H, W}, NCHW, halide_type_of()); + float * ptr = var->writeMap(); + for (int i = 0; i < C; i++) { + for (int j = 0; j < H; j++) { + for (int k = 0; k < W; k++) { + ptr[i * H * W + j * W + k] = a[i][j][k]; + } + } + } + var->unMap(); + return var; +} + +VARP vector_to_var(std::vector< std::vector > & a) { + int H = a.size(); + int W = a[0].size(); + VARP var = _Input({1, 1, H, W}, NCHW, halide_type_of()); + int * ptr = var->writeMap(); + for (int i = 0; i < H; i++) { + for (int j = 0; j < W; j++) { + ptr[i * W + j] = a[i][j]; + } + } + var->unMap(); + return var; +} + +static std::vector< std::vector< std::vector > > +computeAttention ( + std::vector< std::vector< std::vector > > & query, + std::vector< std::vector< std::vector > > & key, + std::vector< std::vector< std::vector > > & value, + std::vector< std::vector > & mask, + int seq_len, int kv_seq_len ) +{ + int group_size = NumHead / KvNumHead; + std::vector< std::vector< std::vector > > output(seq_len); + for (int i = 0; i < seq_len; i++) { + output[i].resize(NumHead); + for (int j = 0; j < NumHead; j++) { + output[i][j].resize(HeadDim); + } + } + for (int h = 0; h < NumHead; h++) { + int kv_h = h / group_size; + /*---- Q * K ----*/ + std::vector< std::vector > qk(seq_len, std::vector(kv_seq_len, 0.0f)); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < kv_seq_len; j++) { + qk[i][j] = 0.0f; + for (int k = 0; k < HeadDim; k++) { + qk[i][j] += query[i][h][k] * key[j][kv_h][k]; + } + } + } + /*---- Mask QK ----*/ + float scale = 1.0 / sqrt(HeadDim); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < kv_seq_len; j++) { + if (mask[i][j] == 1) { + qk[i][j] *= scale; + } else { + qk[i][j] = std::numeric_limits::lowest(); + } + } + } + /*---- Softmax QK ----*/ + for (int i = 0; i < seq_len; i++) { + float maxValue = qk[i][0]; + for (int j = 1; j < kv_seq_len; j++) { + maxValue = ALIMAX(maxValue, qk[i][j]); + } + for (int j = 0; j < kv_seq_len; j++) { + qk[i][j] -= maxValue; + } + float sum = 0.0f; + for (int j = 0; j < kv_seq_len; j++) { + sum += exp(qk[i][j]); + } + for (int j = 0; j < kv_seq_len; j++) { + qk[i][j] = exp(qk[i][j]) / sum; + } + } + /*---- QK * V ----*/ + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < HeadDim; j++) { + output[i][h][j] = 0.0f; + for (int k = 0; k < kv_seq_len; k++) { + output[i][h][j] += qk[i][k] * value[k][kv_h][j]; + } + } + } + } + return output; +} + +class NaiveAttention { + private: + std::vector< std::vector< std::vector > > mPastKey, mPastValue; + int mPastLen; + public: + NaiveAttention() : mPastLen(0) {} + ~NaiveAttention() = default; + std::vector< std::vector< std::vector > > onExecute ( + std::vector< std::vector< std::vector > > & query, + std::vector< std::vector< std::vector > > & key, + std::vector< std::vector< std::vector > > & value, + std::vector< std::vector > & mask, + int seq_len ) + { + for (int i = 0; i < seq_len; i++) { + mPastKey.push_back(key[i]); + mPastValue.push_back(value[i]); + } + mPastLen += seq_len; + return computeAttention(query, mPastKey, mPastValue, mask, seq_len, mPastLen); + } +}; + +class AttentionTest : public MNNTestCase { + protected: + std::vector< std::vector< std::vector > > query; + std::vector< std::vector< std::vector > > key; + std::vector< std::vector< std::vector > > value; + std::vector< std::vector > mask; + std::vector< 
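computeAttention above is the scalar reference the fused CPU attention op is checked against. With d = HeadDim = 128, group size g = NumHead / KvNumHead = 8 (so query head h reads key/value head ⌊h/g⌋), L_kv the accumulated key/value length and m the 0/1 mask, the loops implement, per query position i and head h:

```
s^{(h)}_{i,k} = \begin{cases}
  \dfrac{\langle Q_{i,h},\,K_{k,\lfloor h/g\rfloor}\rangle}{\sqrt{d}} & m_{i,k} = 1,\\[4pt]
  -\infty & m_{i,k} = 0,
\end{cases}
\qquad
O_{i,h} = \sum_{k=1}^{L_{kv}} \operatorname{softmax}_k\!\big(s^{(h)}_{i,\cdot}\big)_k \; V_{k,\lfloor h/g\rfloor}
```

with the softmax evaluated after subtracting the row maximum, exactly as in the loops (the code uses std::numeric_limits<float>::lowest() in place of minus infinity).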
std::vector< std::vector > > expected_result; + VARP Query, Key, Value, Mask, Output; +public: + AttentionTest() = default; + virtual ~AttentionTest() = default; + + void generateInput(int seq_len) { + query = generateRandTensor(seq_len, NumHead, HeadDim); + key = generateRandTensor(seq_len, KvNumHead, HeadDim); + value = generateRandTensor(seq_len, KvNumHead, HeadDim); + Query = vector_to_var(query); + Key = vector_to_var(key); + Value = vector_to_var(value); + } + + void generateMask(int seq_len, int kv_seq_len) { + mask.resize(seq_len); + for (int i = 0; i < seq_len; i++) { + mask[i].resize(kv_seq_len); + for (int j = 0; j < kv_seq_len; j++) { + if (j - i <= kv_seq_len - seq_len) { + mask[i][j] = 1; + } else { + mask[i][j] = 0; + } + } + } + Mask = vector_to_var(mask); + } + + bool compareResult(int seq_len) { + const float * resultPtr = Output->readMap(); + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < NumHead; j++) { + for (int k = 0; k < HeadDim; k++) { + float diff = fabs(resultPtr[i * NumHead * HeadDim + j * HeadDim + k] - expected_result[i][j][k]); + float diff_percent = fabs(diff / expected_result[i][j][k]); + if (diff > diff_threshold && diff_percent > diff_percent_threshold) { + printf("Result Mismatch: expected %lf but got %lf in CPU Attention Test\n", expected_result[i][j][k], resultPtr[i * NumHead * HeadDim + j * HeadDim + k]); + printf("Error Position: Output[%d][%d][%d]\n", i, j, k); + return false; + } + } + } + } + Output->unMap(); + return true; + } + + virtual bool run(int precision) { + srand(2024); + std::shared_ptr naiveAttention(new NaiveAttention); + std::shared_ptr attention(new MNN::OpT); + attention->type = MNN::OpType_Attention; + attention->main.type = MNN::OpParameter_AttentionParam; + attention->main.value = new MNN::AttentionParamT; + attention->main.AsAttentionParam()->kv_cache = true; + int seq_len = 10; + generateInput(seq_len); + generateMask(seq_len, seq_len); + expected_result = naiveAttention->onExecute(query, key, value, mask, seq_len); + Output = Variable::create(Expr::create(attention.get(), {Query, Key, Value, Mask})); + bool pass = compareResult(seq_len); + if (pass) { + printf("CPU attention unit test passed!\n"); + } else { + printf("Error: CPU attention unit test failed!\n"); + } + return pass; + } +}; + +MNNTestSuiteRegister(AttentionTest, "op/cpu_attention"); +#endif diff --git a/test/op/RasterTest.cpp b/test/op/RasterTest.cpp index 517da5374..2dd10eb73 100644 --- a/test/op/RasterTest.cpp +++ b/test/op/RasterTest.cpp @@ -8,6 +8,7 @@ #include #include +#include "RuntimeAttr.hpp" #include "MNNTestSuite.h" #include "TestUtils.h" @@ -211,6 +212,12 @@ class ReduceBlitTest : public MNNTestCase { return true; } virtual bool run(int precision) { + // TODO: Other Backend Support Reduce Blit + auto attr = ExecutorScope::Current()->getAttr(); + if (attr->firstType != MNN_FORWARD_CPU) { + MNN_ERROR("Currently only cpu backend support reduce blit\n"); + return true; + } ExecutorScope::Current()->lazyEval = false; auto res = _run(precision, false); if (!res) { diff --git a/test/speed/HybridConvSpeedTest.cpp b/test/speed/HybridConvSpeedTest.cpp index 2354c4c58..42968330d 100644 --- a/test/speed/HybridConvSpeedTest.cpp +++ b/test/speed/HybridConvSpeedTest.cpp @@ -132,20 +132,32 @@ class HybridConvSpeedInt8Test : public HybridConvSpeedTestCommon { class HybridConvInt8Test : public HybridConvSpeedTestCommon { public: virtual bool run(int precision) { - std::vector< std::vector> channels = {{7, 9}, {2048, 6144}, {1, 10}, {20, 153}, {9, 18}}; + 
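compareResult above flags an element only when both diff_threshold and diff_percent_threshold are exceeded, so with y the op output and ŷ the reference value an element passes when either error bound holds:

```
\text{pass}(y,\hat y) \iff |y-\hat y| \le 10^{-3} \;\lor\; \frac{|y-\hat y|}{|\hat y|} \le 0.1
```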
INTS strides = {1, 1}, dilate = {1, 1}, pad = {0, 0}, inputShape = {1, 1}; // {w, h} int testBatchCount = 5; // std::vector batch(testBatchCount); std::vector batch = {1, 23, 1479, 38, 29}; std::vector kernels = {1, 1}; - std::vector weightBits = {8}; bool lowmemory = true; - for (auto& bits : weightBits) { + { + std::vector< std::vector> channels = {{7, 9}, {2048, 6144}, {1, 10}, {20, 153}, {9, 18}}; + for (int i = 0; i < channels.size(); ++i) { + for (int n = 0; n < 5; ++n) { + auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channels[i], pad, strides, dilate, batch[n], 8, precision); + if (!res) { + MNN_ERROR("Error: low memory hybridConv when bits=8, n=%d, ic=%d, oc=%d\n", batch[n], channels[i][0], channels[i][1]); + return false; + } + } + } + } + { + std::vector< std::vector> channels = {{2048, 6144}, {8, 8}, {8, 9}, {8, 16}}; for (int i = 0; i < channels.size(); ++i) { - for (int n = 0; n < batch.size(); ++n) { - auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channels[i], pad, strides, dilate, batch[n], bits, precision); + for (int n = 0; n < 5; ++n) { + auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channels[i], pad, strides, dilate, batch[n], 4, precision); if (!res) { - MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channels[i][0], channels[i][1]); + MNN_ERROR("Error: low memory hybridConv when bits=4, n=%d, ic=%d, oc=%d\n", batch[n], channels[i][0], channels[i][1]); return false; } } diff --git a/tools/converter/include/config.hpp b/tools/converter/include/config.hpp index befc9d51f..63e72052d 100644 --- a/tools/converter/include/config.hpp +++ b/tools/converter/include/config.hpp @@ -53,6 +53,7 @@ class MNN_PUBLIC modelConfig { bool detectSparseSpeedUp = true; bool convertMatmulToConv = true; bool transformerFuse = false; + bool allowCustomOp = false; std::string customOpLibs = ""; std::string authCode = ""; std::string testDir = ""; diff --git a/tools/converter/source/common/cli.cpp b/tools/converter/source/common/cli.cpp index 89e83ab26..bc2399b36 100644 --- a/tools/converter/source/common/cli.cpp +++ b/tools/converter/source/common/cli.cpp @@ -295,6 +295,11 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv "transformerFuse", "fuse attention op, like fmhaV2/fmhca/splitGelu/groupNorm. default: false", cxxopts::value() + ) + ( + "allowCustomOp", + "allow custom op when convert. 
default: false", + cxxopts::value() ); auto result = options.parse(argc, argv); @@ -489,6 +494,9 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv if (result.count("transformerFuse")) { modelPath.transformerFuse = true; } + if (result.count("allowCustomOp")) { + modelPath.allowCustomOp = true; + } return true; } @@ -595,7 +603,7 @@ static void computeUnaryBuffer(MNN::NetT* net) { auto inputId = op->inputIndexes[0]; if (describes.find(inputId) == describes.end()) { auto iter = describes.find(outputId); - + } unaryDes = describes.find(inputId)->second; float inpScale = unaryDes->quantInfo->scale; @@ -704,7 +712,7 @@ bool Cli::convertModel(modelConfig& modelPath) { MNN_PRINT("MNN net has tensor quant info\n"); computeUnaryBuffer(newNet.get()); } - + error = writeFb(newNet, modelPath.MNNModel, modelPath); } else { error = writeFb(netT, modelPath.MNNModel, modelPath); diff --git a/tools/converter/source/common/writeFb.cpp b/tools/converter/source/common/writeFb.cpp index e46b50a7e..4132a2770 100644 --- a/tools/converter/source/common/writeFb.cpp +++ b/tools/converter/source/common/writeFb.cpp @@ -60,7 +60,7 @@ static float _computeOpExternalSizeInMB(const MNN::OpT* op) { } return blob->external[1] / 1024.0f / 1024.0f; } - + default: break; } @@ -166,7 +166,7 @@ int writeFb(std::unique_ptr& netT, const std::string& MNNModelFile, c } std::ostringstream notSupportInfo; - if (!notSupportOps.empty()) { + if (!notSupportOps.empty() && !config.allowCustomOp) { for (auto name : notSupportOps) { notSupportInfo << name << " | "; } diff --git a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp index f52931fd8..4e714a1e8 100644 --- a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp +++ b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp @@ -28,8 +28,8 @@ static FormatSetType _getFormatType(const OpT* op, MNN_DATA_FORMAT originFormat) switch (op->type) { // NC4HW4 Ops with multi-input case MNN::OpType_SeqLen2Spatial: - case MNN::OpType_GroupNorm: - case MNN::OpType_Convolution: + case MNN::OpType_FmhaV2: + case MNN::OpType_Convolution: case MNN::OpType_Convolution3D: case MNN::OpType_ConvolutionDepthwise: case MNN::OpType_Deconvolution: diff --git a/tools/cpp/CMakeLists.txt b/tools/cpp/CMakeLists.txt index 134340106..c560fb401 100644 --- a/tools/cpp/CMakeLists.txt +++ b/tools/cpp/CMakeLists.txt @@ -1,4 +1,9 @@ set(MNN_CPP_TOOLS "") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") + add_definitions(-DMNN_USE_NEON) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") + add_definitions(-DMNN_USE_NEON) +endif() add_executable(GetMNNInfo ${CMAKE_CURRENT_LIST_DIR}/GetMNNInfo.cpp) list(APPEND MNN_CPP_TOOLS GetMNNInfo) add_executable(ModuleBasic.out ${CMAKE_CURRENT_LIST_DIR}/ModuleBasic.cpp) diff --git a/tools/cpp/ModuleBasic.cpp b/tools/cpp/ModuleBasic.cpp index 04954e7a4..6f0eb2538 100644 --- a/tools/cpp/ModuleBasic.cpp +++ b/tools/cpp/ModuleBasic.cpp @@ -8,6 +8,7 @@ #include "MNN_generated.h" #include +#include #include #include #define MNN_OPEN_TIME_TRACE @@ -93,6 +94,10 @@ int main(int argc, char *argv[]) { MNN_ERROR("Usage: ./ModuleBasic.out ${test.mnn} ${Dir} [runMask] [forwardType] [runLoops] [numberThread] [precision | memory] [cacheFile]\n"); return 0; } + BackendConfig backendConfigTmp; + auto _executor = Executor::newExecutor(MNN_FORWARD_CPU, backendConfigTmp, 1); + 
ExecutorScope _s(_executor); + std::string modelName = argv[1]; std::string directName = argv[2]; MNN_PRINT("Test %s from input info: %s\n", modelName.c_str(), directName.c_str()); @@ -277,6 +282,9 @@ int main(int argc, char *argv[]) { if (runMask & 1024) { rtmgr->setHint(Interpreter::DYNAMIC_QUANT_OPTIONS, 1); } + if (runMask & 2048) { + rtmgr->setExternalPath("tmp", Interpreter::EXTERNAL_FEATUREMAP_DIR); + } std::shared_ptr net; { AUTOTIME; @@ -419,6 +427,7 @@ int main(int argc, char *argv[]) { for (int i = 0; i < t; ++i) { Timer _l; auto out = net->onForward(inputs); + Variable::compute(out); for (auto o : out) { ((MNN::Tensor*)o->getTensor())->wait(MNN::Tensor::MAP_TENSOR_READ, true); } diff --git a/tools/cpp/getPerformance.cpp b/tools/cpp/getPerformance.cpp index ff3b5dfc4..c0b5d6f01 100644 --- a/tools/cpp/getPerformance.cpp +++ b/tools/cpp/getPerformance.cpp @@ -207,18 +207,17 @@ static void _testMemcpy() { int size = 1024 * 1024; int loop = 10000; std::vector threads; + int threadNumber = 2; + std::vector> tmp(threadNumber); + for (int i=0; i tmp0(size); - std::vector tmp1(size); - auto t0 = tmp0.data(); - auto t1 = tmp1.data(); + for (int i=0; i using namespace MNN; using namespace Express; @@ -30,7 +31,7 @@ constexpr const char* path = "./imgs/cat.jpg"; template VARP cv2mnn(const cv::Mat& src) { - VARP dst = _Input({ src.rows, src.cols, src.channels() }, NHWC, halide_type_of()); + VARP dst = _Input({ 1, src.rows, src.cols, src.channels() }, NHWC, halide_type_of()); auto inputPtr = dst->writeMap(); memcpy(inputPtr, src.ptr(0), dst->getInfo()->size * sizeof(T)); return dst; @@ -46,12 +47,12 @@ VARP cv2mnn(const cv::Mat& src) { #define arg_switch(COND, CASE0, CASE1, CASE2, CASE3) arg_concat(arg_switch_, COND)(CASE0, CASE1, CASE2, CASE3) #define BENCH_IMPL(mode, func, ...)\ - auto t1 = std::chrono::high_resolution_clock::now();\ +arg_switch(mode, cv::func(__VA_ARGS__);, auto dst = func(__VA_ARGS__);dst->readMap();, auto dst = func(__VA_ARGS__);dst[0]->readMap();, func(__VA_ARGS__);)\ + Timer l_;\ for (int i = 0; i < LOOP; i++) {\ arg_switch(mode, cv::func(__VA_ARGS__);, auto dst = func(__VA_ARGS__);dst->readMap();, auto dst = func(__VA_ARGS__);dst[0]->readMap();, func(__VA_ARGS__);)\ }\ - auto t2 = std::chrono::high_resolution_clock::now();\ - auto duration = std::chrono::duration_cast(t2 - t1).count() / (1000. * LOOP);\ + auto duration = (float)l_.durationInUs() / 1000.f / LOOP;\ times.push_back(duration); \ #define BENCHMARK_NAME(mode, name, func, ...) 
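The _testMemcpy change above moves the scratch buffers out of the per-thread lambda so every worker copies between preallocated, thread-private vectors, which makes the loop measure raw copy bandwidth rather than allocator behaviour. A standalone sketch of that kind of measurement (illustrative only; the thread count, buffer size and loop count here are arbitrary, not taken from the patch):

```
#include <chrono>
#include <cstdio>
#include <cstring>
#include <thread>
#include <vector>

int main() {
    const int size = 1024 * 1024;     // bytes copied per memcpy
    const int loop = 10000;
    const int threadNumber = 2;
    // One source/destination pair per thread, allocated up front.
    std::vector<std::vector<int8_t>> src(threadNumber), dst(threadNumber);
    for (int i = 0; i < threadNumber; ++i) {
        src[i].resize(size);
        dst[i].resize(size);
    }
    auto begin = std::chrono::steady_clock::now();
    std::vector<std::thread> workers;
    for (int i = 0; i < threadNumber; ++i) {
        workers.emplace_back([&, i]() {
            for (int j = 0; j < loop; ++j) {
                ::memcpy(dst[i].data(), src[i].data(), size);   // operation under test
            }
        });
    }
    for (auto& t : workers) {
        t.join();
    }
    auto end = std::chrono::steady_clock::now();
    double seconds = std::chrono::duration<double>(end - begin).count();
    double gb = (double)size * loop * threadNumber / (1024.0 * 1024.0 * 1024.0);
    printf("memcpy bandwidth: %.2f GB/s across %d threads\n", gb / seconds, threadNumber);
    return 0;
}
```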
\ @@ -73,17 +74,18 @@ void color(cv::Mat cvimg, VARP mnnimg) { cv::Mat dst; #define CVTCOLOR(code)\ BENCHMARK_NAME(0, code, cvtColor, cvimg, dst, cv::COLOR_##code)\ - BENCHMARK_NAME(1, code, cvtColor, mnnimg, COLOR_##code) + BENCHMARK_NAME(3, code, cvtColor, mnnimg, COLOR_##code) + CVTCOLOR(RGB2BGR) CVTCOLOR(RGB2GRAY) CVTCOLOR(RGB2RGBA) - CVTCOLOR(RGB2BGRA) CVTCOLOR(RGB2YUV) CVTCOLOR(RGB2XYZ) CVTCOLOR(RGB2HSV) CVTCOLOR(RGB2HSV_FULL) CVTCOLOR(RGB2BGR555) CVTCOLOR(RGB2BGR565) + } void filter(cv::Mat cvimg, VARP mnnimg) { diff --git a/tools/cv/source/imgproc/filter.cpp b/tools/cv/source/imgproc/filter.cpp index 5436b0170..b9e6c9204 100644 --- a/tools/cv/source/imgproc/filter.cpp +++ b/tools/cv/source/imgproc/filter.cpp @@ -32,7 +32,7 @@ static halide_type_t formatInput(VARP& src, bool fp = true) { src = _Convert(_Reshape(src, {1, channel, height, width}), NHWC); } } - if (fp) { + if (fp && src->getInfo() && src->getInfo()->type.code != halide_type_float) { src = _Cast(src, halide_type_of()); } return info->type; @@ -46,14 +46,17 @@ static VARP formatOutput(VARP src, halide_type_t type) { if (channel == 1) { squeeze_dims.push_back(-1); } - if (!squeeze_dims.empty()) { - src = _Squeeze(src, squeeze_dims); - } if (type == halide_type_of()) { src = _Minimum(src, _Scalar(255)); src = _Maximum(src, _Scalar(0)); } - return _Cast(src, type); + if (src->getInfo()) { + auto srctype = src->getInfo()->type; + if (srctype.code == type.code && srctype.bits == type.bits) { + return src; + } + } + return _Cast(src, type); // if same type, do not need. } template diff --git a/tools/train/source/nn/NN.cpp b/tools/train/source/nn/NN.cpp index a49c6afaf..8d49f6ada 100644 --- a/tools/train/source/nn/NN.cpp +++ b/tools/train/source/nn/NN.cpp @@ -710,7 +710,7 @@ class ConvBNReluFusedModule : public Module { int threadNumber = 1, ePack = 12; int unit2 = UP_DIV(outH * outW, ePack * threadNumber); int maxUnit = (int)::sqrtf((float)unit2); - const int MAX_UNIT = 4, MIN_UNIT = 2; + const int MAX_UNIT = 6, MIN_UNIT = 2; maxUnit = std::max(std::min(maxUnit, MAX_UNIT), MIN_UNIT); auto units = std::pair({0, 0}); diff --git a/transformers/diffusion/export/convert_mnn.py b/transformers/diffusion/export/convert_mnn.py new file mode 100644 index 000000000..b29b9ca38 --- /dev/null +++ b/transformers/diffusion/export/convert_mnn.py @@ -0,0 +1,21 @@ +import os +def convert(onnx_path, mnn_path, extra): + print('Onnx path: ', onnx_path) + print('MNN path: ', mnn_path) + print('Extra: ', extra) + convert_path = '../../../build/MNNConvert' + if not os.path.exists(convert_path): + print(convert_path + " not exist, use pymnn instead") + convert_path = 'mnnconvert' + models = ['text_encoder', 'unet', 'vae_decoder'] + for model in models: + cmd = convert_path + ' -f ONNX --modelFile ' + os.path.join(onnx_path, model, 'model.onnx') + ' --MNNModel ' + os.path.join(mnn_path, model + '.mnn') + ' --saveExternalData=1 ' + extra + print(cmd) + print(os.popen(cmd).read()) + +if __name__ == '__main__': + import sys + extra = "" + if len(sys.argv) > 3: + extra = sys.argv[3] + convert(sys.argv[1], sys.argv[2], extra) diff --git a/transformers/diffusion/pipeline.cpp b/transformers/diffusion/pipeline.cpp index ed35ba705..7ac441f04 100644 --- a/transformers/diffusion/pipeline.cpp +++ b/transformers/diffusion/pipeline.cpp @@ -113,6 +113,7 @@ bool Pipeline::load_modules() { // load text_encoder model { std::string model_path = mModelPath + "/text_encoder.mnn"; + MNN_PRINT("Load %s\n", model_path.c_str()); mModules[0].reset(Module::load( {"input_ids"}, 
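The NN.cpp hunk above only raises the training-side Winograd unit cap from 4 to 6; with the surrounding context lines (ePack = 12, threadNumber = 1) the selected unit remains:

```
\text{unit} = \max\!\Big(\min\!\Big(\Big\lfloor \sqrt{\big\lceil H_{out} W_{out} / (ePack \cdot threadNumber) \big\rceil} \Big\rfloor,\; \text{MAX\_UNIT}=6\Big),\; \text{MIN\_UNIT}=2\Big)
```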
{"last_hidden_state", "pooler_output"}, model_path.c_str(), runtime_manager_, &module_config)); @@ -125,6 +126,7 @@ bool Pipeline::load_modules() { // load unet model { std::string model_path = mModelPath + "/unet.mnn"; + MNN_PRINT("Load %s\n", model_path.c_str()); mModules[1].reset(Module::load( {"sample", "timestep", "encoder_hidden_states"}, {"out_sample"}, model_path.c_str(), runtime_manager_, &module_config)); @@ -137,6 +139,7 @@ bool Pipeline::load_modules() { // load vae_decoder model { std::string model_path = mModelPath + "/vae_decoder.mnn"; + MNN_PRINT("Load %s\n", model_path.c_str()); mModules[2].reset(Module::load( {"latent_sample"}, {"sample"}, model_path.c_str(), runtime_manager_, &module_config)); diff --git a/transformers/llm/config.json b/transformers/llm/config.json index f34f70063..d508b467d 100755 --- a/transformers/llm/config.json +++ b/transformers/llm/config.json @@ -6,10 +6,12 @@ "thread_num": 4, "precision": "low", "memory": "low", + "power":"normal", + "use_mmap":"false", "is_batch_quant": 1, "reuse_kv": false, "quant_kv": 0, "kvcache_limit": -1 -} \ No newline at end of file +} diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp index 72fcaa742..4ebba43d6 100644 --- a/transformers/llm/engine/include/llm/llm.hpp +++ b/transformers/llm/engine/include/llm/llm.hpp @@ -112,10 +112,11 @@ class MNN_PUBLIC Llm { class Embedding : public Llm { public: Embedding(std::shared_ptr config); - static Embedding* createEmbedding(const std::string& config_path); + static Embedding* createEmbedding(const std::string& config_path, bool load = true); static float dist(MNN::Express::VARP var0, MNN::Express::VARP var1); virtual void load() override; - MNN::Express::VARP embedding(const std::string& txt); + MNN::Express::VARP ids_embedding(const std::vector& ids); + MNN::Express::VARP txt_embedding(const std::string& txt); int dim() const; private: virtual std::vector tokenizer(const std::string& query) override; diff --git a/transformers/llm/engine/ios/README.md b/transformers/llm/engine/ios/README.md new file mode 100644 index 000000000..4a682eb73 --- /dev/null +++ b/transformers/llm/engine/ios/README.md @@ -0,0 +1,44 @@ +# mnn-llm ios demo + +🚀 本示例代码全部由`ChatGPT-4`生成。 + +## 速度 + +[旧版测试prompt](../resource/prompt.txt) +- Qwen-1.8b-chat 4bit + - iPhone 11 : pefill 52.00 tok/s, decode 16.23 tok/s + - iPhone 14 Pro: pefill 102.63 tok/s, decode 33.53 tok/s +- Qwen-1.8b-chat 8bit + - iPhone 11 : pefill 61.90 tok/s, decode 14.75 tok/s + - iPhone 14 Pro: pefill 105.41 tok/s, decode 25.45 tok/s + +--- + +[新版测试prompt](../resource/bench.txt) +- Qwen1.5-0.5b-chat 4bit + - iPhone 15 Pro: pefill 282.73 tok/s, decode 51.68 tok/s +- Qwen2-0.5b-instruct 4bit + - iPhone 15 Pro: pefill 234.51 tok/s, decode 51.36 tok/s +- Qwen2-1.5b-instruct 4bit + - iPhone 15 Pro: pefill 107.64 tok/s, decode 25.57 tok/s + +## 编译 +1. 编译 MNN iOS Framework: 在 MNN 根目录下执行 +``` +sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN_SUPPORT_TRANSFORMER_FUSE=true -DMNN_BUILD_LLM=true -DMNN_CPU_WEIGHT_DEQUANT_GEMM=true" +mv MNN-iOS-CPU-GPU/Static/MNN.framework transformers/llm/engine/ios/MNN.framework +``` +2. 下载模型文件: [Qwen1.5-0.5B-Chat-MNN](https://modelscope.cn/models/zhaode/Qwen1.5-0.5B-Chat-MNN/files) ,或者使用 export 下面的脚本导出模型 +3. 将模型文件拷贝到`${MNN根目录}/transformers/llm/engine/model/`目录下 +4. 在xcode项目属性中`Signing & Capabilities` > `Team`输入自己的账号;`Bundle Identifier`可以重新命名; +5. 
连接iPhone并编译执行,需要在手机端打开开发者模式,并在安装完成后在:`设置` > `通用` > `VPN与设备管理`中选择信任该账号; + +备注:如测试其他模型,可以将`ios/mnn-llm/model/`替换为其他模型的文件夹;同时修改`LLMInferenceEngineWrapper.m +38`的模型路径; + +## 性能 +等待模型加载完成后,发送:`benchmark`,即可进行benchmark测试; + +## 测试 +等待模型加载完成后即可发送信息,如下图所示: + +![ios-app](./ios_app.jpg) diff --git a/transformers/llm/engine/ios/ios_app.jpg b/transformers/llm/engine/ios/ios_app.jpg new file mode 100644 index 0000000000000000000000000000000000000000..72a9d54d9bb0fe9f640a713fb521f46605c70370 GIT binary patch literal 28478 zcmeFZ2V7I%moFMXq>J>XM4F&9r79q?AtE9o3J5|}qze%h2@(<%=}k~TL5OszkzRy^ zj)+JH=}AzK5=kiGNDA-#{&U~VZ|2Ut&)j?G-OqcU_YR+ZAcwQhS!=Jg_gd?_*3Rzt zT_R-vr3)4pARHVJ2nYBB*`0!%gK%>Eb^LYa{Oj1m_1Bqu&mOKld%5@S{YT^Bcn#;^ybs%kz(0{)bMxEf5i2u27CFE)GQq zrw9j^2*+**1Pap0{SWd$V*lyj;N;rFy_W|hLjb&?WM?^+N$0WacosydN=52aT zZeD&t;fJE)%Bt#`+PeCN#Kc@x=k*`X{!)^S{fN2*?-bUoznNi!Tn&K(KL%?BP~8zE|{&4bL5aF-4tcyy9n* zvMXBolyt9ACGI{PD7}1O6(u{n;)a10l6o0$uK#Vdz`uuCVM#oXD z1(PG?wJ{LpFIEgpt`ja4B~oe`sN8x#b@6i+ ztuD7ZxFs2x%PXNAa2CQ%s0w6C7}CdH(NkGiyeNCvMFxP9;|1t12efOng)FJoj)aV> z)Q@!)PBYs}57$*j8|>EOXhGh%PhlT<$nLC{Fk-GGj2}bHsfAaW#ugzByXVBwYFfF} zG^Z?@rreFHb#nTN(69Si>Ha|iMYdJZ~0^qn>tr=MlLY;q>;LZ+1H!8CiO*dm)R zjOI%BMDSWF!>Zx2=tFRGL?IA+@t9hgb@!Bq(bvi=`ST6yBe~))pY^A5(9XPp=VN~l#CL%xnsJ5*4hLyNMU8?9B!thE`zLx4AOJyAh zN9P@OKcS95DWR6ozYoLiBmzfp&(Ts-EAj?Fa1eP8g8W1h^H#reyh4exObFilyQx!8jW%z;t~wxO#q4aNcWSa`0+fi zI+0%Ag1O#@Q>N2{884mvMKIb6{N3*Mt<6w6_<-jTm{vQyEP7i0BF-TN~9GbSSeW@!QDS8{ox7Z==uYW1v-@K1OwDm3Rfe2L4q(e z>lW=7FWJ3PE~*I=%;Ubr%R=+9&XQJehDv~BhJR(U&4U~9#a(HVQg+7xv zY8!HA@;mc?Hd_1m%Y5mT``~tG^$ZUU*QT9_rCPybV6rAsMO zeeKUf!;S`Rwjkg*<}|P^ukb~S9b#XyGsm*!SeAe*13lszIZ&x%DWufQwZ!fr8d)KY8O0a@{y z=Yc-7_)kjpdsQD8O5EhQ*1{=#i5U&VGE_0Go)~Y!q&AyxkYR-$hVN}M!;lGkkePmX z9dElO*L&$EHpmZ`RT`d845`H~`Jer&1jlmo|Nf)KwFI9UXYV@;SQ0Vc`{#MO<^a`6 zFSaHvt#u8$Bri=nn3#Zq1xsQD)9MUu_^z94Xd7WGZiTBW5cYiaK+VE|KL5%@@Xd(3X5$$(a&p=kat`S+F2Ntw};j{rpQ zE@Y1v*!OKaLk>OV!o3TLxI`0JW*Y$$Bxl>!bkrj!z4X46tnBhr9dx4QhGIVD)4h2G zpvR7DdyUPvL|yE{@(^xYKbzxaD|=q^LMu{nt-BDxUpw>+^%DJjN^Od-AT{|#k-DNz za$Z;hpE$>#xASv=I)*7_X*Y7e$tHw4OgCq}+nn*@9uIHgW1MDQOBn75q_3i_bySCHMz`mVX~c12MOXx|bJU!G5-az{42SEG9#1In{1Re-n^&hy28NBPMg50bN{g;nGHikEQ!_ z(J?xo$wuE%lNSr-*W)E5_8L^&T}%<^|3avZWJ1?Rru3ye+RMTffO2HA8WMGus4Y($4!3=LEb4djo3dpxN9#VfkO zuo)p^1`3m%AkZ0w%50pk?4`b`k{Kuhh^13#gfOf8@@%qG^CjhQ-!$jp>Wm{ATNdwc zBm_srd_FCFNGB+`^7-i|f4Buo45*OmIJkde#V93utn9~YD{jK?uhbHfmZaj>&!sXt zI;{>#^;GK?mpvfyfvr?05-t+Yi4q)K7;KUvsh{)7x!Fr3?62u;*Y!)MFYQ7qM=S%3 z7c8L9GCzE>5iy@4RMoZmXbXKMi0jYDE>Ah7PA+uy`6YD6v@XPV0j+-yem&m6vtSc` z!Z=(e$nf15S`pIg}$ z^s(_+9us3$)h>ijxDzY*9n*GFvk9%haBoIP(VhC)*?xv6)&<|naS;jk2Y9T_#IJJv z-_dSCDONPY4k$Ow9TqhwgHrYNGAJQIIlXtrGK84-sTTe|ZW^@Gn9|bEj>n|;Uc2zX z+;L6)CF?z20@FqZ_~tRsp;L1caOzv+xoZQvkQ&q8MOGr^Nu|@DS(hIh20#fwhl*e^%xBN>}xx&#xRi>-XwcB=~ycH5L|j5Ah5}9XtW9t@bp0 zG#N^P1LrY6JlOr=og|u5??qJ$2 z0pqqtsfk?(Nq}L`RKkAm7YmxgPe7+k*0kl03#WZrGd&3*Kkf&P2s6d|i5QAM)t0nK zc}BDjTO)#YsV#kL`^A0c{`BaPrX!d_FUt>Q%|FIPg95$3(7O6$56ynNUi@AZIUWkG zk<|pMT@>N=95{s38`?22A^cw9aCtRaNI^Jx7sBr)*;wooxZkRzzpysjSLxfp)ScWL z7oEPT%DWn_nbflvSa}3uI+Qv1&I7ubWWW>$V)ccI^g z=@8a!ok`G@bDPN|Xc^k4IqyPRh4G`HQ3|2Ln4*zv*#8vi%pfJUAxaFk;@$Tq|^<4s=iEaW9}e_tE@JZy+fb zkaD0VWN76ZMp3l1$`4ni-AEu!JLSPWYwDAY?|o8+ua=j7CV0!L z|0GP}$a544njyWJTDL01IL^G9NZQ<%=c%XRK!=Sl*J)X5O!nGLSxQ;v3&W#Sf!Ju~ zYVXk#2`8k(HBOg7c9LU&Cgwd>0p>Pz65fv4Yih#u8aFjS3X=`Pw;27r-u8ItFr~?& zV0q_M^oYvv_lf+Pnje7%(I=zdDg~q-ESAe!&c3OKR)HrWp*d_Y$Z)AU!@?c(n9CL@U(?& z)gS*0{kn=m>%6r`>>jjmmE_>>facfKN_a>>&V7?Bgx_Mksdb%NLNboBTQ8B<8nqE6JoHPe9|jxt!0+ z+Jq~;g(^k4AsoF~25rq!r6$IpL`n)W=3!jGxOL3b^})qt1wBp0c8v-P@rq-}g&6br 
z@e}u-LwHJUTyz;hOtS$~Nfc*nCtd{zn}-WdV&#zD-3;p%0q+1V&9N#EUvX8+>FysE z%Aa3hq>_hxd)lyiGy+p%o+SLj4TulQWB2Vsq*3O%C`H;SdOS_LBTJszoL6fkH6eQP zaHD&0M-a*3#|D2(<>jQJz-SIDBGh3Jt^}Hk)^li;Ti`D%<|&36(wGR1u33wm+;028 zJmW)-KiY47M7J;GZgg>8B|LW*;@&CIYo@l2+J)%3DJAWlU!0>}U7Nzv-wd%erjUX3 z)3j!M7)d(cfg`XZQyytD!!AOjjsn_|~)~bWNn7CQj#-3M6F1 z>>BL|O`5v!9(6$D3XU#=sU%eQ#v-a$P3<$P7S^{ZGj$77?;gRUAFzk;YOHL`y*UwN zpaU>QU+lzGBciKkw1gwr$_*7}slgO;r{I!bzkPprtvIq$P_p4aC9l7!O0;rFmdxN*-?iYpx?v3I zTE7N;VEU6tkf=A*NBC$}$+zSRl`32IB-)tu2?b((+ya14IZ~+&&R<^o7{Q5TT&Ir6 zCPwRD%xfHeZzX+8|5w6b>d9xx9I{=`7Jq53EQ7YEA9Kz4yWc z%j&eWcDyldJbnxDB1>>^3&)Fc8XzThxEvhWlGC1SJY{EW_o`j2N5d<~Enms&j_;~R zec~GauC$U|2_+ae=|ry@HswZtH5CDDLA;SIIu|=m$)85P>b8DF76{UyFkv5-li*d8 zLdWy77Swq&34x-yw~kG${|F69rY3}>Ed2A;pk-)u$5JCBdwfBO_!fmxVzGo@UsX8CClaE~#bYL!@EjskWeo?psegH)EYKS5BMC%Shf-mV)vy-E> zP@?E^V@>s>=4*Rx_esr;Nx#DT+t-|?Z}$Jx|N83l$HwyYy%%|wg(Uu9K=1krmQ08+ zfst?#7!d;L?wrdn(32z$XIx6`kZr0p3K@Aa7-7!1P^ACbeB1YcwfJ$hhX&%dG3#p) zSn2X=XD530_?9piDyWXe)4nE!#8?hAS(c(ahMDHYz_q1}a}3io4N;$nlk-^nn{G*G zB>SZ@{5BLW#6w{BJaPT-(5&OE_jPXL7-v-0sK7Vl>q zm(x_wCI|aCE$LD0yyj!ne6{Z#i^@9UZqS*>se{8Zt}*3+9ujN{Y6sBq$AO<@8U0vR z<0g!EYexOj&~UpQef;>+j!{PD-C2^h@9)gt`B+zTg{HS>1wX`bZFBF8ni}(Xbs0%psM?o^*lvq>Rc9k-8g9TPfqZun_6~inzs{L5LKE+yf^2= z!$fxS$MgT;5Bo7qno{RFyza+6xe`}y8v2n9FALmS{0rPjDj#R_>_T!Tn{S(+5tXhn>HgE(H z5s)|)Osp-Wg0Zh`^l8f)LcZMA5OD*$ME;;;{@c}h}9Pox!LHJU5E&-dKV&uG6OISX>T-y(8S7b zk|M+RJ0NrC_spt{Ee>pkCt0zW+iBzbyOr-to+IK9yQs zVo0*T5h~-^MMX>@YF1f6UunCzW30E!@zH?OD>8SVnQe`i#g_>Y2Bx(h)Wag)t>mS zf%{3B%nuJ*?0`Ol%iu-P;{+K~HRJ}Hub+pal*_crYP04+MhMxS^NAPzqAp!Fb+#eQ zAzi`O`>;i!sMDVqa4xwg1=8S8ZB9eevHURCen3rlwV@2%ixtsZdoJl>f(W0L2A&DW znH_L@kzMo3R9WLFhcE31ER{f6WWOvJJv+_(8m;t~R z!<#A5Ons6`L$9>kIl|h%H{G_isT{68;j#bL%*>J@hh0!XW6D>X%Os_XgKrxo=vLw! z>%tUEaJ^X{XwqgtrEdnWq2_Fq{`*6=OWs;_R@T%}`1KNTEr_Ye4*6q|Y{{&6 zvN21^2FxN+3hNt857mum2Y9Wi+(;&SW_)d&{M`jtxy%3Mto@R+I7bIiN7`3L6Qd)W zL0p#*z0)z?*!?0QwhSO#x>$-Rzn47MllA+!=xSY3PhiLGJ(}yjElL{#-7tuICDm;^ z63>Iu9Mm=mM$le$e?*G4c@Te>6*=EJCmf}(Rwl=A?7uNg8eCOOpu2f#9ZYlUeZ8*( z)UNW!S9uoFm|_U}Q&ukK9%i!Y4B*SS2Be*h?TpTwTQahb(!|U8G`>S08_E=2KI4n) zEwx0rok*VKbFa<%CIxfr@cw0I)59JkR2#$Ah%WTLLykA$eV73Wb?%aG+!VecWie8rKGv9spo-0S3L5YS=}H$7@p& zQ6^XW+FTb!$9QV-rS@i-5*vN*wHUCy8^9(hixjAB7H8_q<334)>5k+FTIlSb4{W5 zyo*IHz>dtH;8{roj;^&^$FX6lYOZ~0s z|26qik|pf**r0AOnLqb|^5**Sy2xX!FqWbFY^g z$9FdqheP^?u_Vdj%WhmN1Mc+n`%^34f_orDoHvE1k*NICjCI0v__v@a2c z*&>sk>no1pV>_7z8AY|(4?ibRfxz<-7T52>)xW%XJ$d>4F`EUb6bPDfBbA7Y6l)R% z+HUAbh|(iO&+-j|CP;K;OlPLH;UFdLdHzT_;=SGZ5y{~Cd-ZP}C>w(HY8LuCqh=7a z7hq4fq2gL#z)|fMZ2`dP9B?#%0<|9xfQqzI6X_2?3vgwj{YqN9=Qous%k?7+5coR=))BSi6Pbc6hlZL5td9@Rw#Ne^EN2A_(VZqAA?AUX@ZfgBh%E$ z8*Yr_UZBC2Dk*g;{H`38c*Jk^#NyP6uTfD%p)z=}yy+;@mPCN8IBthFNt>9JE$y7@ z>xv}}elYi76+TYVJ?FFdng7hfiR&BE5>``*oVB3F8xG?Svj@^-32ls1=jg5seO%0D z^lww;SxlP;i0e_qoK=!750j(Hnh)qFcPBI*2`pi{0>^hD zyjn&3K4H9k@-S^?dkg8YYK4z$n>TKA*!D+h;BEfo&ecR88i$)fd;l@_g->t*%8vEk zblg~r92@E*I~1IL(w6WoBqoLJ-n{q!!td&$L7%Vc|1lk@e52YW7}FrMB@u&`ntL*O zcNao%Z)9YY_?(E_w$1Z2s`|)T1lsVMq_6XgyHLh*=Jx}O%%hC89sc%R2uc=4vY*2Y z*g+Y@adQwq@QB!j{5sCV+A?RVWPy>``wr;#sMPI8SZc`{;g`BFi~1MJ2@lzYoN>Uc zg*ik2b{mwMibSx(_G9L*J3sq|X~W^c{-v<()?Btk$To``gg@{?k1_vp7Y3w?up@Y8 z7ZPIggy$rP?Bf~i=O1?=b9)nk8{2c65|L~`fAAmF|DX5$okp|&x3MyI)7uf9lfQ~k zdcE({fde5SN1tl7%>RoBKvw@{2tFC9ed3Dy`)Hg0O>V2da#r{0{%v5P|12S&s+`!5 zC;I<+1oi$Vit=wU|IdD-CA4j&Y_54US75sI9(K_=McnzB`Pl&tJposQdtTKe<@oJ4 zqswY}|51LBCmosL+%T?x@^Fyw>o_~{evS}yLDdIbI>v8pYTlBSgjYcwrb^MRqqUT2 zn1s7El0^zRcYN8@)Pky{~(z2+Y?=&xcd%JRgNpT;?<3@iFXo>jTI=lMk0ms-h( zZ%iqSyO>sUlMsd>yK7d2m0RBAN)>JgBj)ojr@eOMD`@go*4AU2f65gu{?;pLJ@{%i 
z{P}`Fk@UG5CD$J5mudCTdp2+^feS|b2zm}V$eAr%G@JwFG~HxI>3eTX4xheSKhsTMDzj>8E41nHq)2C-KjmN{OpOJGKqsT)HEyyFV9LP5Ua6Vr~n4k0c8)A z1mnC-O@>W4vy^G(5!G-`Z%NGn;i=nI9u>jANVWAg`lF4^S0DduyyJ2q5EB74>o5~% z1Wl{E6AL0E)G2$wl~iWzIHtiq5^Kz>Tt_hM@lwUQ`6ERQlG_5F9KH15Ea4Jlhd+in z4tRj5Mk5ee#vYngq(*kw104Cl;o0^xp8Ce{c&TB{_vucZ&8cH{14`_nx7SWMOr5B8 z_#7A1xA;n`1hhrk7-D`PwuY4!&O9}bi=KsyZNc}W)CacK)Z1Ux;B(zdGS17^#E+I3 zb4nuS%r%0qYs8Cbxe6;h=9$#9Z`}bwv~7AVK%nw5{TT-6FL;T88clTTZ~=V<^lR{2ngQsy1fL8)#+rBwnpA6eMd=n;`OR&5>I zc%g!YeH5aktgUP?@$jwPV!~^Vt!9Z0VQ{|tYEO!_!D`bI*$|{Q-GXM`F(c5PB|!aF z;-&q3OGxxvh-vVjfO~Il)s_`a@=0{Qc5w)ILZoS}8y*gt<6zIu)kwvGu(ng%r1omJ zk4j-EUSY;-9Ia3Dh%VEE?fYeP-9XY_C#3S{y@ssd4=;wR{zxty*fXQyy6|53qZ&?& zl@0S^$q^0msg<9R?#>UjW8k9qnM$q!Zck%U-N{{anqN=rhLV80z4yH7?L$QS4+DF4 z(Afd`Gx?Nv`3Hsf$O~6&Ni0qJE_)lH41XO{i8DB*V`QIVUx2h`n|*@q5Rcy58!m2h z?LV5r|Jmz2N#iXTs-5+7m=s-v_M3%Q-is zZsn+4&#*20&U^QM9>J@@{!4`;W9xN^#GU8+c*||ck*$f5bJBU#O}5F0R_-iw9aI`${!;(%}$R2+2;h^HFPgOQmo5Jql(OS6uu ziMPMbuzNMHsj`_-hPc+)^s&ak(IbJfDpKy2`ujvzFXq^ksHB(jUG&-rT4HGo%w|F- zHb|HR6c6pn#u*?GL&^}|kMytx?rP~b2Rv3gQzs=^(Qo+OSLM?%#qU65+QU6Bk`_{- z=K)o^Ix7VaWgZ@5^ZbJC(X(Lck|R85HxjDEymfpkk3o7vPEA?pufDk$*J)lRtVhMI9SD6I$5pkF7|R=eO>BB z1#J7^E%Or%ahA#n8B(8JRR=L#CiWf-DR4rbKsnJ?$nyE6R>3~7*d|{bbU$F~yq6qu zY5O{!fK(dJOApc#aMUl>3}NnSPn28)f<34okcwfhGt^q*FY|_ zw6PnZ#QE+i9}g7Oi%Jx{qVVvpuq?i>Jph7!j2@Wd-LjroBjTQ$3IIhf9MhidLhfc7 zb;CCtgpaNQ8SOcDyWh+EHqCAKY~sqBB*+L}cupj|T@*QdxoVhc z>y_fL^fj*+SEmQ!ugiR>bXR?=Ets@;^HbQ-|HLo)ohM{26d0hN#^+m)J=8 z)=YMyD7nzMzhyG6{2G5UnbjOpY>80j6wk!Z2tJ|HUxvKpnRHeXT+X>JvFX7S@gb*X zU@&Ciecp4@Wm?szcpRGnXKLEHp_<70s!?A(_1NBtt$>RgqD}Oj`1f}AkH?JZW4IY7x{tGLmp0r z8~Szb(YV8PwPu>Ksr65;UDA;u+|?8&W1pe+!#pt}Oe$D)n#eRlW7XZB;#G$_<9ue7Owi36w27&R|Yp>v)NFoKvTl zJL*ws=8=GcdczA$uZWFRMMSlj0f|TS;QYeE8+j>)8Iy~~lSuFumO|N%5r~FW z)L@cus3|AYaM)CosWphT9;gmaxvw$1Y!P?Y!o~j24|n(8&)vyS=iiHS%)@K_Cu0aHWdEH~3% z<$EVP^%i;6ZVP!lP_6bN*&q8N{vnAEs_yG97kT2YaeBpR?x!9vz&aQwu*j^Ov*}y2 zZd)iWQlniEO}zeDh+i!yk26N?{r^har`w6=X*NtO@}vb7DM=#J){d+t+Bm?ei9YRN zIkRZVKDNZNxb0zFJJ1q@VJhvcu{aPgssy1e5isAH+u@wW5Y;i^Ps7C@ar^|M(!>?Q zCU?^=q{R*l9AG5qIOdl?5QsIF{Y8k)SDrL5bb^7X$$;d>E`-C0srIi4v;SaY{$D^z zi>C|sXpKOM593K)=z)fpccL~qKYhQOz?;gO=A9FM<6vl^jm4+`Fop1MQwsk(KS!*u zk6)=yX46TE5z@VVV3lspvt)}^h{PXNh8w2U(ewnJSDLEd=B7d;8;3g;-$~qYJ-!F6NZ6yk z!__h?*|QNuekACtVl(U?C!RviuxhyZYqsafc+sE4sJC6l+^Zj}50mpUe2oo#MG1(n zn+V`ObaI%TkVrOUOOEeCm`~H#ay)bp&dJRMz~Cbstlis%$d|)g2|`S>`FObwd4(ZV z)GAXqUz1(v|46}%f4F;(KaS^dqH?iBFe%a1IWV5!ik)T)Mw1PI9{J_oqwR3yELv*q$u1 z&JxXV9)7jPw*IXERu;Ch%3!~Kv6V0+UocPC{Q3JXBl)jps$Zj{$M@idKsB5!=F}FW-s61%v9Gx-3V1LRR}9Md+4}b7x)4 zJR=)Z&wOSRyDz)XlnokNI`HFRNQT*;bd3o@CoFCHoG|eS#KG-Ki zIA*O3+1NjCP+0KYCp~Q1!mGq(YNB+EPJv|F_oi?2%Biw$E9)?8NCW|x=fQI%XuDG% zPup0ZJRaqhztZTEt9(S+u4`=gYG`KJE<}^dMJc;`uL<>Zk$z$vUTgp_gM8FV5YXKE zT%)PosoG(mUW?N>U$*v2(EDRl*QJQ4+J%!R4+P*UyC00X+8!@2GMg-&YhqCjXiej) z#RgSqE`5$7rhXA_kqPox-Ld12ng$`7pM30QXUnph86qoKnJsw4tU9&3ogj8;r?z4K zl>8Wb&;F*#fY-xPgpq=M`5mX*edSE~%gIg$MAyP-=>Az4z?e@=c)+}o+JBt^jW#)x zzeJ!$$lM*AIlt~S-F3ND+4Fi{R0hx2-0Z79UrcxyRx?aJFmHbr%<7DFlc7;H7MUg% zBGy$)1`j_bJ5YSTysW&~nUopJ9s9#3&Gi~!JCAG6R7MVW%i^B>eowQH(epLGTvs*F zl;JEtXATo}3jn=MQCIdgKI$%HrFa*z zRJ@KaiCr=>5lJlg@$>$v8-YHAndZk9AwAkkUwc&gv7~$-mps^J{+ulxtWn1ib$4$6 zS4@F_`;2bdE+nkFKProR3=dto7klcFMZ@s#FI(#c(Yg17DX#D^Sha8q=yq0lbWqGm zh>lEoOP^0ComR(yvz%o^tOj@xc&eNBNRIH z-p`vlzHG)`z?$#q{-Na%9ttzX(&&mNV3>CT8Ay$>W@{w^f!eEPyO0a>q&3F2>MleI zUWw_ze1`oJ#mzxCOoiDB1zx)lST+D={sDv;`Uj@H0gRw_457l9xds@}2qR4sDx_F) ziIIf0X)MzjN`tNo@9Zhr?q;_C>(1~XOsfRw?L(!P2@+29e41&JepYzyO=|?{PhlKx zsyzRkSW^&VFwa*p_1DdoJ(T+)ggi_u40d-$_-djpxMsMo3B1j@J;=&w|8^?@y&C?S 
[... remainder of the preceding base85-encoded binary literal omitted ...]

literal 0
HcmV?d00001

diff --git a/transformers/llm/engine/ios/mnn-llm/icon.png b/transformers/llm/engine/ios/mnn-llm/icon.png
new file mode 100644
index 0000000000000000000000000000000000000000..824ebb7abe51767656c59a74589c2c8dc7799f6f
GIT binary patch
literal 370381
[... 370381 bytes of base85-encoded PNG literal data omitted ...]
zws<)Q#6#h0qYqhkGZ$9CR8vaez$#m8{7U2D4F3;d(D57r;OIKe11`w2A@x`u>S3eX zmq+AIHZ?YyNZq32^(9cOf$n)RJ%g>Px-xiJ+4Q;>ha$C}Hh~1dyZ=UJIFeGK9Gs?bLY>IBxI4>!spf?_h^6jXUINN*eb4aKUy&+B4rzi+?3; zpfwF-f?C|UloZZLX00r3pnHsLba0*}Yr!b^JDW4xCgy=+4-p{3u|6A2R*86{Nk}xb zBBE}T5>0?8sT0n7d)EqgjygIHcrvo|MTR}#?oy($?uS=`FO?woAupZ!7b*f$qwI1O z99JrQiK6n2S#Or>S*OIiRF05U*D7$f+S&sDgYUnw#37)euxrnI85~4k{p2U#l)=F> zyJt_I``$Ne9;`jI_a~F96JFg}**R(13!VSxukLxQ|G|ec{^r)-{Qe>3^`^V5F( z&M#k@{D*xX-+a@3<>TL6`Xpw^{~2pg$HqQ6{_)8tkG}!=kx&C2P#2ht!sv)&>|@vU zYx?w?DL1Z;uS>4qM#~QdPjTTHG%sUrAIFMvOb~LhGBUQVUrGq^K7v{CObnR0hf)D< z1aYL^Wt-u-X?xld+za)ct5)w#$0-QgWS}t5ucHpW@{aeBuIl#YM0VwsKV7+mHXFpL ze%q*fl4qgeuE}*X-(2IFE1UE7P{3}lU9orKFR&#C9TTT@F3Y7^Qs-;4&Hhh2z)E~l zMRvI5P3T+Si_KKxE2g@WzMav&dbotu`*{=Q%)sGd&g0@0%3*^gi9P`vKtSx@sn}8h z2~XfijzEGd5mZF8rtD}{6NpCpxPVyT$Aab?Wx=MrBc?i~b5@IFm{7zSeh^t=qY@!E zOV|ppPX~CCSg;}g0j?@r625#BbZjndVEQ5`T12^+f|>jY-qyX*GNz3n`EwT;ekh0} zNR#h%5iQ9djihe=#t#z6e%JI9w})d4B}enlxO^LWG1e29OUyr0{C(gq2i-ehX;4c& zxfR6hZ5pelMD{D>Y$iyFh%Gs=(6fWfNE^9!GW(E?IPs8SGi)a3GonEgq|4CzZg2cP zOqB7HpUFu4@VUKG;&hgqrRuk_0}{*e+%2_9ucUc!|D>Ls`Tq8sDK5*cOP0W{m@e&i z`Vy*1ZRu_A2%Ekgx8)L_=sdKof5T=lv^P?pt~>}uc?kV!H#;V{kDlF4R-Dd&*GCs; z6-}?hJ(On%UIPo7#O7R^Uf0h5XSr{QFv0SoBo*n6@s5;L2hxx+MBzytKDn+|S(N0# zHc-h=wl&d5wK*GQUcH|kYDtEdg7x-nij_|9d?`1%bI%CkoD6xP5Bh^bQnG+jzY-AK z3$4iA0CBudX$%@LZF!mPgKyd05|a5|qk_Z*pABz~8UHlT{7Gi$zdH7;mgR^*H?w_f zqKghw@PPFd-3 z>Xzxsnt6GEd}o&LjtPHu)}WoRg#I`&7W|XM*qCPvyFwRYwC60;{*IP{)HkEbq(c?G zBpXw#ecja+of(}xa_74|I)?EDM|?Xm=Woi*6fRvQA61T(vYgdu{J-HLhin=O#;M9Z za=qAJ6Xm(84upDxHTAV`rO`?S9wQ0x@a~A_K-UHLIBKtBzQc>!?*4`Ru`50oA&!bW zN$VV_bJ0X(b->h=3b8atOn{96{`IuH>byQoDhC=;JbQ3_otMXb;(PhN&zmyu701<& z^WM+11aF^-nb3WD=EXGlXq*$`B&ki+pXk73Oev%R;$c!&w>ypivw`d&YG_ov-BSdb1o3})>Yc|oZpxBZ0^8vhum zm|`WTcyN9`-<@xLG--Q+7b8M8%*)7izxOy885QljNv1iIW%#fY5k6ffR^kGLtn1CL zoa{-)+ATpqKK=@box5IC-+wK6Ym+`q_0181DV+&fLY>a#TD)t=X$))2#+qvI1fmSWgO-B(osC|2-=%4u!yD4+ ztam9h1qyX5(4MartR3))`}Y-r+8$E02-J9x0!{#tW_o_j@iXp}_L$zgZ@o0t7ziTg zf23=XE)eq7OUVsYTr%Z{tVrX1!5>hqUg!{kJ_01bjfH6bxwQnl4zOr1;+{vg>6EcZjJ&)G*BF#RO*yD={TeJ>bqO=>`x2P~fjzERw_P@?DZ1X>7 zJ!2Wq3pZ+?IKsGzUUG~Zy8_nmZ!9;h9p09>N6&4a!E;~%^bz|L$RUr)xSY2qYELEm zFdGD$$fVd|)%gR*^B>@$5428U6EQi0P|{M7kzR+m!pQWC4Z^Z8?MI&ZC=**Gtcl?R zI5G@;mT!uooD7yL2D2-9xfxinK8Ir3xq|7(mr}K6dv<=SUSt8^r{I8T$1bM9VQy)J zAj^VazGaf$L}4mPhW}uiBe!9O)5xRi1N-M=G;Aw*g#Y?2(dEnc=%gC#(1mWgN70#OmHeyRuF?*JNdAk*u~JN`}zjp(SvOU zeh=xSkH%Tg;5h-8AJFvpVb&(|%8yAA)LJyq$V4wU)}KJJv&&b;F$FSuYY2dEoZK*Y zYF=MMjC*#`j@lJ0ki2S`7?G=n6R76tuFxv-g3n`UHX81+_#8nohU_r}IW{`)3Rk$h z=R$6AY2UjiKR>m!H#X!$Y>;^@74YCGwrP{-SLiw=A8n|de&OUnXLhuAU0LAKHrTGB z2F@h!q3{AvZ1*kKcOqB&rSIN|zR+6D_&SC1kOUw_A_pK3xZArhOX%Mmd14{`AUs(y zz)^UvkaI?log8eW88gkNo0EZCrY+Dy`X^$y`tY5$A7cs(MOMCE1cvS%lK9syFhHJR z5?9}DVnj?%WHq6151is$*ZKQf2ZH_S;fbmPw?F9cZ%MCh-?u?X;tlz$OuM*25V*b0 zR$*4H*vrU-vVFRkW~r8yb6X3+xJ{Jn!VpA7NJ%0k;OFNi7oLArMW4O#mH;8^Bo)px z_UnWdngCR9>V3x3De4^^ZMtQdYwR!xhzYM2-}B+d&ay_OO zn2$pxuUnx%W(5tH63g`;O!T-KOLCDacvnk;CW%>py=cc=A4lO zzZFre{RmBz8r4V?!-|@S{6vDxWPC+DRkh#?c3K}2dETVEZ?!2 zvQ4MGvt#q-*1pATLQgaf#A9Dm-p~$pu6KUmFf-PQddjAww=OXR(1@`kHrKrBLQdU)I(G+c*&$@ z1YXRoi(OL=A+|Z56?vQgY)B2CxA^`z)-|Ew`G3ixozeC?ZqMNe?V&%OvG}sI5krUy z|I?3qCTDtkTbJr+@I{wkqX@M2i^U1sU)ZjLG>_m(i=8H@$O=4jN8&xI>WSmb;d<`#9NwsOzdimK8QdqflW66_JWMBFZMmLVWlImr2a$<&v4e2wz>*lq_b(13@@S0_0l9%kf z=Ych^fzwk%4CZ>CmZA`Ju^kwTpecT+%r+vHkvi&ON*~q=ppbyUB2g=yqLkz+V@_Y+ z%KMt%N{DAR$BWMvg5FG?$KtHA?|EZ&56rwj@bLBzZ%(qDlxT@G!x|cN9iW3&Ij#^x zEqHRl^VItdRWTR5O$D(zEPDpXj~D-$6p`a?X;XwbRyM%ogI#)2NXTt`PjX&gBiDn?nVi`%4Bt-E^6yY|llocb$0iKF6DB$Cd+blzc8@ra% 
ze6LSk1$$?2X?6Z+sRFGSAuP2Db%=XQWYGB%vizF@zk*m(djyFDbKerFjA#a-8l#1S z)n<(!n;v*kW{ORwY`C&)@j0L7{&msdd=~QB&iQI@acDMf>B~IGyMBiDZL=*Z)0Uh1 z+n5h3#vBuV%fv2=KeWH(TJlQAZPSF;s`bT$`qeE9+}^OSGPc-epAmZ8vc`%`Q1+MU~#{{84fiQRKOO$9~nqxm*A+BB`Nl!TqY6i>>X?_)73cO;ka zu)Qg%F20V)Sknk$ceEcGBnmVt$0!(b!8rH)`djh~bTCQ&mxJ|WG5NXFmU)}0j#}*h z1cwj`2_Qpg@ObK2GteB%vR#sWn|Ng&s`M%%p_mv?EEYNJUUrJ8PT4=#3V8R5RuDv52wp?7%sRSMd zylEFq7oZ(2V1;F2OLvb9n@NZqhH&ykl*va`kXM$p+xAlmEGc%oa2N9W-fe*ao z)@lZ3$B#xY9gR;NdAj%n4HQ2bo!tW;pKW_PHT0{)wn(<^b`0YKx z?@hh{YRD9Rq4A`U3Yo7^&{HT-^Qvj22&7?!;y^&_QbXq^CsODL<@(;_{Kn*HPa#JW z!hm8G2w#}7Ck!5}u46~S_d6#019-ynmh%y+Y3=CDh9v!Y4Ad-VLN$C4S#R05A(w-C z`_ng%+J-MU9nN!k{pR31)@5`QWHa~CamzDuC%iRjX|@0uVC1Z?`H_qQ3%L@AyiQi> z`&7Y!qE*NNjEMFTRE4y&*>Z}fWET3*o06cUwuqBZpYO`{p0rLSsMCSYD0<691>aEq zJlh2p{j#_A+4ITvL@Z@ zF}5h4r10MS)q1c$UmwVZ^t({K)uJaKGfiqlDHK&bUvCvmLkdbtE=(7<7CYm*xhqc* z)xe5n0h{oVQEKzua1P+hj?IHTlpVBm(hB6CCUat$DG(#zXb_4(qyQ%ER?uf?!D_8=cr82E6HeI*lHW z%Z^{mj77$pt7*9?0Zot?I_(9W*x6)g33-i`q8vt~{Yt>z)AnV}TC?f~uu?e1(2|BAR+Sl-{^LdjRSt|IhZTKp@lbKtfzmK6hORLy-k z#j4Sxh=i&ndyR~S3^r5AmZGL1I03j;`nqcBWgjAPE_r1WYp!w&iGTM|ljzcYql|bl zVdyyEHZGLzyhFTO2U2Q>yuWQ5UVTIx>iq21$WN`mp2)xDNIS$fNEi+Ry|7%21hmMo zJTN7YALItGoQM5;4j~Ru6cYdAnv32-f={Fu06W!7%Xk8}Og7jR^<9Y5)>mj}0V|ox zKu!fLKz_f@-e#R|mS`t?Jt-ZYguAV5eyR$dr}%fo^|U#)YkByMggG(aK4U(9pO^Bi z!5&)>2c~#iaKrN?(4rMhbLde}MDKN^BxMZNCFYF>44T|?#|vD>Qo#Fhfe^!peDVov zl;1C#Fekw;zP(+hx2!O`12xu1IT=CYu(cK%wUkdv)kfiUjBrD8lC7_j0Asswcp$QF zk3Yai5)c#FtZYFz%9bWM_=}>3`WjiAtYa2A5(EZOLkY>)i}x8z$;Cn>RiHqLt7hIG zk&&{nhi@#%ezA$llGrN5a+aTB<0>`70CH)}iPg05gFDc$MtbU<4w25hIpNSXQ+_d) zcfOh0Vu-0k4{cXNCX8-Mlu9CzINnvY_V%ZdK{nD0Fep7Kc^siOA48WswXR%-7dT7s z8S`0@?$F1B7*kcghSZZqu4JDU_^XhzT~%?+R5{1W63)ynpY@R@W5K zDhAIu@7^*-Z?}ob_<63}fB{2Er*<(KIO^1VME@@DlPfYNC1$}m=tF}(zo)jf1aPs; zcah?MkoSehi3`}QGL+^_E}tqZxp!)iEh2W#mq%|&^oY9c|A!-dcKLVj`J4;zA021) zDav7Oi6mZ%n2s2@EZmlSVZYO0{6Lxkh5=APEMPKKbHthBK>DlzG;cNWb8in{GDHuC z4>J}0=}4D#DP}MJdy0A=fETlaBj_#HbDXRYuAgE5>T%8&`OpayFd;QDwU<8jBPkR_ zTa9Q?%C>YoLR9N3-5jqY%>a(BIImcD(1cIjTQrwGLK6ca(V@=iVn{Ms)-6`p8!N3l zy*n`o^=I2Y?YBj~axIM!e|^;NyxV3A^%<@$@pRXY59-fiu~*Cc!f7hj0dzp(|}_l0iO9X%fY zXM^o=Px0>#hjL4`^{Nhj(V9h}#u~(*claLmHD$3845>dcRn}qyC-0>a``+ z{K}W&7UR&T3iA zOylWm&}p5NZ+P3zWd~mcV$hNoD_U}F$>~53Dl2e*POg=(i;jr$eAE+1X696oezlMhEU8a-m(?9`IXt4cn_u~ z+Wt}*qr^T6JkCzuw5l^B2gBu^p@ZRnjyNJq^Ln!bwaIR47Ta-POh2nFriiN!ohIAU zSxdm3&Uz2I)nZ2i$@}E)=GM6|=kev*RSYu^_*9wjN~#7~K{AFX)Gv6~;?WVVUDuFk zQO0PLhp%%PAPrS9t~mCp>_ra-%|mL%J=bzkvW$b_C|T=%H>GT|m_gEQEy6lmTPhkSE}2xQ+$%O*iNA=9h?Hy%&nN7Lgp7dN zO3(s~B*0xRnu1F*F!4lnhL1y47N11oBhVac5JI4m@|niqQ+cPx=6d{)c7OcMq0zv> z@TR=ZzM;VAbbBv5kCxC zi*1u>@_5QTQ?PPqyrnybzuY5dSUJ=K=XF=-s{8EJ$c`l??RGhA{)7Xh<|zY0EfAlz zgzhQHWu~fv+fO@3N5fGroilpIIpU1?S4Xqhfx>R{ZDAO$d1!2IM(=h}XXb@&$KTfF z?YLorybKyJiF~F8{%5UwYU)us=cA^?cPX12mn=dm?FaNr^An?InL$;M632_7Q9b>+ zxR7%n>S?B_s0B#C>H7*zIy%Id7z5m+$gAVs^SxijQb`nS;K}StL9k8J7kC|c_*Sg> zV}l#{R&g?(0jwnG;){-q3|5zNc4B`Gzc;fuY9^^Gsi4^L#LNpADpd-fF8No2tUnBV zlbdRgPk`=~NJ`x>SLoA31OC&rE%J0;Bt865eevVAdRxNyo;Nb>eec$^zGZ5EkoH+y z%m9NWVZ9p!&ZD0^tqmKJ!Co|d$g12R+9mK?y< zT$+SUojdYVdx6K0;J`_mpcdFeVxji6F4IW`pymsetXN!bAnoE=NIRfo?T~R1gigRt zdBxcmAoo&`fGlNoAY(utPZ~uu$seScvj4S z%ExPr=Ya4SU;vR~!0$b1Uy`LABWUE8O|cJkvxq`t*?>YiFW&pIuwoOtDoa+zr&T~P zHg!X$UDJ7A6@o);+UN2A~*lAyUI`cF-TsX3_| zOD}b&JYwmNXBSEj)1tttc$H$H!NR#C$j1k2*uL=fho+m+{^Pjd5ejJ9ffS6zAxn+jsyp)Q=Std_`Sw5LMxn+4K&D#na4GBMrET%*x-qPnBCQ6i zVH(BtNEIOCgOslE@U*DoWQUBYoA#aQK5IO!bIttQmnZ2HJk=Py;oMv~Cz)D`vi8jG z=7h#J0Y+&;rH6J^ZK=p~nC_jiY|Y+VM?1sN>8MW(q&p8gH&mLF18WIQi+;@iDLT@) 
z#Ghyx*R!$<9!d7f0~=^XPM6K_?a|JPM~;Ja&cHdm;v@#|E(P9C2H;Dtc%kxC;C0#I z#5CMTN}_yp;Bs@kuPT0A&l?DJ^5juC}w1N)tCCUE12=JI4q`Lqs~B? z%@9!aUf`M3jjpO8ubWSh&>qO2=XJ5ENoF*O05Fg3UNw4qwR7Y- zXC!F0F>n3-{HNIVF#Mc3@>y2o_lRzq^L{>UWsuU12V2=t=g=Yo&D3qimL$H@AhZ3vWF|Z$L24)l6>~h* zL%oOXyXC{uy5;p8_V1*95z2YL{XA$3nc(TP%fxGH-wloqvD#r9$9fn95YWA@!-4Nm zXNlrizL~j7%BbYo@r;w$)lrq`>Aqms(<|7P+q4ILf%y zUw_|^Awltic9q~|z-w*k77;S<3P=F1r~Rf<9FZv+Ut1#ZPAXp8;J2=v!J;Jg3&{O= ztk~3v5;jfD+xeFa$T$p?9HLi~(y~k{)Ve}t_D)A2N3@l0ndcIl(5dRvz{}YpCZW(T z+0Q0Ujz&;EC1Nusv@jAN;z^WpZD2kml?J=nsAZ|Gj-ug)*8(Dn4*+E?{LNg7O3rqh zL))Su#ZotJzM=@SM+fyj_>2g)t{Zb%S$YDm9#61dcjV3N;oZX%ZPt)Vy!a4QO*HZD!o@acp;IGLR&HW8d|*>-i7gj z?`Vs=Qsy#26PqZ`%Rqs7Ebt<~VZ-=j7BiJBNng3Url9l@_l3Chp9rWD^?%Xa*`>LY zqp}6|sn`i?=sY8IKJI-FzAkG|JG_)*slA6e`Jryt)?=4DB*A0m$J!LP+4#%}7 zyv3K{ie@zQyEtAUVv?np_a?9Mm8)-+!}bE!hi?T1+k)=Ab9|O^yfNqpMs@ znZ>Codh0B~@>^=!WN#dsd9K7o*#q$X4iw0|XJst? zU*DQK^*+b(@pESWK-W3tpDMPl{*!lq@~uY?H>F-UvgpqrzHqB+)X{o<*VDGakB(h` z>Dc@(lf0)wJ&;Ex-5l6;n+$|_ouok^7rLjZ7LOkAeZ~*!Lk%jmoO`wWgKfL887KRi zRU-{cj8{*Z^#Fr;k~fuXY@;c`s^x%vH%=zDe%Cod$v8z(MZ4g>Us3HTbSrD=qKYOEH*rFse3ge6Y20d@IlIs znE`5`hJ?m;))JpylGXhN=$r0Uw#jZOGqpdl+n4^JcHZ=nFw>Fp-||4DZ3!n}@%36U zo87|$PR$wlpx)W!&p6aKt!QIKnO1qi^w0x!#^rESn(KXZ35Q_!2#kao7$U!Ism(ge zgdT{?yB%UW%)JXNI{|_nn<+Ek=51FqC}lUI3_wYdS{(odC$%pYR887t^@JZ0Qar#`>*w#0QW8@P4jHK1EV$q)X6$XDGL_WG{-bIW-yk(=GZB)5@ zO=g%Ip?LVx%UnGP*=y>K<4!XaIb*v!Y)1cawRkKivbgFaPv|2~F0c0;nEGK`E>0rWUeegY|FtcQr0EL&N5wRFYe82>ou?As<+Vn(RpHPRHUt%Mn9b#C*uHSQ% zbnM3F4t`IVi!$l zOEWcsDKSxK?kU?lSEYAWmZQuPS;57)3F0a`tfk{$m7L%Hn(o?IzYeY@wH>0;Ut&`0DgWqfL{jm zPnZcNq}D-wf=~}vd6`Hq0CT0F6JCIOezJeUQYK3~k$GzL4Ba_{JUki=itQB>ebv_K z$P>j`8~U9|QY%_38Q99@B5q}xEQ&-UoCDx~aURaIZ5GN~iR>lWbA=laD+CWOLGEBM;Aj5J-oZT^@OP2zN zqm;nRZe3c)Sv+xVmm;WhcH%4gHM`)jS{T~~C)xo`QmtFI?7)*_+SLlW0(2IXg>r;c zl2`vCGhd)2lp!AFsTxVVA-{QSJ(j+o3DxOPkL-z|co*|9shKgA)oByuYDv)t`T86P zo=gpISRWNSVps1{UU7MuuuE#twuX-v59p#ho#ErgPcjbGp6PfVzW@%%%+5T@*d^)q zQ@;GP&Kk2ThYz*o*b}+)yy=~GDmN^bnH06}caWY|X5ml@%K}5eb09XVlkHgCzmC$V z$GXnJ6ezSHD4dydap|n{$U;`!P%iaMVPCBf$JRWUFUJPf$&x?xJF4evT-NPu9DyXuLnSzpn8;de|<&SbnUU*6Je%FbWn(hLHN z4%~Os`BAy;HP0`HZr=GQ3(1eM>4b~LIGFdNC1M;X#geyL(EpB2D@mW?MkrmV`T2OLV2Ua)yZQ80-FIGtalKxlKpQqpzig>NBs$iD; zoyy&_-j7RO2CHm$)o(_lA4l2N@$0BI0QWD<{6YIoOYw*Mj7v#)$HbM!!(Nv1=(~C0 zpXQ+{d6xl101EtdQ-Qc72T(TGp5SizcG5F}ZBk!@wQ%pD*~(^yQZ21>p$g-R4ecZp z((EZOE0EPw7Xe6xZzA^~gTRv2cY)fP%Rn*{lRdm7`uby;Mjh+qS~GiFs8nK@WvI}d zXQ#bL2KM|f(!u$z;-AP|6hwTC!LA6aG$C>*s3_GMm^eOX1*2J%Nc3#S1aGLh8h{}y z7y#WR4Pm|`)Ck5ShfvOq9P~Hig~x4O5L?(DKU~hRuONJ>xW%}47W-~=`i#ybGZQAM zI4B@NWf_WOWh#<0H;tKuRn{zst*^o`zaUjC&mPAk**dH4#HQ5V2hu3+5?C&r>tDo* zRZJ`h{MLpd?-j&&ME6H>7oD{%T%7l#vi-$BF_r4GLsPCt?y_XJJQ|6P%JGe26?GTy zdK-mGFC(K5S;`Xq7e1N+-H4NQ&N&qwlyyx7ybSc+JXBV-3o6spDj2+= z{c5j=2Y?Jh6qR8+MN5B;Ga>e@wTTM8#9u@PqM(*!Es`9mDb`g=P~@H`V9Cz-Ty=5l zU{EsS#hqPF%m;^3Dx_4AK7wm&%he_Uf`kJ2R{IES1jCs5?40nS@Ye9^;h&BiDPG6D z3_hCbO`L-f6ck=6S;0!cInNWMDKXKO&d;vXoL<_CrBd*s6gofE%)S| zzjl4j1#nd2evf~0GM9^Is64gQjEwO%=1kWq!3BiLR&pJ5#xR}@rM|>`yl8RePG8eK zvpH684%w(Tqi?o^f5zU;{}H)qjCvx&HqAErmcjOhG5nJM{h5chADR@bMC-#eqXgb6 z29`K4l3L<|N$r6XRk~{-wt(&9eFTf`Ck*bh^jy%^uWN|+rRQ=Bp;T1dtCT6d7vjbv zB`EJm=pBowgZviK+|3u;XUr*i*f{baU!~U1M!Yb`BN$peVU+Xbs5+AjUnuQxpMt;$ z7MqX}iQVzjRzc`+su0KRxy+2wQ%FRn{DFVS(M9Ye)2mdIFy7GxP{~s15mW${<=jED z+%Us8Yjx6^@b1tPv1~oT=GBcnZVdGsL!bOKbUbncN~7OhZlxcGykhq0vwE(iY*)=5 zfB&*9sD@2;$anP-W(rhC39N|X*HP}UKD}An5U6YGP!n#bWP-a4631Y&2uiGAn+`U* z1s7>BfnzYKDS^e zdtNRf+yg%Yb(ak>wdH+~ZN+xgjLwKKdKrOSs%*WWT;9vho1UDCzU;Ki9Y|+V=M#UdEXvrkV zNm&8EpGU9AN|cwPH0PMu$IH#il582<2xny@ql|ndBF4@h3|v8H2}n}lVn}P}V6w}n 
z(TmUQsrmKi5#OQ^mGyfMJ%lF3K? z17G3+_xYp*$Y)-6Wn$5!SL>30{KZwnhU3etR_`3S6%BeKA9%w1SDw}0G0AdUDt*{^ z+XVwEla^mxOUzf83DX0ifIti~PjR;d8D_d{kzKEpsP3(4y*YdmIy@FTiBceb`;($1 z?ZjKW-Je6<2W7L4x1U$tpP$S28PN{#>f`=P4>mD|F3L5525w&xe1lQGl%Tm}D$vp- z&=4gXCkSr^dX4Qwl}p?$F5oPvvoY3L*iTVF3?s4mn)yDD<~L{YlVmLft-WkA&^SXT z=Ddr-$7XUB&)`0A(s)x{u;1vy-?q7;e{U7uSjOO5BAaj)g~LNLKC{u3&TntZWcXQC zNKku-2xw`dgngCmYYfQbBO|=K-@P;svY{+)p>VG#dfu8dXkSu&HEsIKbPXrtg7+6~ z8rLJ3&`NE6_r|GcCW{_PU7Y=+#qZ-~!{z#Qzf+4mY#jYjV@vomw#V6q``Kvayz6V- zaiuA~Pu563DVWTSwP{k>vSx&j$f)=rrK{?m!Ely!d7BtJs$F(~j0B1Pl%#&ww5d-F z!V2Kga^vAR^oCBS#R_lu}66 z7S^-^RpDNq1N$@EAsvWaMrW*~i@Yq93QM8^oRNFyUhSKxy>fX~3zA_5#xCb8jIQVi z=r-h9x;Cg}w!VQ+GDmxKXVet^rOv4xk2ZKd%+mffJFsk`FTHpL`yq~#3~5jSzNjlN z6k&@3>4ITd1!tV-!8K6&R$evo64PaxbG|W^=5gVT@rA&NzAbRga&l>~wX-RU83O?< zFl9P%-bC4n4))ycdGIC*6xpcgXS(afqv6B0J37|G-SK=;fH*_f@}hm*sCwt44P`TA z%hJZv`EWDSanMQV?nt5zbLT*+^`8^u6=e*OV_$Mn-?b$&{GVv)na_0c5Vvmtx8j4P zB5mTnTN*13E3zw;cYQEZ0ZD>Ui$uiHAp#tMk+TQE3ngDj?y0sy1X7t0e2^GeX?t9# zyfSg8>{uwa%6YO#jIj%-zO>=Ot4&@MeVvp?(;yQK`d~@}9W6Z>4cZEtGAc9L69F)3 z=ztz|6bZ0r7~B(Hg-OJ_BjwPZ*mtG95fV&ji|vyuw$O1~?@;k?b)hPO(laktnqB*N zFF@0qPQl|B$cn5+Gk4kbKss$L-*OXZ8j}ZAoq)~Pu?V<5@R&3&EXv7RPA4RZE7^aQ z8<*W!|D(VP9s9W9*|;U^=qO(*3z)YH&35MdDQlim9r#!)XB!%;`lVL1fmI8{wLbsy zUPx)KrN_(Q`SALYI~xsnozcPO`}2#p$@&WiY{Q3Y+p6GaG3b|g)#0n2+bb3)G6rgU zU|ELTBKG*Eb-qo}shZ~bQWg*k4lqTR1Hnm3>O%p@d~_{LB{{=P0-oXskt^f@yPV$^ z5oD~_j=?Uitf8AAI~m($%b#!=w)*l`aYm?8YU>4T5(*e-mlXO%eMbYbg8Hk6_K-!4 zQq(9x$`>J;HStZ==vOH#AxkvzkwS;UCgZRf`2 zRWIN9d+(px=ic6&bo8G;d2q42=lhqJ{dVV{Jn{bAgx7X`{Ox_)UODq{PNNnK{;$;^ zznRxDqtTZ?>4Sd+^R?s7Ug&V$fnyfW20@uy0vE{Cy+ZpYH(uJEjPCLmOjU&?B{UEj z&>KKHlZD?~Rgk>sY+w4ioYC7qHAhCw(VN-M$h6|y_boS${!Y7DpNBg3J!ACdv^+zX zDF;&Fp@aOM8M&|NyQ~X)J9x{?7N4MZ^R{&+5asPIg&a*f>{`{%#k;oWuI;QZ^lnIJ?Rz#)TN`We6}Le!y)p&ceZ;s(=(e~x_7%ZhdHLG_%t zIXrD8bp)sET#+pVvEE8)YopS-%ih0g7;MHL3;mhp-!ZX*Bt1$77n|+Sf}s1`lnVWw zSS7CJ@mN;(@?Jl3 zY$`JhABsv0gM?o)i>rhih*?$#cPoxvdIjr&=sOpJ4N@w4cMh`Uw5a9;1!(YXe#l7|MIMCP{IX|}jxMALjW1M+rZ6wAkvU-o3j3IiCu zxXyFWC~v4z1g{LNO6T&$R$2mp@HZ#2^jR?`HmFg)&?jV=D?eQ=J5qb)h^dNNMgS7I zEtn1r3NE=?Ee8~;m^CUU+f0#?z$w-1b@+7NYSMvBH3$VOZ6+;EVz^W)q)|>?Ob0q9 zbJ;?XA)Ckfb5FF>^T~kK9#5>9C1}gub<476HMNyMbvIU0qZ%yAs8w~R2%XRrx%OEm z3Mq@%*0?{pC<*2b<1uOWyW-gSsy!2zxwrFvIKdU?XGe3rY+>uJjKI3tSV4pNO{XcVNGskOhcc6M&0^11jGHs|*2lD<8DAG&0oHh@J za>UTf+8_8ttjrcJ5j{%_93ToQmEm|J`HN=Am6SYED~+u^XoEQ~IXMPg!#fC}!`fli z6F$=+jHlAaWUdD(1g=cc3UD6PFrZ@q>xJJ#oP%G`$I{&HeU`+rn?owW{Eb!Z0=cYv zLNG>HdsXNC)#}i+$|2vyjt8#G822*Yj^UEKC-A1JvD}pq@Pw}beuaM(y)`h@sT~F> zyEEVk)A8*FTv;Efj{YY4JFu0oGyI;Rlf~1mk^bmA@Tza-JMiTM;PU40l<&uGHN-hW6T`Y>Z8%*nbX2rq|N zyF~*JcVF}hfEz+#r*s_*lu9djI`lRrj@m(jP&RN+fYu)uH^!~(NM&2V(JvRzl~bL| zPe=|>1ViSGWkFn5jssva+tSJPOT}wP)}tPz5x}qL2UPdmdG!qsf*Q$}0U!9eV|Ulm zEU!U^Zz4?J>5Jlv&v;n%(rSxkx~pkTXo(wi-7E84Immyavp#z^1eEj&fBL-qTs+-s zbye{$T2n~U1y$F~wmY{xA-~y4lCzDPBQ4HITQuG{+TG9+t_i&t#Vkj&eVP{`VBY?z zM>{gh-;~n#hE)&UOt?z$BCfG>nHuCjdxGRhUa1@F)P}2+=Y?0MJQC#LRrrDIWY1cG#0Rs=+&blBOH;i84X*7kj11zX+oaQ?gjf{)r#NA#i8izNG%`Rn zB&^1OPjT1Osw}3miwS)!JZD_Xxv~^mbbBVuHZpJqlPDr=(M!xPU_mjKq5>Yf#c6+d ze1_k_k*BH~`e~@l^WmN1mzbfHG|=}E>@ZfOOBW>^MtfRrs)9Y4@4^2TM@VL61e%%t z31btO_$Mm#dOzrt%6qBGS;k%47WyY4Wda*Ou=YJ|!TA0hWoA~-cHox!>u~^G1?{iD zJ)*mgI{mOG)Z+}d7PA$%ZR`$qj`)n>pzXFTO1atUIm4v5p^$W1*jxz1bdz05!vx)N zYr_yjm%27Ztz>kr*r)`EQ;e;=5K`t_5@A(=FqBs%EH_qKUr$R_OC0?-;N1a0TxbUz z%FgB1{;_~hsLe`;qqr1x%j6sufPU0VqlZ zRiOPF5G({clRJg+f?~jYMe;?imTJ|#LCR7AO);XNbpM=4LD4G2ed$GnE3Zn6*e>KR zJ|I)%HhPdF;ak@1XsVN?&RC%NTo>{msH+dGD%#UoQ^?z7XYE}DRi$hq0S|(Yq%Iop zxpkpgTcW>G$(9UcfF>LPN%1KqjdNP)4goc`VB=p2)1cAddzwUWc~YiBih8yl_paer 
z<4{4g)|5>r9u}SbNjPG>tNMn#u#qFR{d%8q24f?8Yw@R9(eAvEuO%EPZ1Okh@`Zcr zm|P$S%EiIvporCLT#7wpYZ?~Sf2%N(piVFnD zO;0SGHMF$ktU6^^R!{js$;qnh2`qi(?@~LtV6Mp$Lr7}940kx&iokEE%jxUJxbl*W zWgoUE%M>Sxk|w354U1i+^%Srb_X*jD4VC804|t<&{VTQ{Tej!=VQmxg=r%qKcr#g| zQfK*x_StADgbF~pyDU2yNLW&znZH6Um`3E31GTrsu{~6Z%~;nU6+mq{Y?*3BYU|rF z4i3F#9P{HxJ5fk7dW&nHeQ0jJD3G>50eVVoU@GFCo8CTK9ci)DInGeFZI~5>rC;39 zw-EbHvuW21&fNS%y({VZ`9*I$@2+Z#TW%|8cUMK@n&lm{Jv zXfSQ1M5be7Q=-SyVcXU#1A5#x#G(9g?#yQU&s zpOo1NVG#bf@?s&{>sc6ov-aBy`v^>svrMzQI+_ zKez8d-#`?}7!>8S-V6w|pnpw1Ypupp+dSRAOm}Z-T$peLNfyR0L=b_qSiIQPo1XeAUBOHcJ2iOl@0ZPB-B%PzodTY~2o|6y)wFU3 zY1rr?D55qDVo~IRKqBq0bPqETI**+beFDOwULClLRD=(4BSR-*qd}BP!N;gRMT6;v z>dwkvrWXL|q*uNo=5j2YK)s?!PE?3rp3~8Ph}EDs`#GkH#rXT8lTRVVTM3v|Qyd zWf75|={~!DphWUsTU<}=ncZcN)R~>5Fm?Ts_skF`3Lwb`u2F9Wop*bf6b+YHn!(9g zjo#0sAGln`z20j4k#o6Dn0QFkrA8DKMXE^Hp;eVUXrH_#z1D^LJbn%6aEF*(t4I&b zDq1f%2tE6J7wI^U(q8bW>7ua(pts5ushvz~9~G3XK&nW=tGHV`c&fC6WLVjI!FFZ6 z!t?3J!PR2fQz*Nof%(i_zhG5MS0CBpYk+}KgY+{u7B|PZzG8<>&J>-X=xk9k)RbH8 z&yk1o~q*TOu3X%$%rQ(f_%C_;+ifW6^mc~;F7P^e-MWvwH5S-i>egy73 z4_+XO*A(y5GuW%;ex0qJtE12yZVr6?&6?jkz`qNU1{sYB_X&|o>svNbCV-vqqV9#j zD{r)5CtwJStqvsY2BibnhKTv7dp=7hi&n_Yb0bH{)RgJANDuq=X=km#6rp=L4%#qA zrsig`U8If_p(T_icu2(?W4qkB09%x16ZH_<1z3LCaps(Vq_PCrrVHL2d4;x}Io6h3 zS)MG>)A(9Ut9MTFjj7Q)@pWGi3q5o zDP;*O_XOSqe)!_J+T2yZlq(?~(chE3r%Y|Gv>H&P7x}KZ%P|OP&XSl#v)7KwaLX~qTMSL$5$$l5TV8AI+BW-hE-*r01nV_RYnl8-GM|hMKW9{EKOykBxNU!pA}^xJX?B%qcYG? z@;~guWvrpvIQ0vgRKh>((t;wA3; z#t`dA0}p@xW_YR#@haV+y7a~ZFQ+OQwcq+EhGg3$V2UJHS`iJ|M)S5CDKz;qvMQMP?emCFvjbL?rc}tj$cedMkp=rj^YU6NxG%IvZ^qu_;6h8eW zC~@Lwwn&vp2K;IQmnf(M%XCs+0te#2(96zm-e|Udx5<rIc%`!MIE{l^eK3|4di{c@_ID0rV8AKvPU4*&ca`uI=kIf@)d< z>;thwPL=cl*U`asXFztML!_!wAq$B6fmv%FxfFSXsQq)#s-T|KB4&5YaAP9%h*Ol0kFp%t?Z(+TKa=)R-2;c-UM!;woT2p-Lc@rG?$`hE}d0jcVIBEo5 zgdHYw-K-ue*$GgFk{{GBe)*#z2)@BPk(~tb(<_;65fXln`X8LUKEobHlk1&~2i_V!TtO&ePc??fY_3XP69HDmOmEfW2#I{q20`zRxCXXG9@q*tP!?z07+^MnSR zAGJCEydYRxc_7~*24C+Y&DrnRlIv$@Bd*BQ>9maqwR|@oWQ>vG-2>-|IS?g_D9PtA z#(4NG&ue;Wb19K^j`{;T~^ zHIPfPwIfnZN(&FMpX;4#2QK04%<2G0>d+Bo-JV!tI0QVR-?8mvfJn@hT@Z}FIaS3^ zq$EF7WAHvnMInG)jMqrCz@S+WbU4TI;UIDG;qm$s>Ot~a=R2*p21Y*AE)~D%xeFGA z8xO)V%z30an-0R`W2gbZgSo(jfhn&SJx_@dC#u}$@&wd{O_e%^rH)FDTJOO$ z0h*Jk4fqs2DJMnO*3{;GSisgYSi%mr&J)*3lD{;iv#^ryXEFyQ0On=STm|IC?R9lN(xEe8t>#!#X$Np{7E%A_P`2l>*)-6qLs4ACg`M zWIhg;2bo0u@ev-9xY-6pf<#hNkjma_(V=HCH=)=Af0ky=x6ASn!4ZMNl#>=PY@ee!6SzdfVgC*E?U$^Yv?!8$WMN zzB3+rYNaZF-1S7p-rDm}tsv0tcmhnLAUROemyS`V^uN0>yq9tMtiUon%y?zN4&)!1 z=I8nsW~LIc#FXJh>wI1EOuydTKA~_OP9DrW;u3WmT(I1pcNWOdDqNd(Ob^Y zZ9~YL@%`e!Z$7zb)ZR7XB_oIT*2v@+E)XXT;vatiMMt-&J;hR^o_;jQPw4NUn-Lw$ z9%)|&13#pphgK>g8tFxA=9ImFPz%;DNRIeqV#hYLFt3@VWOaqJhpeS7%5{=R%g}y6pIIC*1)}Ch7E_8f{g@1ua43%JdrZf)>s6SVlfsY9a7B~rd zBt)(zCVnm6JJP7wHVeK-2KSr*s{za4vGcD~fC&lCGwDln1y==i` zr5pcj|CvcU#`tA{^IgwP^UZ;gX>$FXk5u@RdZ#>^U%c)KN7X6osSeo(U*x+7q|bX# zO<~~vy>{5nHUyI;Td`f!b1u(*JSq_Oy=d#D2-U^&;h0Z z;HXcYIj^OrwU7M)Ju7wCT!LY}FeOkLn#D@JIw6=)9dNhYIthd7mht;l-@8Z!G?px@ z;#8<7Ywb5c;h{ZAs^YwNONB@kVSS2z7RqhAe~-umzau9S5A>qOVR!yO9EeK+tAz!O zSf%v9piD67L|KDWkL)p+ynIkMI1ZVe8E9;}8jvaQ?ip1;RY zW$+3Zs0Kj50y?0^eYoa`KS>p}3q11!2|C7R@Mxw2C|3@#fr7Ui>924{Ae}&1ls}a* z$>#id3E6&?S+GLW0h_uG7fsU6SH2B!w5hC@T^se$yNsb&s9pMbE%VzCZ6jT_+dA9r zVBW@eP0cJ-e?2Y-WZk#Z^3XhT04)iQZrbA@=YJ4v{el3&V3GG!oYVFB++Zq>%bzE?2r?ZLsDog zG)o!^s2ZeeB&4s1|NI<lB>dvq(*pVHEHh?j=WE|{oQ*p;!vmo-`l&7YA^pV$qjqmY zOX&Xyy>r#}=@;i#;OJxI{8&X2>_kSsldB2+NM2GC@ikfuq=^8O5fo&Pi9Yu}?Yr>K zc(NEaNOOONI0J<5NE0=ee`iB?xw{|AtmP@hm5ez-H7U*kR1m2F%;XO;|7IyHQLsV8 zf|Maj%0_h}`CGK1%9*khL$5;b7gb``n6^_26oJj>cw*AdV?^2vCI{^08xwa@l9JUD 
zEe~mra#YToQW$bNX0qNVq*H^lkcfTXQi*@F2=M?{&x#MlvRIa;Inbt%JOW41FDI9kNZ@4co)~*e ze5x**bODvpobE{~#qnx#`^0I~Vb`UnrA}&pd5QPos>~HLY{L*_x}xD0Tks=8cr2}D zkj>(w?-@hyU9~NJp?y_XO6hL@gkUL6Ub%vK-Tie}aYRTo^Z`U6Il7R~?VyG50^%hx zbK`_Nl4SA&k5fzve*v4C%3x{BVr;V6swAZvtSA#n>!r)9n4&W}P_rmOR+UhYr)v@U zGnG5mYQos&iik}R(F$0{q zpwr4?bAwLq80G>=dEJLe0&U%Z|1K<|&dR$TS88p5U4yT&&o#hfIDMWNKl*6knLSfWgr&=ImS;vF9R3F~kRe z;ro4^P7sX4)J|;(kLq;juW0ZoJ+QwQzaS$FgbbT)`+xwwRs=5bst7`QDM@)hg}cpSJjIg zp*^e2x}A+(tu+?&!Bpw85fGE1Tgq(d1hAW~4PPS`9ItYJb-U#M#jOr5Qm zi{%XYv1HqluJ+xJV{cf|p1IjMD(87)Iqw%)#ozMJH~yLJvtV9`O65vRxZALU*m(i! z&zZ}0VkFvN5Dp#hO=0G;LtaQg?srUVRFMU$zPWNN6|o~=sJT($$T9WQUhf8}g11#@ z$AoZ$jN$?R29nY;Bs}2B7od4aQHgI{Qokrl_1sI_=8Djt$+N00jJyOlg%uHL5}$_Z zlu$EOq@Rj}F(%pMY0H2PltZaU2}#Px0?u%b)KD?{FpjkCV$>p$eN18oq$V{aDgp6d zW_5TFUO3ah?y8~26Cu6Db%&-#hLXisT5n|!8vEYzE%Eeuk6AG89`f`5sS6_*qL+?lb?P&4s_|Pd^Na84# zph4%TF8X0~!%R58z9bQ;^O96twmy~i|KI}RG#qxzthvUA-=NhUM_F21MC(PXdT?8dT=&yKMl<`6$-G&Yq`nz%m=tf(p{5zy; zR18%O+Z5QFC_*XPWB)f!V0TY7rz)_C5-WyYf{dZ4Xu|7|0VJ&zFcL)+`7_>LZoNu{ z1@=~S()n{%_Fa=Q^C6Zb&(Xm>^FA!f{d0_9XyQ*==Sw+|Itf)O_Ylu%6TSY!`d$VG zhgAchPLI;3Q9Q0Vy$Ao@>9Gl*M4z3eMw62d!fwj33 z`}BDSZv2=sMvhhnjlJ6~B701jH2ysk+?FC7SYL^2Zj*fAFYJMKfo%m11dc$lwcYQV zw8c`C5Ib{ld zcz4Ar!xVV?r>%mW{gLvF??rX%I~a-n{@4Hd^Oe7SW+?kyG&7-c>h7!mXUV$qg=x1?)xRv|NF$B%wC#2-t>*%4gARu{`@C~KbijWiodSd{PJH9>}iQ>{@d{19SdI{ z939QgZeM;T^W5Kvv!Rl*v3QQCU3iS%Nm^bul(Mepw6z8J=L_|r@KxP+WcDd=U%9Y_ z#G1-XR5dQ>Z~3fia zhWudqy42n&`;%v8?hIzqoyvyS-de!61zqE+TX98S%e^D^5sxvOMOFNHK^u^u-afTV zpNT|24v9#?v*VJwraosKJ>?1iS{EHz?Fo%UNAsdBHrAF66$efPs_(EHsZjNtjQ7rOqBFq^RoUYRVHV!Wy6IE#)I)TFa|e7w z8JKtU^|P3nMTtBw64W{2=*#-ezsOErn>uOhU;vlKr+J{I9S5sZ?x!jgkp`ij@*UZz zTsjG?@*fB7DF7EI9Luv}h5!;S1to@4L#qW$wJ4ieVW(y zh*X+`=VWOYz43-p>Wc|8z_j?BjDYpQ^Av82GOh&hl5I#t6*vdd(i0UEn79!D)OI0; z!1|@x)vcxP$GuR5Tgj^(D>o){aQg1pIkLd)L_jb5U{0=KnP2i+h95X5K0mwXK;%V_ zHFzuf!B8~IsrjQM*VlkiKl(6=Kxeo=dh5!Kn9KMB7xb*md|ik~OIpC7JE)DJX|@+7W$;)8 zJtfj2nSsxPuV`VX*dcH$2?}7acn)rD_ZwWS_YstGXsfKs6mM)rcGPxB^*APW6S*1F zBeGI0SsLtJbd>q{;t*~DU;j{<*s6eaP*tI;)2{QX5arozVQpx|7DwU=d}-^qU|&1Xcl6Oyzf=4?6>YqEg{Ji9avlH zo5)LJc>WI#Y*o=)B<-=z0@k>!(re|^7|EKdJ&FNbpMODYi^rI=!1D6bq zqGWBRE0pCM2=Dk=3i4WrBH6uC&p3+-Wg88$+}HNK_Rx7&5LsN=QzHW-UDn`1Y`X5= zS2c%LG?A&Iuc1KhXa_Ik6z+zi`Nz{{q$aU0rsXqme3@O+nDAIoU%jBQ`);4v#+q**#Ny_>(e9)BivtC~ zYsJF|;983#&*jbA2wxT!Ry#Am$@j@LTo|XQo?tZ5O18n|K5xw@U6hN#1AUHr8tvxH z{F|3Q=K{=0Zpxxxu&~n=2a{ufwUj&^VwuNi_-$2{C(i>U#q5Gu7F{`=KnPB|B zD8S_IT4G`8E5Cd>8pKbD%V$ zm?PHW#RUclw~4n$(8@Te=Rw`5GZf)g6}HmEQwP%$^>)%joSw}tbC(t;WemoTQbMel#giaG9FJX`#i;t{oub&_$U95_6&`DfT17TIVnQ{$US0(3id*p#}SuDG{6v>wyC>ihIormprZq{ zyuT*de~^gNF%T)x+B_PzIjJW zkF2q)8=FHX4C0fAfR0Aje{u6}3HaF#@GNOb9V}W<69%R+vH=7NV{abaDQE;EIEtE=VEU28H9#ag~^yzG|eeGyXN4P_szwlw~(Fwag@B47an$w zyk{s*?!Imcz6jq-OM@)Rt{A4_rBn>Hg$Xiuu&$eW2#Y>~-^zIO@$<<96?0f0RrI$#66WaF`313+kV1swLBYGRstWmqvP40%Bpv9Pb{)f0WeJS( z50&Mm?sw$ArAFZj+s+#C;Ec<32sdXfMUB`rt9z8SJl=kumjxLjRS@Qcy|vyZ<_?si zR`No>bzM>4n&ZjnWtfO>h0yv&oHpfp%G^nCN@%dt1_Lj?!lI=6(T7fG#GkQfQv2)o zCYL2t6--WSEc|FS0t`<$ED_T=TpfQcuX;2>MVg`&fT< zzUV+&MX#=hc`@VX_Eox9 zEsckuk+is_Cd)sueXYV|GAuM|%M(N*gEl&_4`j=d=rLr#0uX{~Z*3N7LQv-l)0}uA z!2lq|jAbb*Me_!m7qXB%?hc85g0$dY4F)1ob2>v#k_arwI0U#hEmaZ4UFl#mpA)%- zzoFPO8Kf4b6w2pe##E*~I8@OdiG%~$pGoa_i4~Oo0kf^ec%yC2pr}ptt1=-hwg+n3 z?GgzY#&fO2Rx0o!#T;8yr9156SjjmI*O75Z1bAj-vJPL8mzwCW#7C(YQP(m9X(V4S zGVeqp4paijWtG9g@L`BJ*0+}FdW9?fLbz!?C zNJueCUoNx3g?tBvi}K1~t%de#sGdTZ#dM62ziXKhz7J1B+ko+xL&19@znsKFT-CiY z0j^p740}iE`Ld45;x-gFIg!2Z>2|8raw=2ko&s@fU|Qls;?Y>I3}rT?_GD9$R4dbT zxMw)+O~CDcQSa7N_@fpV6qNCgIW8CVPI-Cvy!~gx{x&B&!o#*mv$^=ASpE2+C;Z;k 
z=qOUP->ORT6DN$Qr=d}3`wv)}3I`c^lvQ&hML?N52*kk0Ur@Q|+vuPV(U zf}XJ0)ymX5p*!V31!a01YZ549Ox0O|x}kbX03h6XUidprlsWt8@hq->3@ZQ1VLca` z%h3wXMEuhh%NR-7FYrPH?OB0R#YdAv0Z1VKiTYFO^g<}I%mrbu0*Wt+c5r1-&=~~y zY`}Ybd+MvK!IGvBkSlnFI3}Y5vH79Bd%sMiNPGv0wj4Fsc^lkHu|?jYtrd^`)uqHMgr`HY^4Pe9+ux8bk};@yXz zaBw%51nq6=ANOhNd1jnmdPRz0c3+&;w4&Hqu@U!?vIUbG^=tAst*f&IQK5mg45Qp# z?Tnal-_lMQ!`HKl6CkEtbw-V&c_AHO%>WlGJNjny-)x@^g+`-4TzXdfx#^+I=MK_; z#wGSXEB8m-T_#XE) z?xFBL5JW?XO^&}%EuxS`fKb>fthDKS6l?4;!YVOF%JN`MIbktSr`Ap>HyLmT`8Qa< zg8TuRs=`2p+l&=&?x2EVWW^c0ZPA<2M2#vL8W@>mBzr zafk#87f&a}85WOpT!~_4LgnH|BiwMyvqlQYBjGD;;Z)kT6qKMRxGdGXse0BjDrZ7U z;iYHc2fhw^DmQA6T(LGdXt%m5EBk6&6S@hdlOL!vQ1|0&2(1rc0K|@`?7Qb=PX)57 zZUw2+mb}7EO=v7%++H^2`WnZ^N0K~|ciH%O1=|wNus=E+^)=gqQKs(QWVT=sAXLki z%LkU$6S)iTv$CvP*J-!zzZ9AyLvUoQL z9ax_kH!6Sk^scr^+Vr5mf?sY0RT8`WCDPm_#hfkUNY>H zJ1|ZV-8BbAs68F`&{}v-h@q^lR<<*V+sPh?qO_5N>flj>yA21bS|R6C~B3`sasx#7DiW>ezGK}~v~ECnXj zA06ExpR5^f#F!l8Sbe7d3UNy|td_29pS^WUkMq2(&GzYlxi}mh8I6YGRZgpdLzrS5 z#mnhT)EoUIm>2p*OXQUl2xg%DN-1?9^N9+-%*u++BA8$kkCgdx9_bJ(mM$}6OtN6xC5F-20{kT}`WeMc zyAnlnX(%5TP|G8m5dUiOd~AT18!IFk9f%}asj-2KmB2HmP7{)hJbZjD#o)Ms^A*lI zH&B*EhzbT&gHn`n0phYh6rj=c33Y})suhb`Wvn1$-Ou-A@n=-3z+ckyiJ2x9AOXh4 zfSOl{y-Y&%-E>(nP5hbDi@Tx*+sk<4U}+~5`o3Crr#%L=QYp3u?kh`<%cMv0w}doK znL)*iGAAUC;(1l-4?s)3rrO*-#S=z6Ch%$03ThJUdCB zzidtSvdAN&5?l?Dh&l49syNlXylB(TL_cYbUX+^U)@#&l%%dLM5 zc1B-Yc;Da59ZnH)LeaL!7B(f^I;tJjR>|WQ!xY|#x!Gb=L&44R3|_MASq#bBI#jWb zga)k(vaB{4cFEZtVj+A+XLXkFIV?~?$2V>~wjT?t8cGcl+Nb3z;2%U3VMd%S)V94S9 zATQnZ{zdV=9}3KZa{LM6EGjWTWBw)NxT!L9g1xO;3^K3$--C`oqtG~bxkL%v$Y9KI z98%a^*?V8oXb@5(TtLdG&DwK;?-EJ!uno3uD_$d^8!K+?#h{&#S$JVi%7YWmQU`8I zpl@T+(=WAfT{6s&<&gZxO6-E|E#Gk)mDD~mi1m!oryGvH86Ww5OW<6icc#5#6MRND z{lrlr!1vKa%3PS$#C=;ktFI;VRz6XkHbZt-1`B78&LlE@BUC_@Mab zNjriSX~TJJhZt_|DPuq3mgBrS5ADZmu8FNMvX(+w(q~B>avwXfAl+*l1?dn1P7-le zf+`X19qyGrqj#lLm?zua8QS;R5-B^k4)EuPWMP)9%Urc_L40rR5M2esd)V;qeH@=y zb*EOxL~X*DI+i8}BR;Wmt&wb>jI|}yKrHbC%w#1?dSY zE8EjzCv}KA2^r&(?gt>(JK|WDHZS{^Z||tsg(wcoXZn+04LNULY;lEte=3L{{;)Z6 zd#H2dEkkxgVQ`B7xc`QAA5Qp!)1V3E4c2}E=nuS-A&Fn6i3^nYm8Q^BUA`ncwHj4; zV6!0d5)3W4$TIIDR4+$~ASCrH-wEqH%=F-a^{}g0t{^?=Q93P+84rcJFdP-lhQVSC z+2(0~MYpKzzIp9gbLKwUbi#@TV7pYL-XGH9^P<8gLD(&emO-%lbbMseo-CPzb|V)| z(iBfk0`>x(_7^%-HOC<92)3puHfGgrWB?6!K;PD0;Feu|0th;N~8PD}Te$pZ_r z#RC$f66RY5x(AW?UG+XqZKCf@sXeNMq+oYB@CczgfRDIwt{{3b7m!)LWHrNB1uH?a zp7Ira+SsN+@$iOTMQ--HV8+TzxKg;Db+9@o2&a@oQI!3G5BLe>*jB}-3(wca zB)eZ0wa=7RCURj^VOSo5fO z9h_k9)2#P_FZ^!`aCC{2mgT_Mh95|I8iPdHvumpV@VMC-{h2TPuhISmwiVT;-Inr* zn4NqdON`6$sG-8NbVFtnJQV+7!(K{V)Nq2eC2}YDbGs*9iwfilm@PI2fhn)I%*)4- zNFJQ>SU%jhn!NMuhd2_prY7eh1WvE)$(~!Xwn4kirg*WbxjV?<_Cnj(a#EQpuuZTt zbH4+|CK<_B0df@G*x7LB)vuSaxmZS401v%Hf58iRGYUDA@-Of*I1?FQQe=3WaZ_Xv zrM4*?04kN6GvGg6M&peCi)Ear9F;wh8P6`_sbR;aGO;t+%jqjAMzK|vu{j-1J&1B) zT~eFOto;tg?o}!qlu_LTndVEVJku*xX!u>vk!Vy^RIG%(v|;rpgpWDnLc^PqkEkBS zn>=gYKAr^O!z4vXP!MZ1cq@}_Shk)PMTsTl6u&O4)-9nJb6=bPaZk@xEjfZU#ZsEj zF?c8y+anyeNN?eSBN}uCx>Q~|3tn?f>Y%#=elN&9k8YMixR=LSt@V_MM89OduBQ~p zam&#Uk&+W4CbJrtaKiS~PF*baqmg%6mOv!Mm8MHb`Mtp$gxq)ue8oO*d>g@W5O#}}Y%)a;Y# z%S20*NL4|qmC82Mav&Hc2Xbv3Tuv-|_c4r<@YEDV?jOvDySwI}7W~q`aSp9ajCU8Y zqu77LsuWE@e$VrB{8N&3f?ZepH%OuuE<=E*GU7$oLw5pr=2s&`X?s_RQK9fu*jqU9-! z!vSO6Dn@d>InYfG@k02Qwk>Do!X0^`RviY;#}g2u-?&jXw{=p$DkICOlCO;qWj)BzwXbeOi?aVnca=rIDL~ z>l`qUA?#I|+)I?UB#es@E+@CT+Mr2RFw)s_^QK&;Ym+j`>O2Gy?tbK9Bv9cAej3$! 
z;66}a{h_%JdvB#E^kdBeu9a1FT>ZO*%tp`8m%0K1*QKPY|5YimvEGHaks9{J z?xfiU85fxhord^=FOhMuX~DSf7Qg@vT4K8?CYC``9A1KpE<3Lp9*${ApNKu?uCW8= zEB2t&sG~0QheR>=4un1;XltTw%*XZ&bcV0x+efyTLwN})-hI7SDYEA9W(cX9lYU4n z+Nc1KyH-;OVWu%&`E;<#3Z9uK<#AY7mTW2OK}129hCPcqG>-#|dp2`oOM<>4V`&~^ zKgbJQ?Qr~Ndrz8b@;^v;;A<{G6)gjkiFuw`{wo}pB@NMsn>Z%fDbzIMd3f}DlE6%g zIxDKiUv)@u1qQ2ZxD9%YtH|HbNWf`EiPI<}kO@RTK#-9T1f z3)UEJC8?1(!s!`>K;?2}4Ew3M zGbx4VJGaub6q@d;P>R1!ihJl_q{@u%D?M;HwYlofsOp!*LvcX(+~24Tlk<;q!gBYj z&eG$GQK}odq^Q;!DJ8j+VZwn-vBlLSjDMpT1!;pf^|j&g-R&fkMj+*@PAJ-*MPKRcvsvdvG4ZCehOsKjQIr0Q{AQHe%_rm0z|F_1Q>&yV({*X9BG7$1a8&!D%VgO(A9Of|?-jzS8~8Jc+W_>@ch2r6ybQr>bR zX`TX(+Qh`N_X35jyXTgF9$Q)>BMC4Nt|9H6DVcK){XC>K*?E*&rgKyfkkhde7Z^6f z(>^mknzfFW1J{mupB$dbLkukq{a3Zp3!xc^;XnN5Ze&g56NT^6#6Jc+Pj#iq@781z zMG74!XPo`$|G5`C0Mt9Qr2H`B0FZhf_mtW^ZJ$Nn5w^x6m$L~zzQ_3}kpA~jAkhcG zh^eg^nCzhNUyU0o&!Z-RQ%DAoo3F=Bf0vO-a~@Xh4dpFe^9Ske5a%yZ>p1vE78-^* zhNY8oyOZ{tVsOEYX{pMZS$zDFZHMvWxFb9bR`JIDgVe0TXs*!5`Q||Hwd;M6*Si5x z3vDT&n@mCJAc}g|4ACC3N(|X)h5Sp!Ic0;q?o!Ost#`se+*>~&^oy%f=~51I){n1S zwKicdwGCVcu^6ltum$?36c6)pWhN56imb;(b)v?WPjP}7!~XNXynpre~J z$NlrLWJm$h-f+}^Wm01#rxTYJ>`vhVP%Z^u;|(LN!NNaen+0q55Bb9)eUscDnLer& z)EfRV+wO=@%ve10qX{a9J~dB1ioWCT*l$9K`C7Qz-Pa`SZ(fGS$1a;T*#~I&uXVZ~ z&tEomZnSWV7jyI=Kf0pbZ=LJq697r_afwCc9UDn!NQ$Yg9UM z}feG62;Sx(d0h&TH$;V023zFS)B#NI6nvu9s3EbjG{4gbfB7xs)}HAN+TIn%J+ zF+BnAYNlMeTgyJcUK`-NNn4@p6vh*PnD`EG5mOF|Vce#4d~0L$n=B;Hh)8!mJ}dyb zFzVYJy->SIQ*iRTyewlgXBW?Lt1LdFf`W-N;|5LZGkex_Yj3WH@UjMfW#DXPTAGD9 zUwa)$XD4d>8uj9nsW8lYD-giTTy9iLl&~?N0Ln&VOXs*Eg9&nDBmq#jn0qpX7iSX_ zbL0DUZY?b9%$Y11?T%hlVw)hHI3(x z?7Oi=E{<~w<5h;q+0F|{)YRB|(-V4V0URf)SGS;QQ+L;y>JAZA--G)=J%CaU)FN@! zkYHb)s-+x@oInj|1;)a_5fz>nC2>}sv**dN@byb=hj)DS*8>ATwXM7Q?P>p8{+dav zzVp}Lox0&Kzgv>=mDVe_q1w-}F6wy6M+j|4R4jV<%E>=T3S5`+xt_ z>~s6S+P3A^ui7^4F$c$fzeh9tvk%`MJaO{MoG)%C=T^XjwceQ9mN`bc`n^M%SME*I zC|4kWXv0IA(B&}7)vJ2j*sCyAg9D*Y!Xug;2m6;quVd2%tm1dl?tclE>;&si2t_=G z+T~P|{t@RzVW_F~CKMzPVxfoSSHsr@Ndut0|DGKE<9l*d-V?2HkvRR8`;+njK)U+g zBGH?t(%O+l%s|_N%42yEf!(d>2c(tYUS(XOoQ!f@Y$*j+(*`X5H)ql)ppN&kx)tah zH8N?-QohfI%)uy+H|s{6qse&{z?~PBK!P6x*L#k*IOvV_ye2bZqBjri3kVnreT&y* zhyyEki1xxLh%U;aZ}D2F64eg*C5GGFd0Rq%Bek*&QlC88)oK(;N%}J5rr@quoNd?6 z`ty~?myo>iY<27;5as)CNrJLa&kq6BgRed*f+I>`1?MK)l)9OatSgKu`Q0!dmxAb1 z^=`CucX8p3Cw0*296gNr@ASi`taU!mR$Eh=XY0{t>UPFW?7g;=fT4Tl(5G!>~%vw-#vLfFyp~Ji?)^8GRJn2EF;NYiu~|XbZpWT3d2Zd7kkXhXtrj1)vz$ zwfyB2gN+@HMKFt_JgfQ`G_DCy`@GrZZud8kB|hu$p)6ZdQ@Zn_=SFN{Iu9c=zV8Du zo$%AEtn@Ue$*mp!m-j0pk$wncr5OR9yaqBHmbfYLLC2^!ir>ok6tuBt59Y&= zMto>4rE>;*kop<8S55O19?+mnQ@Q84Dfs_jR2z7abz=GW|BIKe)}-6q{mvUMW}}_t3V{;qaJ! 
zBr-g~9{lRlywElKJ3T!O!Z!ii!cXDqIFKAPQ(MiR_ zO?L{Q)|hgCb?Igr*v#y$?hH?g^4R5NWaa=0g2SUu&uA=iuS;L1YvS2)owDyIRPYYd zLdek|Evf_+dGvyd!u%em`p`t2l3z-p3sj+Qs{44lEG*SDoge>JZW%%1|EQJB+{CA) zziSkzUa*XwR2jQalt~1LEc#~KON>(4uH+gsG~eD@=8hb=cQyD?>DYJ7@9Q0plI4htV}bzAl%?w|+^z+NSbcJABG;Yg1bn@OXpb0M zV{6N`K>Hjgvs#*TNPX~9_?us{z!{SFJJGYQ68Tt7+Vl_I{m#~&Etg7n-aQn$q8V8b z{xXfz*fC%Z*+6Uf6K6OkhS>=-pAV?+s{QCjBLM&n6x|Qx1l4u-%^J7){It-AUDP6wxx~Sk`4N z%cEVJ+*nGvybBss&nAI_`_=6Quu`R}aI}S#wG~>dj<~}~bD*nWzY@Sx;(0YgwRuRD zVwY_N>MSK6bu>?$%C-P@;;gzzk@2Z8ZKn*=RfNi-wwFLO(JXVG8uhr#7Ec|H_|9T% zVhHpxB(g-hRLK!zMoOWmRZhw=yoeS}8mFJC?@jP2yvB}F)1ex;4tz-G3xQ#KS-Z}B zt4V1rt9R9%T?JTEQ~a8?b&6Bl4um~auB|W4t@I!z=7-&EiQ$!rbWAFlFS##(mdvRG zaqbeMV|ur?Wy*(970alH`AeB&R(9oXh2Kxx+JhOu{z*#=E3B1`o16DP>y8eN*~6~U zs_?I`cC>}zl35 z=;<`IR*pcrX~s0($rGj;s#z=fL;a2K6dI3b9HiQUEO%O_Rmc2sOiZEBg4NNwvByjMQc&tFrp|B`{p!0 zIkA7I#oa@2D<>0317#m97HNTU>(T<>GCHnuEp|1?+zOIL`jHNH3Bwpw{d`ebn@KCU zFfZ^e%pvH^P!mq4e&r18xx_T=r4sT6MQONph)_dt)3kPN!eC8t2>=iwsd8tD$8m=W z@Q6_E>^Quhz@JtAWm2O{(o&R%aCURF=c#taF*_J=2 zvg^lDvlm@vRBLqPC|NzJ{3%ok8@#iad_+M@XZ7Q9ZB0c=fxB5dO(}RBVS3!?OKoKe z%&*q)Z#(kLzqwx$Ng*_99?7u}e-XX#MpTh44QK3D;&UT7*k17Va32|;^(WJ8;Z9Mw zO7can$K?+7MYFNBNqB{J254voW`!Yg@hNYGT-ZzfZBY1z$tgW1&0w$QS-3n#~w}`guHA2DyWp zYrL~|RUZ`L05vvS>9V13f^pM$-PeoDDX>GN(n{w@kw_~F4t{`yEE(Or89b<>z_mAc9g0P6K#J(Q~EQ!d&Skb%mujrq{@LT zZRo$qrgHK_A-e+(@F(Y>nZ|lut!}7kFUkw#ZBF9?k^e%jQ$L*b%pQg|Tx|X=xB|}~ zBywR@Gw&INEp>7FGGBaODWp&;%c-C2@1ATb+P;BAQTA6hYu~hE)u3k!q z+K!pREx5XtaN==_kHv1E!G(6;}%-6ZZz~%(=;q8*DiL z0Sq8I@-%;P|7n8l(q2yQzmV=df(661RV4?L8ISFobC?jR@r8@@JV5`pET>TUzEDEF zG4@SE9(*^|HjVyMN)OM+iyJzixq=V**EcJVMXQtTdboAesD&hzSMy4y*UifdSY;x@ zH(Zq0-&o|G&$(4p!3@>gj5nk18?9>GAlGf+&GB;PO$6|~C>=C~>iNf0H@J7u2SGH! ze2V=&*mFyoAO5xp>F};tO54fqzKhiPZ&G1W?-~xq@+GhHp?dF{#zVXfF+P=4ofw$v zTtd;K9-}r(@C)7~(0ghr;3n!v4(Allk<#h!Dq874HdyX7DVy#0eu>$DZJz&Urn8ug zQRpJiT{RV`k8?Y%o`}?zG`??_8+7!WWn)O&1urhIcesu;u?q3|mC_~(+x|#8P$YB& zb)-PZb66>+>hEvew)`;_WOTxV73K63E$<`Yq8B{5XRWAd)c!` z4|hO&O-&(nLima)g8fxS_4C%{fh9$|Ur6`2ux8i2ud18E$2>vYUxsjnUXwWu9R6A2 zoAJq}U}ZZNTUa?Mf`kJY?M{K6&NF!-a}%@xL4Ra7G?r45nwGL^JX6#@(o9TpaN@iG z=+Cpm-bm&%KtneGf+z;Lam9BGl(rqJ*eU5D*mJUgiGhuC)Efsg8kv)d@bLtjRZMlo zI!C(Qk5!4*xP}g2Xio}=Kq(lY|66QnSac=qQvwZ`$v!Qk1<9S{bW!g<-kzes)5;m# zKJ2$QvGG3XTrvsw=W3xqHl!&F^9$xu?8>o}J(U;Bl$bx%cQLjv)(8}oEC*8#RH&|2 zMAYshb);P4yW{|aCU&5?Dr56*Pyis%f`Yk3Ilg8}G&C;Wc<`eRUC3b%y&rA9_f-iF zR)9K-&EZyaFwz`uE0Z1~CwwaWxn`s-*zw}tYOD7M@ZvfDOUm(c%`>O6h(Ev~hf!fv z3Mot=lpH)jFuk=RA18|RL0Kt*UYZH?TcStdKflc}xH$i)0ox0UXsG}&#e*5JkE*IJ z(l361w%vcaf|353L*8Q2Jb2h}m{`{sV@rF+KK~Pcx_)tb+&gGP(f~o2XqRqgW2l~t z{3To|o?QjBbE*VV(0#Ghfn3K`I?lxsk#W*Qw)cJ_THWSL(WH{ld`s&{AwZZl{@YDr*@2EQ_IJf%D>-k4dUZ48aNxaOC9{ zVIUqA{*g%37q*cdz^ls4I)(zY=hkY5Ge-sfbFbQZ$-kK_%7n!9!|GjUS-5zkSsGth#vx|46+mB^;OkfJ8f@(C>OC!RQ@MKCD?A&xt@tBzLS&% z)LfDs!F{w40#-i;x~YT};W%NsHj(}VV{G)Y2PfjvDdy3LXo@rWX*y4I-9-4voU|v^ z|C4SC5gsoy%xb@do9oIqx^~_YL;62A`7GY-pc~ z=*wb*MyRixdH|P#E^Q4o-RZb_k+35U783s?T>Ty za-hB-eq@|rJY{^ABwGVfKUcl{#W1g1(qhoXL~!mxyE4z8T*!L-WySYnD>4(@--Kr zM)-Dx1&E^eZf-7~9q+pv-VIQy`N#Jn9{UHwBR}}>lXec&HeYtce%D~U{L7(tZx0T)cyiJ)@) z9@uB`FOULC?SeJfz$SQLAV~_EaOfGpfgfk|l?F5IXY}^Kc7$bLU5Lr2WX!GW08*;a zW~))lxd_70l6jDoO+>fUJt-zOu#M1K8o;Ce6|6o7f8AN8kRuq!~)JUFl*X?^b43B zcsWp5=RRPn#h#Y&Y1F(}^xZo%6Y_@yaT&soV?BLZqT*Clt!JW?Hz=~A<1n^{JH!i5 zoWS8z+D(^A9Y^kh1t^@tx5*R4t93I9a~8LkP7Kp5cUdVfRgJ;8EEDImg!{707~!PgPK2^Ac)SZ-Cz^*&nv4MS zKmx8ZsQHs|X=#f>z(Bk%3A_51vz!43V> zcgN?XoWLTv3Z6Aj9rG(|WerY|r@AINvTYAU1OV3Y)$_dUUm@Bs#Kx6i*p5kPEL10V zxH`iP^K{V7r_kZlF`9a0Cezf^ou6JjeCLpD?CzuC-$Ng{sW*SJ!2D34-rk@uyBo0I 
zd(=GEKM+bU?8ac%wz1h5QTUv=Kx349)SJMuwnvzfa&1#hFtZ3eGi2Z#+!jN!l_VTK zu8I>iPh5UL$l0RPD-6ub>W+Fl>KMKaF@wRU3d90rkSYmo0EhrJ1vXh7@Vx>t?tK7? z`oPk0Z3jjZUg-~VVE>6H&AD8^%H3%&V?eKlIacrjpc7xB7r~$8$ThZ$%1fk1;xadc zt3$+iUat014~{s0wqkNB<2c8i&I`7fsz?=7-(RuVHd<53t_DEoG)xSdAYeDWfzgK3#BZ2AIF5IwcPkhUzOm)F{Qz9SsP7pdCLjZ_#g$Ajncv*RyPpg!~HcE#0Ht``3 zG!%QDhj~}X+yz#`;~JW8!S3A{ZZ{?fcQ8F()VNM~Z`A9pn+;XHxu^u)J% zda@a060sB$orNQaBX(~7^21nMHOW4_$3Fa2-G>@`_=+Z^3+9DClbu)=(oyGPQ-+CT zYf_=5zbfW(@TtPG9|=Vdr?^Ptj6w+S2n*&2^@V<*e}nB2Zf;M#)ME6NQP|aF^u6V- z-!SBF=b8lQ7FO4KX4PsDLw(r@WoK`_4>7G0d)XwaV@aX|@;PA2Q*8aUs^gfvutbg{~67VSfJ%sc*aBdmJAK6 z)MP;Q>65%g0i#B?tN%s~Csny?!q?>eRZ~F3XN1kjbH6DpJIsZ+tHf1PAPB_3VuYlg z140gyugFt@xUO<>2er8)+e2~oGcM1t@)81EF=O}FpkG-tWY|38ISZuSl7}nHg=zxw zXk0}Dm+9tefcHWX__pDGO5I#NAU;e-?)K-gS$fy;XvK`VI*FU}uYh;-fWx13db#Db zs9s;l8>S~64!7ZtMPlns9e#rLyY-s46-S#Z{JDA59@d50LWxq|gzKqp#u7ufTx*+? z5J!RC3n6={be|l-bZ*M&S$H^e1B$Gsra1XM)LqCxaHkULYNqN@k7KvEHz2g7yWmla zoeTcHoRPe=)FNA9Py@2e8^qTEM-qrXzU|elE}m{}ebb*q>l;Z3xP%x-aw2yd-Z7HA zF2l48(Nog$xG=mEM~Zz=0XTz)sX#+1jYXLW=GC%DT4gocUanDTb-}qH2PUeP%PR2~ zL8d6SjSfnmwIxRjs|9(DUeE3IPYPO9QWeFcrSZR8*tA7|q3|uFXAb;}} zY`s2A>bw`rE+dnL1z<7NCi++Oq~(Xr@zmWXMbJb33=I`5PdbJ)*QpS%B^B7MNXJ9Wf_OePC05>gEi7oK!V{kIT6*nfREAhs!>jhVf2#4 zkvu;_EX9NwnLM@w&)V1n>E1I?1v!ALf=@Lkh_y@bDRDJf*&0c0^Z#TGwSiauPnIzE z&zCUPygD7awAK?X+-j`zG(X`Q@2ff6(5cu^fx{k<8T+WXu2Dzq%?iC6`bk-<>zeP2 zC#NPy%KF(vaJ91H!!E#{>RE9{2z~50?$OL2s60bRe{*#Yws7Z=(2DXLcjXmE$03S; zPA|M0Ym4J_826NWoDU^0r!$O#2-MJ*z0=iVUK;+b>_pwzXJH9uwU_z#EYm1~Q7;Au zEZ^8EJ?+Kl@1IN^EH~aH^35BAkI3?77+KxKJVX6v?%7%{oifzvKt zpo0HOtSEJ{(0OQf&Zkf0hXpJYy240%sK#|o?|-ta&8Ll+nbmDtMHcIs_0^uYO%I`D zw(j02KYzAuRX1&pq_GTRXSBj%!j{T$)j%^GyEtF`t}r4REbm$uzgTYAM<4nq>!F#u zBdl*!E{lxVw*T3ip#v=A0qj`A7x~458YQI_iODN4qr+Fjw<(GS$}GO5L3g1Iq-}f^ zEzf%s*8G=NF?%5k#j7mTC-Qz+iFIyHG~p1uDN&UZU}a?4>gXAkXaHLvtGeHw`TxN+ z92FgcsI#MqJM$o9w0@J2-B&{&(L@UrDeffm#q@i+Ip19G@?Q+bxN50b$PtI%eDsYe z$?lgW=mQ#3oj3_*7sQ`VUx|Sqk%1IM9w&0|?Rb3gq$W*2Tor+C-3Hx|(<#<5MHDLg zyUPup8SUDK>Mkg6^sH`JMjO%;Mj}KD`KK#bj|7Y+QlmT3Plh?xQbiUt)=|cfKKNwT zGl-_}VggxqglrBL2V+$cOP>f#%Q%?c^E{9R!9@9dV*f|w;M|_h(hJUrJ*81dnDdG%J&WZuCs)3co}yNpW>u|?;D9`LPsmxSemt!bVYCqD#i~G{S`a#V z(o50Gx9q1}ZVjbQdjjad^f%m>`e-Ou38!|E=M3I4RbA!M(vOQal{W-sX!PZFuQg20 zZB7@ZtC;+7W#;$9_SO1w>#!i&R!uRT*>}Z21&mw7+!w5dT@TfMi7s04neUv%)Q%j9 zfxTt^Tf^Ac-3q&Pp*y4w-a7RHn)W03@Xs(z5+_G6ugW4NbETl!_bu zbch*V`pk(l>vkTU>TigwrR1wCPR}9)l52^Dqt>S5&#jnASqBg=adZMR@JJE{omWgf z+xcyb1E&Jg`DRTuOiDj2zFaFTlOEb;WBo#7_OUcq+31@4r|MWfQArwb_d@~_Tw~PF z;H%SmatV|xXlDhY#gx!r`T1n$NnJBJyAC`Ke2Zovnvk^@dH4c|s<&I5S}JpqRQGQ$ z5Q;xSW%?EGB@+{TvFXGIE1lXwU=z@dMznv_e`^?8&F{;X&O)5I80=SM@M{{l@#{@s zQ`FFThJ6M2L#GgS)^AT|&JFdFGgsY1u!0es5eB;i!qIR6-5C~#J3!C7N+TA=hS9CT0IKWUrg z^NZo{e>a$a<8Si*FnaoSg5yAWYWw@^|Egg6{MY0E{(r3b=zG=|FIW9H>$krBm;cz9 zGVjUnKk%2gry)>zVqqn zv%PHlw>4p3=%ce`N{MFVXn&+FFZ^M2_)FJlPWX2@8s$@az&vs&G-Mz4o&AM3rZ07{ zt8YvHTgo%wBj;q6AS^&D9A^wk7XfFdPRbRh8ybR!BHKLJp_R2h&;n*@>r#MzP_(6fdTdB158N>yWCc?(%|yN=54EzD$(Ad4#CKh`F=3;Fe1 z>^JFO8=Ah6dv1mn0tEnApr)fsxTh;b?WvMYoR0$kj1!tkC(4JK<%2gM+yX8HT0sHI zNs;7ckfEB2>-N?_+`Zx5;n^YY;t>?RqM0$`{wA=*)9Gw)rZ|5TqB7T$X^FfxREfOO z{&$XbLvGeB%k%882yq$}%SdtACUfK2&u-tvIZ@k42A<>UUXcj43fEOz~9u z8;PjT6>gkNlR4}r8K))Re{C#ymS~|B%hX1C4G_KlO-ARnD9?^s-px6yo9E>_-s+jM z&33QfJ|fcXX!||SfH}}=R_+Yi|M04>Kltm>uV{V_h23qz`sgutM`$Ga-bKyPUv#Z) zu1Zds2XhGTo)cZ=9M+Ke0BBb%i$;P|(!XALL+sSjolOlzo+ATWN`6qTl3lA!&utYg z2@d8P0I%4Fr%ZFX@yCS_l%)HG+F)XE>^_oa^?oe#)QPQ4GzgMBMiMSsZw9Dv5|%4`MJ7mW~r#aNaz(Tqk0q!3(Bn)e2f* z56R_Y(kmlRLRRiez>O_b%u2ldm-%=8q3l+FO@j5Ap5>ezwfbFgj#d~cjWNcVby3PQ zoRbOgZes+h5+mrEv>3Tq<4`|gY#;lRa+CEi!isttU$i`Qp291p 
z`1sB*^rN5of@gHWdmQUcVZR;6nUFQwr1ws6b%a04BUH5CObqbv7HD9@D>q%Ce=x6U zZi)!3-@V(tl~E0pX~@OhD1NOHVc>lg9Dymf4$o&n^Xm+!wE>Apy~246#*jBDfw@0L zb0z@XLFsxkp=6ohD#-bo`4IhU?IAzBl>h{y?bVzPg@H4sxEv;}MS*Yg12LzDmU0!i zJbHYay-=UV6JQ+bUev}@kR^zf^=Y;GnLRf8n3Ql;fF+hRXnv%Ygx-aan^<8SXkfy~ zw3KCdf#+}O0w!BE)XykZw)An6^k9gAg}p)=1usfEe_ZS71+nI^@$U~ZWZf65=eGr3 z#>8@4fO3iYQReAf9m1Zl@P~Nub{!9<+8w%U`O0@(HR8QBJvnDTuW=W7uc$(!va^P; z1@>frAUA2c#=|Z3Df}P<(R4m~ek+!tJtG=t6k;t-p_Ir8ZKy=tW zP&Rret!!|H_l0f~JO@k@#APVan8ra2u~v!+$@mpfH(c_g5}Cn!m08>5Pj;g6_GY2pcLE2y3mmF~{Fsg&1!okkTOp`unVsb4~{+2cj~g*rnYx|C~)=1^EMhh63`4}^ble=!f4{Z}n09%Sto%~OVrxoxTf^)_#Zj5OF%7RDhHi{uXZDW`8M4}JlUdJq@?PeU8 z?ti+DfA!xFn!n}({Fm2sUaEP=_!d3$N`1qr@ssLLM(F5v*b(lbyLl)JL3-YR5_?L3(F=-x7V+oZUSa7();AQ|qu7s@1uT zsE5!eL0km5FukOl?{Y8lorl`9d*Nj_&9$K~$8*5DiOx9ZDA(PaqUO0Unen8M(ulS; zl8V3`c+8|ix|C-41HguVOveE#%UXv;-_tbE1uDHHXNVRzeq0yG+w(4WCZt<3->AW5 zWjR}Y9NSS9U43hmY8jpMYPs#QZ7N^B#50ZSl>fTFOE~eIt$8YGjaJ#SYX}Hh207I# zO&|Ag&H;)xC@$?T|CX^`Hs@gQZyhzJC-CyVSd)azK6bDsVWlOu&;2@G#!y2VM>w%S z?Ol#Fw#EcV)S1(b50q4>`|kxv!!@XdBc%!5FWpip5x&`ztr$w@KU_KT3u@g!IHVhU z`_C8fV)#qV*q3>sy%&xY2lgj1-dy6E<^SM=_36tN`(EFFP{f^R0*h@>6{XLKX;T5; zYGPr=Pf$*sl@&0WVqurfpECB@d4q(6F|e z$uGjJNBcXnAc1Byc8EC)K1cg-Z11}$ZztDeVk;YPy{(91wy_v@KmR-HAQ!HnU?I?C zKi-LGf1|i{Pl~KF&06^o%lWY|rq>s4A5y8eDmN`pLY(AtpiVvPPfqkaNs&S;&o_)z z-jKpvDxY2Mi(gmA)$1~uT^|&F(f!41b+7wo@u2a;fGS^aI%9Y0Jf}?kX%0iffZN%L zir2dkWCHh6F5@~(qioZsa?jphybQeCGVAp$USmQz0Xz8?wA8${C0ovu0;uoKXq5l7 z2@{N}S#z;8(az6ya~Z-9I~Dd1+Jj#gJ5srt{pNd`@W=x5yNtZ^XlyA}j(fLb8%G6M zJ+Xw8VkllcspkpDgS$w1i_SIpc=U?Oj??Mr)wvs_ssVZABiq` z7f-527=NUuaF^MLfNv1C6>ixU0_#lMtd-N5VgX!dUXPmYZC}Y?oeh9Y-z1d{&sHjq zYT^ZI6XlouU34Al&S+=bNWIYrA(kazk|G)>{t+^#+MOb?w@$oSlP6X{)%%8E?@7PI z@8~L>(a8+IwIbjY@Q(um0cYrcr^Vu|=V~3J+7-tvh`IaWf=EeAhkL(Crb3dlu?j#< zsw6obFR;NIBmK0#LOGSPujp!8u-d%C++zNr%l~BdREbfFKW(i`@G$|bNyFA-DE3H4 zM(o<2T4F7FvE7v#S|1k+5slhdz~=&Q&lZGOC5`eR5n#@NAfICO@1~51EiXS4b%Ecb z8hb|<{>`57nEl=!=~IHgemO-mQWm^I4=U=wdso^*UquI_!|h>5`1h_hctg_)nTTP| z(wAC-)CGV&Nk0qfO;V(FW#v+7lhjp~c=!rWufGoILXwX*p`<~OeNY0Hz@H`{u&g2f zKz+E1qz8)oQ_$w&7qoa#GI1$#obC< zJAwWbCLp>`AN|dJnQq=6Wx7%qudfyb1U-x;19pzM$kX$3fJ>TY4J|S^iR6A!3d{>$ zD66+S?T$fH_9|?n*slq-`uC;pL|y>>IHG=_p*`K%niQjdPSUg3^a0hEatBUIKm0D| zd(p=eKnIz!^}4C1ipYpY5vjo&c^#x;JKUE>vFZ20nWZ37#nt?+OkgJlM#W77sL?9- zXd2y|dZE%G3uMnY)eLzqj@qpSq)~*Oz+MuBbCJnQfjlWK0r#G1d|5_datxPv(9+2% z;A~gcc16 z99WE^qD{!poYuXTW8E;{VVM3%;TwlvdD}edd*zOOq_phI+jJ{!;Q)Mxk!Wc{BWXhq zg+0=Sj!7FjLK}L(9{j={N?TWQuz$idj7E`(7@LYvF#?b{Hn9*BdN;spjYiqW4-WC5 zX9=VZQ3=+hv3c|O?ip&4%T~IDpb8>+qGuO|02|m`cy9Etq!M?{xic^JkBf=d#nW@T zMLx^<`h0b5!eRE55T-?CVoUJJW}-g&2%fn%Mta->L?y9T)16?~VjNECMTjZe zfX1-QPCo?o0MakzoDUal7)y-rCP#o<^?{&lOeYjG=kq#a-PBSH$YBzeGvjU`TqK|5 zku?|3iveOw6Il>#qtN1&>Zr#l>O?vwE)bp)>MFtO7Gyz^AtYb`?e5ACMzrO>ziSn6V#eR9j9Dv~vaV~hC)6Lnp(+Ukh+IJdic z87>l5A2;stV42*6n~pNfInQ~?H)p|FI_W<#-7xgulDiWjjU2cDxU!x3E84qy5kCwF ziJF(0YuM9!KUz_4-D1l~rR!5^Lp`Oz@1fwFF6byG;FYlfI!|Y-G?kz}sr3mF*#<~y z1|FWTt~5p+mqW|7N&*oRR#=Yw^%X5$x#zja>fJq!1IsyCQn%T*tZqK-cKUP7xAftg z=HWwSV+Y#CSl#d!iJ{-vzkJpAa{$0E+uMR)<%NE0zSpi9OL=qIFXJh&qD|tZK$9TIz39$q@!5Q0e4Ao)Y$&uGEN5qa+_wgC z5@iLs>l98o4`$m$pF+L)yv6iRZ82;vW%)vBQ>)fJkaB7Yvo{Dx-V#Q8yH{m`APh_0 zWy2gE6-M1W^V=Cr6Hu&?l`&5eYUMgqD`fR%lcRTXV`2;BA=zpoR%gfv&eYJf4(~ob zsmv~@&Z0byN@3cJi*`9*t9fJLl32YZDn8MOxd#Ompe27Vf5irrca-$1OM{0bh-3a; zU?Ur<`AXd8EU}3e6!1?uV*|~havO@xcNebO$|iGe2%GQz;qPwEy7SM0Qy<#vzL#_6 zpML!213!M^p%Z(O{@bronrfV%Oif<#*qv{GP&?x%?LU|?_xPG$|Lw0Ineo`3UrdRe zvgk^x3T+3ci35Fb?EAY)$CD$hk;9{!ntU0&A8rS=%XW_cI+M;cv^&xkYedMFCFEMfXTn0Q#BZiB&)kbLbn2Y;|h! 
zss`CHoSRL^1qXm(X&y)(P2%H`GP7fA3VC`~Tx`xIugpM%_qS9Pq%kOROp4gmlgboW zm_n2$om$~iOO`QEU~x0N!xluxl)Ib|urKkqn&}wI*s7yH>OPKo&QRyomOOJLP+F4{^hZ@qDBS>!{7TtIW2cJ;YrKB|W zH)2w#nv%6~cv23?FlseLWcM-`=Hco~*TtnEj4AmpQ`suQQN-Uq_PwZ-JhGKW4!iuP z?zIMA*WKFQ*s~-$E+X54Em*BTx!$>#UxvoklVSNq`NoC3dO^1yGA zE@wPSc^jP)lLPQe?K~Y85q;wS+na6mmgcJc73`fP*A@jEKME_u-Z^MH$XxC7C2uZ+ zED$LRSBM)cF(rWurvJiPR{U`-qyJDm)*9SL==|-Q%d5qqoc3tubFuDAYwKzx8;^WP zb(LT}l|A){aZOK`P#55!u#h+7VxvhgsOgNwgI6Yl?gOCClJ}UETpydCxp;N;h53)0 z{bzKX*TH-wQ~%+%OoR9|ec5Gg$v)eu3iQ}hc3aLxExBAt%cFBFVapKmDy*;MeAYyy zJzef951Ve|(cMRR=PUMAIAOeUtPhhWx}A_0G5m=lLpPV09PlY*t^^{u-&QUCe4rR2 zY=hcWt=QzjdftTIi1ARn2`H+MVLc+(-f~l(1r96FPn8j;Q0Z?ui{;+*onSOD9{aMu z*xE@SF2pFLi-fvz7W+;Lg92!W#%DaB3`p12*f}%ayk2sbA{8;@Dhd!EkO@OLxzy*t>CY56gJ+sib|+q{8YjNzyS6crYmvP7qA-yq0bt!3{iDmtEhs&Fue!C?zy_c!KoN2(4l09~A7KW1<(#|qOmPG)UkW*_;ZLdkI-iHRob znt@CUQUs3D7cG-gvI4INjTlu0c}AS)jJ^1Gl+9v%3zHEo8-YOC&7!K%9A^$R$9Yq% zAgQ$gBSB73$>3EeISzr%7+0tJCE_VhW#I^j#K2S<5f$Mo+sdWi&}k+ALMrWaGF>Q@ zHV|d!uZet10c}>P-b2D0-pxvH7EUq@9IC9~-@^6Iq*;#5DO(Q(=l#yR+sER~o9r*z zJ)Q9+sZK<&KV;Zdm*F|U-Osc-wy*%Fri938ezu_8=KH25mnY%i7Mt$>cFQS}CE$1; zn{{MfWZ~q-v!2IME#@P_xn$UESe%SkJ@kD~rzXsh{4RO9=Fn|>$UJg|NYCFL`l1zb zci2HEYYOZ!+jOB1&BI?zY*n7KK+JcnDfGTJbEWiLM?ZuT{nom=1l}HA9;}L}wTNtK zhozN}$GBcS{)Ifkktn-z$hkn$9|0kR0c)*0=g-Y-xzF**a1TqzDbn}=m!A`6jf}#f zRpeM^a_Og#yc;Dm%4!4MRR2|uZ+!o&%Q-z|Yy`xh&M${$3OWSdMG|kiAH4zLS<$j^ zk)C~J7(#UjbR@|V>@Z=#a7&8}t0sz1XMLPYC@W=BE$E<(fXNo}KS?tSJxW4YrJI>2 zIchjxcI;aS^L1Yt)erShUL(4mO$=dKQ8;fBU}PyH2j6x*Q5!56ut$>7^Wo(w?>a`=;tuD zDZ1cg^F8}$q75GSXup2!?oI2m!28-+J~%js zt6MKR((zHaFo?k)YM`q#!BuZtcy})+T?Pz6yu#!Omc%{gmBt)<4^}!r*y4 z+mA|AnG4HB*3*5UrudEbsDG#|xy=Q0s_jJ67kgTnBG-}oab0e+pL{Q2@}U-t;>65` zMo#@d_wtn5cY5A3xbuPz&2`O)Y}cl+D|{)jEM)%rIL7}kI{c}9WWcUC^6qRuQk_YWQraG}2@&%do7tO<2QlxIHL7l|v5K5Tj*1=#?*%Y7r( z!No}_!O(z0pxwWtzBqG{jDA(M4aZNBXCED{P>JLm-s2dS*sF=DYkR72tHo`AkAX6} za$)pt(83EfGt)tW%8k9LIt=9{698!dNVC;nMUEuDT)>Wt`gamjL>Y?{9qdZp7;qt> z$)S&v{!BvlLCDVH{sowlc*U{5U!# zWO@RO%JEIPCGNsUM=pfcl{MOX^&`t_6B2tj%t26PdyQ2uwf2dU|R z*_&hD9B9|Pt1t6kHT&C8(V)x7Uy4jv7{uU{?jH8otlibdXSu;o*L244G$8Ba&4`W# zln7~}u+d5>rlJ3o{#A;x2^x$bP zaWV*atf3ERo>WLP`|cDSaxeZsn*6e;}GE&=Ww4Q^6mpofXJMm0o#V zDB(I!r}B6lJ(~LfWQD28u_Ri@d=!^!xz6R%x7_HsRGv{W37H(4n)1^O5F~qQ;>_-H zI==0;p3-s?QZ=r%6CwegT$^a9C{Ld|v3JAH#j8KWWUOuMnlAJ{y=RK%Ui*>gvA4!v zhCSF8I$&1%^Fsgh3lymqqEi5A>bZw2Uu;H<_xECwDdF_LjLS_?kFTMLG+4RBt_uHw zU4fE%#T@^4=`^^asi@U8)wV{0h#W2kXseBEPk*K-l0Eh|%<^%8D$ki!6xu2854iyJ zu#zMIPcc=mtR}`>vIK=Y$bmr6FQGe(Su(lbXgpMkGO9>;KYhqL*xlINQ`mTCIQ9S% ztBij6xEV(7M;!F$a93nY%;t53wS>kpq<HlK6 z^e(nlygEc1W$v8whTPbN(o0s?btcJ+t4$YYz}e2X>a;0zb=+adv7-LKD!kj(G>l?rBEyLWAMonf)49w|X4kH04z1 zvf_hYb;Y^ZCX;1#hH25>OgRN9)nL?U8HXW6Vx8f>=362_r`HsC6GUSuxH0!@|JAxq ziE~<#-{YRI@25KQ3@_Rnh>>B6`k&G3qHW@ET!!u?{xVnafT2GM%`3{oXs@&9twrux z2GUAo&Ok-UZA5Pae76kv{~RENl)p5Y|6Z893?`m@fp1b_$6K!Z9S-lJSZMI6WvdYA zA(>H)9bgJ#)x%B`Dl%MPeh{u}xot$Y3Oj6XftqmO?g6Tk#6y;H+=f4vSqB}B$YfQ8 z0ezIzjR+lXJkD?^OgYos6o+X=XZ+=*I=WMu5?PJHJ za#H(PZ<_S6b@Z_VnqXV#ZG9+zypK&|eD^z-lH;1<#ja)9AnTC2EPsgN3H|oDbuxGm zWSOg&*Y zX&@hwmaxx4jSQyPcvhs%lA{F%$*5Jnm2{#~L^a^_Q zWbW>)2WH}yPBO`Q=n(7YR30m55JiB(?mD;A6o)z4wZStX;Wx%6u;`+)D5TRGXY+jq+vicKGLQ z!Ljh?VhvkHZ{@wI!9=0Ly6kdSpO6wJr?}rtDtQ=S0JkM6N^Bv3%{<|?!?)la*mjyD8GM4pnv4UNn74f)R#xyZRS zdL{(o@;EIq4ri|PY;|+wk;H~3&_`=lMgJaed)I8^Ow7ELFN6lJz9eW007Bu1Go6)} zM9M1lB{?cDiTEq!iJu-!DIu9U89pAt*;II~_V?k~6J+VX*igi)xuxtZv*gT8KxU$A zZd;TXz%}tQH?NRRvA)E`NxrlA7sy~3IyvX!QV2S!iQHyiucmK(FwXq`S^w?eO4s~i z8nqQ!5ATMYxmI0F0E{|*77hX~zs1+fb6*0=QEM{rBN`yDvTHxSH4@3ZZq*F$IvGsL 
z`az0#I&*7v7xUps9&t-SInl~<@tst<#Yyu1z+PG!!`{H=p)lso|3ptc+3sHV+SE9GCG|oL ztnxYCYvWt0EFZEV|8P5@_b`EqLA3DW353=`j_-G(i7R_E0zwM{H? z%-ZFvDn~4uKUA52=Q9QF%)KTWNSykmK>WD#zVx2kf+(9(0gw8h29P|8OkboxmCi`GGJB7kUa&|y zJe?LOxzQ0RzC`+w1Ultky8f_ZZq94SOu5=P$?BF_vzl|~-z<06%So3$x1@q>Axxaq z8a)f0R8xebHO4b~m~-0EIFk+H$gVito-z3$IP0_rToLGLcycAL3(O7>Qx+k^5xnWtlTLnMkwGGAeT z^5_^Gi&wDEPg14}NMDq-^th3H5{erq7I~$j$`Il+()qrNYH}0>p0bf|F2N?83YS*B zM(<-s>qF6K*ozsggr=m>JaLqZRc8VvJzJkFBB4Wf&8#aG=779Ylu}|h6XjVW%||mG zBiY0p18whhvQJxdgZhzQNH5dUKxCP)Jv?QLl;!ibY{=}Q+SVk z#1lRCsW}wTgx;K`+T!?nA%oZy=FUAw0dlsvi#gN(AYFKN43n^C7SG11C1X{J;?5TwLHWs6; zw`kGWC-i>aY7XK2azGPiT`|g6E2YnAsG!ymYH``FE%Yb#nUWJr00*frrkzZ$1Z2&? zF_f}J270zKFo{>}( z=_+};xpn|xn&$tgaF+5QCZc!4%0n zm3tjj!pW%yKO=jNdKRksoK`jFM?BZ=!-!0mWcLbxj(TYqm{SHxLb!QLkz@7F$KhEO zEgwm+=QCQ4qq>u->bP1K@%Ew6G4khlmyH-t3`v4^Llf@|#@9@N#I9fbONFFIsTL_k zz>-8|Z2|EXvRw(ldC29LD)?Euqtp`H5|P`}mCJn@$17{7h%naEOukU(UmEM1OGv4d zqmZ&N?8U=A@I*|BM9ET7l{>NBVVk-^KMLQw#mU<^ev4LW zf@$F)*H~xx=L2P5_S(bIW0%7>nDm`VGQyo|X+ny~G`u4MZ&KE6Q=v$QwD>$X$4cRs zMg4oZ2M2R9{Z!E+18@4&Sz_fxBM^F9c02R%0`A;OrF4?vXpe^B=(P*I<0zW+@3e|C?(848#u92p?H64nr z?X(?PAKYsT zwC5SU7_;Ra#IST^K>1i1b|}XKmDEkK%CBtq@{~`mDE-zb(A9$e1ng%~u0BozoB)rp z*kddihz{vX1(jy9-I9NdftexWSvnsLQRin^c>6w?1_C$3cI+o*f`6HT@2teTVvfcU z0Cfe*>x_5HxdZW+*Q@Ldysu^Lla^|j9dEBazT-LiyO`J|18aABfdVO5BME<)&h)8o zCMHq7-+=0|tAw>muVY)^h~^Ji3KEZrwo#@@oDM>mi#U>~k|Fnu*T7Cm|AE8W=&9r1 zUBO}-9X(dFRY7WxBWL#&=P>6%kBDFK89ymOojl=OM=JeQ9SKo2stSs2nOq!@d@jQF zC_0K}my9H8p&>DHhZmb_x$|0QzpmZp95C#5{Lk(gk;VC^x-UA}ehfgnaTjf%kXKo3 z<`(xx${0dNm!D8KJwG0xj1fAxx`L{ScA8W@Qo;Id>U+`AcTvCDk8SXs|5&ES; zz8}J(;BpY99h=vMYc{*F2QwXd&>$a(D(~}6Luf3GfF|uT_Fc*i& zl@PHg$Scg(vmzuS>fL;W+mLl}H_t%g476g=SN0incG%?=xk)o+LDLvL)Tm@rIRUqW zZTX&2MML31GGh^ivuZFY5X{EpUI@k`X%+o+zCtQ@Cvyw z1e)L&Hak9Q8`(}EMl{#r-*0+!7Y|)p(h;sxPkKsH_t_4@EqxJl(>>|lZSX;*tnGP( z`}%1(EULh|IqOjnzAdKJ&i$YNxOPQdae77Qh8`S${~Nrt|glM}}g0aq4jGS^h}35b>q)+68TK4##ID3B)yGbRd7HN{w{!)zRG zh@g|qRc+yL=4_6dcI_pnOiJ$I?+EBswOO2GPrm|Ex77R~wal@Vx z#Y^4li>PcCt^Gs>M`gKO)I+aJZdj9ZVVWsnTk7sA``JaKFDRa`AUrAGwlTH~fDfNM zEPI>AsF$=KxNAFZf;$YXd>kK={)^`3DXFBA#)XiqhK#N?3jvBSR=68^4@=im zA<$J(_L>kl;6>x=>4n@N;7}fCz){rhIiB`T8)-*ZVd{}-;LV|WGD->=wy)xK)g;I{an%_Lpp`fEi7a*|lP zs_cesJdU(|mV8PRRgGkd@ctSv{0zeJo*RsWi83j)(m7|#U|%YCs}#yH5nXfMJ~25m z?C7bnUa5C|UB*EF06yQ28|3ia=1lb{_gakiy>hr7>krP#T%OUJ(B*lD-h>T=EyRrK z7KVz5aR73-`&7CF%F{$Q(G;R6MT582!j6dAzeFR~C$MeSVR3wbFT|6+AU5&Y@eRF_ zXc#2;9F4T|Y8na*qs!D#R9c~B z7iEGT>?-Vjgi~JM%`^Z}XC=dao-7JglxabYst zisFb31y^WtRz~zqc^xI8N?0gwp2MKZJ|nTDSc@Jy-Ps>@^zhJqmT>I6O@#(~G6L7| z>V8W3(lPp?aVW(9a?{wSHLA2nNe;Yb%=i3pKBYSSX_!Y+B!qzM-PoZGOu>9)c7Si7KCu6yCc;w06UM> zqt-QpjDxwIxrOx$L*)AfclV+nRAW?sfto0BqQ0(1M$jN+oNnZ}$W1^2u#c?umv~Jr z>I2h#iyNz)*KnfargNh({|ID#=Hc_wjt+Z>qb?;0i_ll$+i7G$SVFwyXE>y8SLBg{?&NmlEavv8|Mj&gW5((8v z2AZfyM9j3&@`h^`Ex9(a8l@`0qw8?Tt|#uicB5&m+c-MkHlp~yoT&==Ms>=AnMHRW zfoPr;_&m>l%yy?x`TYu`w?(L;2`IKNe_@p_1m}k1oaZ(P4+mz zeE?xfCfEMSR@I^oboiM)V}J847eHDxCg&6Ml+lgkP@)?nF-0zg7l4f-hsGI_ppc2v zs(I34`?n90D0}b(KZu`!yF1}yu3CVv56)OQ_%Y{leeyLW5;#4Acta{i+HA9h7S&p_+?xsH$=mL(AM=K5ZRZcj$ zxIn`~fd#n!`fG$o#1m^)vs-y#Gi-M#LncV=f8I45N5W7*T0mqxpLnC_imE3_?wBow{)L(ZjH=2xqxq zd*CI+exGtXB(P6$Hu+PNAAA>zbDTne(r(3V92vIx8jPbxKUQS6qMOA_(i$PeV(adW zS2H`3(w_q`7blbhgOhlSkr@#?mugY#ZASW~q60}>lhQ~@lv~SC4cyxV@~XGrT7zL) zp%;PGa)C@6id}$s&^%R7K2Qt!Kx<1yry3WE&V&z<_fX%~n<{z)6*107It=`=c3MM= zZ%fWjanRl~*x%5B?Y32m&or`i*_sUCQ0s0109Vzz;)Tt;S4MZLyr`e3Dft_u*i>(C z`JWW3hM6(wz_*%gD6D{zmX6-kfQ*WR+0<0onAwC%%`FE#qY3&gE+M&##3@l+j>Z*g zWSIhi0PuBgLyd+ARdUi~0Q$JxbUc3s>;icg&|aqDV=6D~Ql2L_@J?V6kjQlFy1ga} 
z>&oWzY5Ll-j=EO+%E+9Qsct)G-o~|j0akO-Zn|h^;o@a3f0`Yt@{Fjqx#qPMADsbn z4kBEObEzkFa4}OQ9t-}5!W|AN@UiIUc?Qa9Xz6VE0HA|gWl{>0R-%n^s0rbUUG%(= zR4HsJq#>2fR?m|%G>VmNOSd+g0vy+iB;K=JwJbdqx^%mx((U4gQP$QgUNV{P((zD8 zsKX3()ff)?P^t_IgAMt>0B)qmRYM_F920x7ZWW+XRlG#9pApfg^x9@-Q}!i!?v07BWF;;nV}MIRmbqt1meAVpvFSD!o{jF?gnL+3 z?mDJ|J6D-PI{`g@X&b3g-ZXw93|Zr_!=D_ef$aR?gODE<-7S@78e5`aWsrsa;=$h> zBbKPP1}BArMIn9rdbZZ(!hQU$L1Iv3M9LJy>fSGem+3T0f3&3VIr_|~qp(_Zek{C34-!rIapf^VH zfYokEcd8uVvNz~d$Hlwwul~s(V$3~7JH?Hc87(j4VO|FZi45AdB6;MXP-C*>z@NKN z9#=tCq@LlilctC9Q4p!e^>~~J@@d%48jXuf_q-M7SRp#Ex@H4Lfo5ZpRJB@?g~qD~ zmEbwRBLHHZ_vL`0Al^+?DSnY&gxk+ECU)mJ$21_bN zadRgCg|C=UcSfQY?%|YNR~rrMe-RW}M>$r0Y(@MNY(+j1^F3Ti^k2=! zp&@8nw*Q*-YX81V7z4zTod)r&%M%wu)CB!+0^RU0v@&CZ(UF$u`Nc{}EAL@5S-65l zR-9(Sw6odp!n`$WF|Gij5@gHw{j@`TeyN#kDs%0I62y&e2M%>#={&-bm;&EkSiFd37o-G z!?*pxk)e?tw!p7oLOa5&msSz$*mViYOUUASo2u}my0}+hyp9_wfMGnx>op}#Pnp{M{fag;84u11T}*_>Lf_AnciwWG_bU8+Um5( zzH;Vb&vDw76Gg=QDAp=}RbQq>?srtXYt`77x0K(7jNLIlXjM6ue+C7*PY&CVu@{vO zJ6{ACC93pt@+03X^zE>@ zZ~CJgoAV=cqR@531Z_2GL#>>q#IJUrUCZ|l@7hqY0!ju6g-ADoO5haHpizYoZ>X;; zb3UDPBzXUG%@r|ApNTJC{LbCiR#O@oZ?icS-04N`$4WW|bpBSQNB7_><$)}v?Y?oe zFyJzNru=o*Byv<&_M@!nOQo}+u^ECoSb!;RMNU#(Y*lvtLFb$bT%uaFNPJ(;@0;J8 ze~?5Qa%zPlR+E+DL$dG;P90gjsSJssTmcDCTmVF?5t&z{%}zj zG_|7J?4I<>*e~5TxI3r`&-Z%|+dnu;5NStajJU^jc{R*hSqz7dVLPv`H zkaE`W6QM-QA7IUosbti^Cb?tqM5@8_FBc}pL{5=GbM|20R+^{!2(CeW_xSnkTB@%h zG@kOVI~D1^?Sm%wO+#;<=NVa4E>jEVG<<Vt}wpjpfyd<>_INhFe_1B}%0NdN~w? zgtEFgVxXh0Y^6-kJ=4;6B3$p9Gvk7Ga?Dhu@ya(gpDu96c!x#(I>R`Y9Js$iX$=|6 zbf?+y6H`1Z0v{Sjhfq~F1>RIX(<#Y*mn|iGX_KsOuX`u;5C;REB1;@spVQlGp2h!& zsi63(UnK=rNMWQJrh~^m|4}hWO$Ot9+-^>Io{WWB@)iny3jJ8R*bp%cg|5xh@(j;8 z`?&Ad95r~}RM82AJ0KlMH-Tcv)hb2}f+t%5n3qxM{(vgu%K+<{);YcEW$Q)X5Cg{moPQjBAad|d%s(UtBlPQv1E(MK^x57|W^q%5aA99u67j zjSrd>#duFQ`oYkiqPwFH+<9ZK+3tt<$B?9Kbp-ywcIRLpb&k~mZG4>2&e|r+t!5#EMSp_6O^Mb2ZzU| zn!vjUk@Tw)t8nr zf;-STU$-Pae_5BaK8RYUHmjIK z34Ea3Zt{C(DsMjAuz3Nu!`G~gI!2+z-McyTTeR~gePxUOI#*X{5GJZ!CHm+b4B}W2 zi-i0N4Fj{8zU}zVVBpld9)n{*GT;i8pnxtBlSwKDHm0!=={Zu@u z;kdh%A0xhi1KZRn9ZdNwn+O3F;Xa#0c|B#sVj`RvW zB+Ot)rFhp|423hw=r>vw&W29FKu;K=iPgt@K0uJv({SpDC+b7CtnEzTQ!G`_8~t6% z6|qO~G&_3zpAX+7>iI#M5>QumI|GJ>>_O-LO!uC7*dm}cn3ZCWBcw2z5LZkg;o(Rj zgxMNwMu~Bb^+n^B*rDp-lI-i*P{OM<^<3?fxUs$l&MZ#st&H;53ux1UK%*eNkv&@| zP?5Tho?Jh3Y=NuhRAWTW3lp9yI_{V=;bGz-J84kx}flltjQ|f(Px1kH{e~!W`?F zg76Q|EAiehj|uJF=Xwq^l!}-Gn=k#{rrxjv-)L<9`9}Yh!0im9uW951TesK@%_04R^PIA5pbtk(wp-;nx^p!#Q)XfE6p6E4q5R?K^d&#p)Vj(+_ z@9;Z$CSVRCm1=~~7$&Wl7BEneL%EHYX=cX!3@V9j*Qn6h;)vFjjA3M;vX_d zsR~6)N)M;1RDDNPZWgTz`m-T(xQ*}Sdn333KWR;8MTR=if+t92hBM8`!J^AecQ%ei zIksGK_VmD6-qOihcMj=%3Nbea~{=R zUptK<;G7-#d&=`qJ+bR~Y$20}+pl=4MN=x1I#9LLOU_(;T2`8~oKNMO+p|zQ!FrtGE_9B%7BloVXzVwOJDf)o zpP=MDP5yaT*$@yI$0)gO9nL*TlUz?XUJHi4eFjfnbQ1dn<|R9X>m|=s=oEdJOGMh4 zo-qP6DJj$(L*U_;rVaqE^om%|_fQk& z6l^m^Md+5D3Ql}t!MRK4ep}>q_-)FEv1%qkYyj}_it_c9z@d<_|K@ED`Mrn~hKB-U zHsxy5nE$wP+BmE+j^+I-LC*8e>K55Z7_L6gWf$Eil6WAKy5`N9+})h#TzJ%hU8qG3 zI~uQ*2^j>-A!=c)snWtLu>?Yy?LxLML?y~uYl=G+`)d5k=HlA|Go0dvbZ|bU`HAch zpmX(4!sU#jzqX_-c_)ss8TIghC(B!%(X&MNDwP)Jfp}>QAXMTdYL|9t5A~O~^#PIs zCF&N4c2sDY_3ehpAbCAV0|1!;n;7(pH3OnpbWyxAv<>P@>tJa;=O&wn%wPwF|4s-j z4HC_l)je8;s`%SH)0?t+H%q2sjGBFO9RZOqlL8wG6WhC8c9>XG!71x6qz^}RDK@tf z3`W%mz%lXc>lxvZ=Vn}+=y=q^)BrVG=^38HSbuNG) z8N9GpX?YW?!<$^^J?~Vv{D_kzdrfD2{-pnCy-9gGB-w0S z1tRGogG$z4jsEU95aWDNG{Wr1lb8CIR9wNaOx^y_yjU{;wxO*eI7Trp%F;YFA%jpr zb6cO7ZR`^p(HFU~D9EHZUv6}2W`Y4dI@N?m1I zT`_6Gg2(2jmtbprq@=sF*j~5_7$78nwfJlK!a~5EsJx4PQ!3gwJSxNxbCAz;sXoZZ zQ9#{YU1ly@^!nItFut_HPEWa$w*- z)Kr*{^svPs6fVru*R>;c^1QQBE!D`7rNG4v3DXxV!JqY|g4+Nt<3*G2JZwc51=0Ym 
[GIT binary patch: base85-encoded literal data for a binary file in this commit; payload retained verbatim in the original patch]
zX*>-s7!cS6atNHI_6pPQBnXYh7YT+i?Jj+rTujX&yh5{Xu5emlr_Z4#= z1RZ=Z?C^WFZ(Ln~TVISwP^)w>2;&ipg5A2xosJ+Okqk+1UCD0^OR5t31k-(0uc=L< zG>O@{<0t~Bj2bZ1lGng;>tgiB*t*0fneMGNG6ire>{V-RKvGKC*FH2tYx7idSWOH( zQ-To0BPx#EQ}qGtt7T;XzoCx4QU{<(08@k!Nc@jrjEB(;l5iB^v-A)XgP~c3gb(yTW^>!6 zrgH>=YQX5kz!ZfMiFyxyP1RYPEilz?Mxv^6Bn1^ehl(<5`79FQjFt5|_!jrUW(vxXjH!_uCH#Wb~2*tfknAR{C>RT*G= zp2Mw5vGEi5x37;$J@i7o;dg*iHt>Ymo`&K-*BSGk{t>9Iz|Uh=lR;qj*U(4u1@I%r z?A58bf@^P4VQ_YlKPwIP5{*a=92R&BJT*#GzRRQ{6mdCCtZ9zNQp3@yya}RA3OM)f+ctGyP0H;n&^Fq)UI|XB~z`M~=INBfD z^Ty4ro|(@*EVsw@E#19!Vc2r}R^EPNsZY_(@R`3~&dJhPmo8`*XHIHozqr{m-8!o< zUpZ0%1{}GS5rX3eK{^b|00)6Pv^;l*8#(+GQ$jJV1;)dXLq4|=2446(q9yFU400La z$n!6yL%^eK)0|C*xrpJZ7-63V)TazF9dQ2 z7Jl1MFy4PTXGrnQy@t!Pf2kNQG|yyRntui=1KmVd!EDwg>-{|g2k(s94HV8?c_+yF zZ8Kg1>qX*~CRB{I)Ur6i8$zo8ILd(}V@-rr)o5a>snqdT((Wo#pF*Ryq; zP#JV0fKw4$kRXJ`&gh;%xf3x#?ji3BsykOU?YlJVVV$CCI#wN&J9{?Ga(h6l!2f-I zTq*Pu3F6pZb$yjh&~UT*VpyO!f`JN16(cf| zvV3qOm1jAjG>JW~vL6*lU=C|jR&LR6WgB93PjdbG;r(%fQ$Ahaj0zI~-bVkHers?t zOT>y!4Y-58)!^3;0&S%$yGsVT6v9++%(ZB>4g;7x<=@3 zixd#>8LPQ=ZuDm&s5uzTB#D_-d%#7*gNj5-u^F7rOT&X8Up_Jd{?2t2D_DQS(dfsM%ZB`fRz1$KhhTfUwpG z<9;lJMI8(kiB6!`3}Cd@drPS#`p4s!hJ1B|lLP&K_%?{o@l;alGvHhVZA4SP4}ij9 z20_=ca#yb!!?SWOrUZOAMZ7R=fakpW>%)M87}m+l#J13{Lbtq9B9Q;9yGYIVx8DA( z5K3!T2O)U4|rIQUDD*scOzW< z1DA9s>dqONZ^h%K7v4UY*)dh<&STCPdS&tMkuBD_Jgp^2VZQxNL*c`rVa3!pktVm1 z4-Vej8DzO)o%`D@mn_pB>mMQ)(o}~bPLsI#aYCcnY_~2iL0#XIRjZDv@cWBLlYYSx zuGUws1-6cZj0ZkV2GxqQfliPief(_XPsQLSW*${KZiJ!~%@oo}Ih`TwFDt50dwhWd z(SzbkaxN5;AaL<#06NenhVvk}Cca1>huH!+LtdOLHkc=~MT}z5p~7Fpe|HVvAsq$* z77jb04rKQ^&9kZD$&h(_|HhpKiQ+YNZPt9>JNpVB#95}^Z?H_-MJn!WndmV;vYY6+ zL2|A|=fI`G1&mYzrNWL>sWv#Dz`M)%W_2~re!_9r)Lw#!8X3K&-2 zo8%Ax1ktHo&W3w(&& z3hfNsvx!Wge(#>WL%^sOA`jz>@NnbG5@m_>@DwK6=aF@pF%Xh{DCI6ZLsaULb(aPr zwbS2hikv@*9iNeNCEC#qhMbFwEgcclkq52gaUIXwNAYZ;>U?8tjZ$SECQDnPzY)gl z<$B6W6P10M)G4Suv($I)VUtThloH-XND z2mx?}4CKM(CXK1a*#*bwtl}clx@3ARLGfPapi)6>|7y9$I}j>9Jh&*#G;|KxdgN5F z4)9>Q1eYsu^YxyhIx*5PnH7J*yI?CPCMf&2k599u;Zb+wA77t%Yc?x?VPg0vjpyvA zKWIFU zZ5nQ@#9P4Dc*Brr6%FEzlv+t#ic7?NB}DvW^)J z+_(sUAgGWGL{5U(#tw>DT3bO4s8fj7bJ!RH6*1aGj50$o+ho) zAp%NH5P{PurwxCvpB9$*FDsbvEU4(0-570YSCKhg7C|bDcv9xsDZaW7@_+yLF$N4ZSf)TRe6zuDXYgX)gvGp=msL1w zUBD8L2I~XGgQqLAeI2^8edC32u!aJK6DiW;Y;y`Vm%|B z9DUnC%tFMG@cy!(p1Hs6>9}?9-j~+Nvsf-XF?gfW+xfJJwcR&PC6SznaeP98IPh(( z?`<(aOTMv&@44F*reAR?q?RBb*Hj4^S3Y<*Has6)gb;ODa)F&j;&3T_dJ5>>ysNI; zI^iv=^}6E@&mAJKlIjD- z2TeG{KdLPs1n72bmco?o2D!kVvdiH5tG(Hhqe=sGalz`6!-5lI4ch5X_6^OP+-H7R z5;-@&HuAvicNZe_XI}>QS2~=Ru(@F3E0^&|>vt1H=Fy4GGha?D-EUxeP``?&7dR_c z2v%&BL?!>6c@kLz1&x+-$wmHq@k}CO8(GvZ4sSQGD>z?GufXv>;}P0 zCkF#FgK;Us$ZkQa*+{1k=c@plDoz9UxapgzVJ#!?2F#nk4sfFxI`)DAM{ldwA7^N* z0Zf=VHhAB)2^X&cuejK>uW-q-3~<*^g=Nj}d&T@0>yr6m(B&yb(OhrjLHEPw3^9id z+bdVuxP^F$-MFa5_V%Zf18gOj;t2&8^v+crqq<9mVn_}k^dU(!6B9f2k{2PYhL12o z>y0Z3D^ODSHrQf77E#BE63OiAYIDV5pyU~`SqONq*O%O&5bwso!X-thf@h(%5{Bc* z(_DdnKOzwXt2L-#^7E<1*Vf%N|M7(9&i@l2x?%sRNyaWcVtTjkUf{K<|MT6u`#$>B z7hgSmseIMT{AYrqhd!yiWPdUFaJFZ?(~iHflbu)EhTJ(a`sSJD$h3*Y#k;fS&10Wd zefL`3{s@niDh>SYI*deDn%<(J0CcDffmt5LZgS=UwuKzUz`ZysIb`YDs)U0NcCKE! zuATo{yY$s2><;^8=e~oJlfv2YTZ>~O_i`E(kMt+A<{qu?nKyfEHh+dH{r2pzb#m9a z$Ksd<#z{?x&97fw~P-l0=^|sH@{@=mXVi6-8BKs+T{r&w1iB9ivxfH z9VLajYq%<~FsEW@@!@4f{=-LWt@o@8p^(~Id&dkl;rXK_cvWlU!O6>McXrrEC1>|K z+i)ufAPgDfpOWld4*$r)S85a7LPAh@y<^1UqcX#GF@|8^ z29s4Rv{( zII)*8Q5=Q+ki0axAGAR{)9>viAPwUnd{5=v4(&00aiM3%u4ge6TNfh>#*Q~c7Cv}? 
z^}2`=$Zf5eo6Ucp9_X2yS~YCB5~{T}V;k658rGa%X|5>U0j*{`19!8*8sJ;g<8vSJ zYs~LT@52lMewqR3*&XM9r^dJ#T#ygc3_Eq}A?t!umIgck$i&sA5<9UGXvkfCk&(FU z=0Wxr-*=w;acK15kXujwxNblEm&@k(aRlEcnIxc_a25nAD(GcK{2)zfOP_*B1vTWD z>Fzr>hZkET6%YH@7QPuizWP*wKs#y%~dhGda;-hzq9_Bdp7Lh6VmTK#5U2I0wm8P zV_BB*8<9(<*RhCob;JI9pYF&q-)&Ban19|0a@lUwHsmsH96Dz#1EuCI%|>6N-_Al? z)^~o+9ka2@;dk*ETO2t1tJU7x*m@Cc3QVjp%mUC7zCju3C}f}+DC5!6zOr)TnR2AyYb5w>o>!S zCF{~?bKcP0?QeE^C?+d+j>rcJ7cUN2S4S32_N^YaUehkkwJPj`d22Y?D`m9V2{Kf6 zRC$v?pIFSONUMb88>t*vl?WV`<{-xmu(4PizHpYogc5 zT%Nu*;L$Vl7rWISlr#$_AAT?pxoEr*RJdq2ykx0Zcj+O>qgjg`d{a!#;-}>%$+u*# zZw9JCTL}#UHcN%Khn*Me*Pu`(Bp1d9Aia};28LUten1$~ltFAklTh>sV4gs>Z$r98 zT7=kfJ4Ba+g|M6pQKj*0hge;{g>o0A(WO%)jBmt9F-{7jF9wvEDWpF&kgR}mI4u`} z-kMn*BrDBY7>PXiU9)wPq7WMdhc7KYbHQ+@XKHw9@|bnLW%VojXRHNZY-za0;e6`a z#SX3$fH$XObd-qIP1tuElL4tyDnxC<2A9SzL>BiDIV46l1doWMUUBd%2w`_Bj@H#8 z@4>qdTn@p>cBx}2ZgpkKVC8jGXec$F!~6n!F*6~XrCoD_l@vlPFwLN+#P9EJ6o8P3`y4ogbFI$ii=#()a0>hQg_PErj0u+u{CsQ-j4zo#gF+0Tr}e zex@8;fDbCrsdIDnFglXoq}MM#8hdH*fk5sf=|VRMN(ZbLWjf)xkB_uiC;hYb z1HSCDPFNpwSckLzy)1k@Zn$9PtH{NBN79NWO)Z7nd=ImJ8B=_CkW+cxur=1HbNl{! zcebz?76kOT#k%TM-Q26`xwVenl`-Y2#CDmg-l1xM(Pl&1?O$9N51Auv01fLE#vcpc ziXKz|xr3B!^EZeWfILf-$PLZrx2}Mgs}1%)ut()1aHF`(lEeD}zUXK((f($Vc8!#K zN?61b?cry`Lk)op9`-7oO6c#Eor3MD0tv@R@wapMz(it@k1Z2qaeZukMJz5IX*sp<*^x*Fdzy}0zcVZ5 zA~mt=W~u(o_hLt4Un@6;%iCV(XxrU12zA3o-L_GAF6XFg?#VV4oUqPufwZx<7h1$) zc+N^lKyge{8_A@!tym1e@uP!7x-DfCkI06BYGIxmHjk3jp0Xr(k49%qS2D4-B?Mv- zG~6-e>g#V|?f_ATdQ}zSn0lEv$3_%oBUcGSy)f9sqLWsjB?yRuvyWl_7WOoqQ3WY_ z$9f)paZ@`rRF+^hcPQq|un6tDTM!{2YtHukbvj5fJL+MX>&9C%O9n1IynDW*=pkk) zg}){PsV%^ugyF`4^vO$##N|S3$`DUTdk`P{#iJZC0k+96zE2f{>?kGvl(d|fN&mN< z)D7(od=H@JV4s0_Y&R+*V@Z_a6CjPwGImm|iB2ll;DDenoO@?k1zJnsC-eJXBe9;@ z{NTh$geTX-3^X%qxjztDxcJ82s_UEQhFY^My$K$drk>bqsNStIaG(#ultP{87`4~e z9~7ZsTXAOZv-qu-2b*=yBBVqrYR3$iro2henU(8FwpoY-&_Tk*fhLj>NC0pXfI%@x zUzPxY=OD;-ryV?BoznIwa3hRTBshTBczrPz*QCn%K3MI6oQo%=<|CbuA7vYK=QJb4 zvdq@>XfYcr;9h70TU}c(qiR#m_t%>Kv_moVzMbVxko8ee(WCygk&3y$?DPmPG~nFE z4|cXR9JGApl2!E0*O7DIHdyXrb@cp~1rvBf@^$%6GAk|D7~FDd^RR>xApJT9A*kee za6)N3olMyXH>5~FMOxeANhzI58wNnI0>v>RQ1Vf5Aw35&@qmD6$Nc|S}!Sd{XR-J zoei2Fym@M2VZ+4bxrf8r*&C1%{B6%f&-^n0nippm_gh9MdX}zwM9ywOXa3g7bztQn zz@6%C0Re#L1ZEpvdQ#;86=E$c8sv&Hkejrm0Zy|mfXd5#Iczmu>fvYbOQVp3U;_k4 zo~bi#6EOT402z2nT=7icHaNn{=U{Fnbq~5Y^a#wh{)wkrK)Er7Mr=jO#P=}P5x!;i zZIBt9F45z>8h{pQ=2MUk-*#A+KK;7mb(FO${vAd014RqSkV1i*!|BYGr->QHp4;aI zbpgAR9o@Pl*}nR%FxS9Z5QnYT;u^XKM7R4H)y+zTW7hPzjD7KIjR3qJ> z(vX5S4I(^|IMBEu7;qXK;USfMwrifAHn6v#hSBD4tV7hLlpC=y40X=63y_NepqYG6 zH@SnoO4H?e@bz=aHx zS}ul7sN;Ez0UJh2IwX`cC_7Y)`H&n4@RH&NIQSM_NQVxUy!E&bKIjR{i2Xvip)CNq zg2Vt-9StI|8FRWQkpL-b^h(GpEUqx(z}WHOr5SAznz})eOT_Gw$5L8QWd7)vcGg?g zIlJM7K(?1k>Dy4{-4|txo7JTJg|7N+xmp-gXW$$S+ZtelLcA9{k_Zd-Jr{`oKZFQy zFiMqYow>pbz3NrF^~j{V3D{@S7J|n|hf=n;l#gF7Uhm9-j<;BzmPum|sJLmFH&HQ} zdf#?56w0|5Gxaa)Rjrzrd?VIyY(k7Y4pd#rAFoUHeJ-yKG0>?Zg>){Wdii0{;@><(Dy1NT+{5>dZSl~rd3nT=u3G2snmGKU&Cs5X&cTPSdBJ&wrJF? 
zWUq*&QuP>P0FfAzsmaU5{Yw-d8+67Pg9CLMdMAyB3b?3y$7Xjc;OC6lpET`8))Sge$>iY=YJS-SxfgSP8HqTq&44%%O8{*g>N_ef zzH8tzvP9KdHg+NpGM1Iv?^h6EegQ)B9U4Xb$k?~66=7V5%&d}lQF8e~1=A`+iGU)( zcm`$v(Wf1e^Pfb96JoJqs&Ti$^0Zxd{z5?{#xmxIluoC9+u(c{x>v{Dy<4F`=f-Wc z_l3+{=n{>Q7YMIBfEPfCnqKvDRXDT@u#)4N5#^m}c(Pl)1GcX`bh@NUDt30}T+sEV zM{{nd_#h9cP^f@U8cJI=?-Qp)mLd`cqXMEgK)FPE+0l6-MSKh|zF3RH1)&KMSI%KP zCJ3fIg>2)7Z7`o;A#*j~(1j?LP#5|8tcB`wpcD_S&X0De5GLalhAH- zbeF9m1KOoqClzydI?SIkbmnUF{pMvhGv)u)qij5EXwG}Z`bm~~zWGAd7-F+$_Wp** z`FlquF8^%>|B&lpLsXq0R~RZ8!E;w09#pL;^Nz`02|=rFEBr)Qw+liCh7a3c*sD;H z;15H*(PS05l;*%|dTYtD2kdP)k7Q2LUC-jK?=cDzJONYs?@6jdeK!smWS$b+D=^!~ z_7!EKSe6Xl0YgP-dK@T>F@V~E#Vo9iUf8+(oAMDUl|m5Prr#^=Ill3^ssXIXV0l$y zkHr@kEO!rKrzQSDr)T+5r>A)9hse_f6|p62p z!4=^H=8+%M4HA4!%d+ztZ;-i8o_IODfV4Y@*5yA-NCvS&-uTB=)djjppcg*4lwRWn zU$lpn{4}y)4!?_qMZ{Q>axbntofbJWMkeMLaFeTnPXm^rDt?s)dH8{`85F|?vr{$Ecl?nzLHkVxXqEBa%7D%1g_Fh~X5Wn0_8 zUUHjhdw2}?r?oh$cVcm4Yqt)qnEThC#d{^zB+-WFs^q=V(_;FRmE;OQ&vCy%)--S_W#^x5@& zgYz@jwM+A6?Ay9)Ik@-MAK!Q|+>l1=wKqi`82w}-^1!{d6{i;ang_IV#!P+Pl7ZB7(~cCKN{l%^aNbbQ_!p+$zXq{uKIz9aLXPr1E5-jAjsO+L$*bRgb!$% z`8NGKCVV$_4KR>khjQSlCP=%;zj;2tj`Spz{F`WW;H-GT#u6N|^*?P>=>4-DNSzqEz1r~MqqXX-!t zqTFhg4s`ACORhm?m-%x1t00E#hJ%UPtHh7e^>4L>mDa4N@;~Ibu?lS-=>&`%JEYh> z#&^TOv8@P}9>Sm(E}|C&?PyXk*YtIQwYt$}YKx5?D>8LZM#Fi*_LJ$~plxv_VH*Tq z5DLrBg4@AjZJ3D4RaNLDA;`+)dAixEvI@-I+JzezE#IEhE?jH~&xgk?a%uAOlGWXB zJowA|3I*u2FHa6@W*6|D=}%xrp5Irvc)>$4)tjThj2f*GCl1W)TMvu8Ry@`M;A&zq z`t$nAHEwu!gb-)2br6f8a8j7m%X+b<2O3Le1tSd>d=nvMm2VFZtTRCE2}*$GPgpa~ z8$iMeMd|BOA8}yP5Jvx+XHc|bK%QZ3RUObU$V$vX5ay)jsMnEe3?>C2G7^EU=oaPh zLv_{oH8QC8059Ej;Nk5f!)D|4igkq_mDb-fkAEFx`Q&2W@PSRN!Z*)4-VwwwW`fLS zSPDzgi3k!}+}Y4QYyYdOgpgiiV>fzhkDH&zUu^H*|eTEfPp4pbTM`%Y(u$q#b<~m=GZ1fzb&;~FPgy03QV~YcHpWdg}>|$qM}~+l0K1? zjd<-sT{z~+|7E}=I)k4h!kZ}W$VwrHhlq-_#O@%=Ok}{M*Hg5Hp@js&2|h=)2Mk&m z@zi`cS1})hgBL@7oF`~F{Z_m~iYROl2g|wDKQ~p5)h1O*^J4AL1C8fr!^}E`S7Y=qTve*f^R6MuVzAe8U)yAL&5Dgl zWL1JGt{PiD0WWiMMRpt4)65wp1wJ%11-Z^~*e`;2e?T>qZNQFWG*T?T*zYuV58UVSk&R^) z&5VYQU$(sOQ8--?S!h{1Z29gw(bgWuGn%c4YFod(XzX!NV)+sKWQf9_1dWNp4(k#W zn?&Gh$WGm&uAwct=iD(VLE397*E|CjGzaGgA_isvwt)|14ihAvM>lcSaDmi8yRngH zJxxL?7rC5iU-smld;!|Ed!>#d5Or2(tIV^WpIStJlSuRK?2^9zwE4ZRbeQz`q6JM_{< zFL68%(0fDz^Zj|5Z`x7yhK)7;x!W;P<}p1)9cp5|uKG2a6x86sig(+(db&XYpB)?i zawZ7+Ya!CMkpR(%y4+Yyui}Cn8BYprqL;;2_Ex3~*YtZD3L#VALE+5;vHC}KD zLUM;}(&pNWYKqtPu}Y6*f*f_$z2}lO*_w55{KJ!$`Qw|ckM1buepI=#hFfQQUY~f! zfaL#FCDr~UsrFOKP`%-7Vy^I?pSZV;jV{CO2e%vM;h zPg}|AdQ{f%iow3i1i@$ zo#jnRWg-#))+NTXh4G|uGaU_EgL1}fDLCfqW0MP+S9Wg=WEoPMT&zbi{L1tn@|Sck z(PDHZO4%<^(6AL0^7oNHXi%`6i1U0gHJHuvA^|ia4|^h3bf1U&7{Qbtg<20-)9d^k z>zBB?D3Zq0Q2hNWSY4~+aSl0lZ!g!V{y*a01ghyfZ5x&pNSHSnP6tpSP+3C2vZ__7fM~VVN?kfY6;d@(pi`*&w7gaY4NzLG zVi09dzU%&j^K?3$=RNN^-}%0CoUtl||Npn#%XMG(bw_gM*JeNba!_;MeQ4sEyX5JO ze|2S9$Rh5413k#;^}b`_^3luZd&i#nYHm2yWIroxI?hP?R8ucp6PYi8YcA#M;905N zP^5GQ;y#qf6Wq8}gRhT0A8c}P?BCDN&zPBf4 zYqT^rZF+JjYvfv1Znx%Ex~5pGzVm8t)@1K=@AQ+Q$h{swKX;w!msJZabX>r{*eIwy z393m2P&I*VF2aQt+&2J5#OR_54O+i>n@z|`h&u7< zOVL+l;wfH; zfFnZD0nKr*!MWYWyH_CO3rATRpqvHWO^oAVIx875pqcySW$Il_Zem7}7JWH|O%d43 z7^?B2EwoIYp&8c>A8o*vUF_91Xm6xy8fxZV80p3E9PQ&r5bT=PW?vta1Pev>LVKQ7 z2Lh)KsYBR+8{IfvDLVc5e1)Gkh#6^k%NGHXK-?w37@Sh&{EKDa9<|y7*mQ@m)_R4YS1SBY+6&Z@M zAa&H#0+)fO#Bv5=sLKA6RJ--4z-TP+%hOEiEm?jE`! zQ;#o)+pOsS;qn#$>U?)kKiJIE>qp1ze2F?sFLDNAI#4slTG)4p@ zP8cpELo@wB>@Ui>)Mk#)M0f%3#UvGA!Qv3W0yFN8*)05Y#E=MN@Y%TG(eB`UNYVz! 
zDIWP&tMS!7>eZyO(zOrLH%txnwmkXa*3O#|1%~zktcp4WQgz6dqdWP+0hth8Q5XkP zYFg&n0bCsY9)4iwATKF<0RLTGWNldHi|~TyhsO6zTWQ?bX zb{i-qBxd%n=s@Z4Pka#9-Uq5EPOn}`q*z+qB*lIoQ$tPqD@)7ld^z3Q1@6?nAdcS|su4qI?_SB|p)~&S+|1@gTGPb*GY9=51 z#@(+LS(R#42`a57++2X483l3vQ_d8Ito5N-*yr_GGACqp03iUlz@eL&r$S*qwgH`& zOlOtX+FT@a2t5+;T#Tm{Axsz3C()+PbRiz916qi{lx71}nIa9tw4VYKjF_dvjivbS z?XpB2QP1T)x?)2TwH&{13%071#s}ry#-uU|KgMP^3K) zW0dJnF2G7pmj$kK&~1wTC`H#hE)wPBCz#eFIcO%;Zc|qdpC28BiAET=Nrt}~L^5mY zu`DXiScT8x9F9Yqxo#fHNO2hE0D%Awdt^()csEGe^|)ild@~fj`>sbdIf0CKzJGKv@H)0{=k;s-{ ztC%SzMB20u00tzSDIg(+gw+r#`V={aFeJzu;OI~&gGqx?glQy611h-)4t<6UiCDuK z{?>GWk>|{K%i*O=)Q{`TB+lVZg#>2eo_k?BOM*wgMz*ro5`u+7)M|5-6__f@d)HkzIL}{NuwI;=!hZU_Q z?`mB^K=e#~mbngeGsQ_lJq8WYM@WplsrPC#&y_8J0YuJ81s1b#G{`gd(#O`A`Jhgw z#2(_7d4TARa4(JkJZQE}08Jyr4X;Ky{gh=|<;2eyC*_mMQiP1cyPfEZRpV++-f6Tg z;9CZG-|%XSusUP(N)SK5yE0OzBdT*L3>61m_Y!-Se}XCckL+>0Oc9XAyKuvdx#ZXI zhzpW1Ob9q6;;c@(7{s8lrRvt{WJ@p#k6Ri9Lm7h!NxT#*V7`@h@81 zmDg26+G>+?%FAdu!L&$)U#35T21qriR^b>We~Q&UkYq{xK*3fihU?MPIDPM8DG~Od zDo4cvCGs#t$Vv7*kQSYoOba=1z#3Vdt}>YJo_V2RM3I*>DmaD6E7yPqg)xi=laxgi zK~!4-o>LyTY_I#r&RoJkU4?mNr2B=tv5C)~?pu-F*V*8!DXi(047HR(3XTaSrY>wB zdUdpC{F-ZOw)R?a@7UiFQH|i@WMigyQ3>Xwph(gbQi3j$(1f-?wHwY-pd0W8x6B04 z!T}72AmH}>oL&@<4_5*KREXd5)}Ysq@>+D^dnxiw0N)6aRfi*Y!#H1SGyWk?I0eA) zZeeb=r=qQ^5VRu$RiFtU)xVXYEoy1Po#U)W8 z09Mdea-FcxJicnxDKsQ;T^E|_px8^!%+hYJP^8ONcw&GqvU@kCXbbY4&XtaamY)*k zncbz?<0on~{TC;FcZ}UDtXZ+M4T0X)>2c5!{wdNskFmC?+FvTeS4<+#LID~nZxFL9 zP=gU1azJnZ?j$J(`9OA(ho|-`0$4%qU`=R_TZ4ca`66|dlSA^I<*1{b496!5<>Lx- z{n>?Zp;k&Z> z{X3J{N}t!yg@IIJ$SE@CzH#C42Z>oXyEjaYJ@|tdZ}8#wJ9?W(J4-iYKAEg(dAxgN_P|-f z+!5n}7|9sqv!E-&x!1wEae2c{WWPc5hL?b!2Xu*sQZz!UWDt1*zV@4jJdG}8!q8oY za{}Xw>5Z@krvPUFwgdx%o_AUTzNg%bJcG|EM03CG$XmjdZkg%GOJRDFa4ML71vld8 zV~A38H52M-HCM4?@JH)YWC2I#KT$4Bt zlSm@=gj=AxMJ!^rJ!L~Bm|zKjyjo$xFEf7`rdEL|(!)oP;UYOC8VwhPvJ|Bs&Lnd% z>tQWGP-SMlL8VNZ<5iAc>Jp^B5;G25gJ3J-A`nA1DO_-xE0)A74ORcE&%P6X+ayJ| zE%XZ9z&3%7f-7`Zm7LaRKW>vu&w_&^PV;)SHv!5_job@hN+eykp18{=h8K)((EfnN zoEDp&>4`?kWH`XVn>MgD=6JLiwRWus(tDIQux$YA%)>31)>e#GQADu#$SK7E7{Lyw z3K;r%P;R&8d3W7Qgb8QvRecf%k*8Q}l=ci=G>Gv5#z7kdsCzR`2{mAhOW2IKj>}`I zwWsxcUp;8kpZm$stxfwJI#!izey3=4thKY>-gR&OXNcXefi?c_++u@c*Q&oB<5jPA zFIe$nzuT{?%Vb})8Md3$e-P-jwn#^N`Suf^q1d4zly#HNIPB~O$piFUT-ca|@?v_9 zA_}wjW>5UwlBk`&RrBZViFZ49El-@1XFs0WwcJh^xR+<~Vq1|R^n5(QFEK>k59YL+ zH&`c1S-CU^Rioi|pke`Af?xC{5cxGkQ5uI2;M2zTB2Y>B1yikY9U(q6dJ6{_p$T+R z+HiRCyeM>}GYq2y;vS2e;)gJRnOfPcFa=u{2KW<848=nw=QN#L6lho%sJ;ixVK{~*7&m?V5C~{46*qgxBK|^x;{shwi)N_Dgv-)K@@JFry zQxwZywlk*MCQU};crOWhQHp;tr?jaPP?R$SnhH#S&-fvSI@uJ=Bj^`a9(R)Kx);=+ zKwUocX;F$4`EX(WJJAlU>!K=BaV*sDs71QEkZyvb6ShMDJ1#H>3L*-_rzaBC?un9~ z#Hq_&*<+2EH8QH!x7j&0=E3B}SNrmJV6yj*)3L4Uj~lezLo2lR7ifFsJ4X(dO6Hm) z%{PqRnd_xkh9s6+J(#771N?$~h!|Q(xHw3C#J1rSY#`Dg$cSJJs-_UAF&`XB^-}Kf zu|}r*4LoF2mBudPks0w1sGuNrhZ;SF|5QU^!brgH#AD%5nT$%L)SN)=I|O3bE@tXH z2#2c+gD~OU*54NNC$0X{A?rt(RG0lQgB+>2Mr2=ah48G6dr}936O4 zPM^|ZYB;PbYd|TzLDYS)E>!$9q!Z0M&IViADyxh zc;L1?>2(HN6`|Hsak>?LNOq~ElTWI-G$53ohe>!DqvrED#Rwb}$l%XNV{&#G;~@#5 zya8*}8v_(}jD-pr#FOR~qZUnBF5UAa8%ty z_6B?q2sv$XXI{7sOJzwc%mHAl4Y|K>YDY3y6s*vAQEnDe+6hVMXZq+ zu`SePv&E6@<3@E}1`*J`fc61Y9GFi7`y++vZ`%xk0^sh5f`F4ND*~9z17H%fHB=m+ z`on&dk3B+#wtRmkCP9#nAfuAze-j8;oKseu1h;Y;E!lmaG@F+Ghgv4bcFkLnJ@wJ-=3HvQOx}kY*WR(tZeV9M z+8?2a;ck;PqE1}~j1C1H0Qwfrt+35LVaf`3P9cioC*p?P;AI#m#3WHCP~8i%2AY8! 
z4-+!Y^=b44zmOVe5YJoEDx<=3J7Q-!_7&mm8QAvQ99(`97yxxZ?|3z+BF$2i$7BxS z3{sDZvw7w^8YS5x~cR`5<1Kpa>};_!=8@n5KxG3Ko`l znVn#lN2U)=NG^=rj(wCpc7L0%_Oa8@ik*WWCq9qU)jP&D%b}UxvTuX-+12#KNrijX zvzvo8(2%dDpp=88&OK@;jBe&$nL{Fv#$TW;7muuy)(wz44q}W>+Tju}@b>_oAgy;b zMm@M6g&c;+Fudrv_ncWSK#+$5WTNNW;!jjz0_?Xky;0EG&TBda!A~#(@IJKD$4I=H zaRfe}Qz}JriT#JYq}4koex6R$jy=SEaf0r~L6q%jTTGLV=kQKPvpM|Es-w^;mm;R( zu@L=)dAIG!IDk48FD;Oj`C>C7b09|DW`OU@-gLusy{$+UOjBTEO3J)w-XX%4x0@e#gs5s7tP1AU68v1E6N_D2RRmKZzKDaLRKC4 zHw+G+zM|;j|Cw^ouie&Wh$LZPQk8Njg@Nd2u zm1knUK+DKPqi-wYQ*NNX3!fr>-7Q|dN;yqNPc7%mwB0(SKjx|g;J3ZSuoKdXmO)rW~fpCpvc3Y*G`Wi$&H za%r$(Gu?ih%@awf3BnBcnI@pkEJ73|wN$TC9g}AabxX9*j+ORUG(EZdt%Y2>Yf#$- zPqK+UoM&By%+#~W6l^1WJYxBKZ-7;w1BN{;;$lb&fkvP#GRl_%kVEW1&ML&ZUt}O% zQl0~7%C=j%znZPC@+#tX(YVn-`0hFmYL`6TUd6H~@azMjT?@;m;|-ygYDFNj(Pk5_ zT4ZF)W5u75?MV>nDon+%EG7&y#Su2A-KpFUf=RUGj-5zSG~J^I>W|+CvW!Sq$X@dt8Q>P;ga*Fl|Kf9?Y0vOuC2^e?o9D zZUO)tm>8Nw2fa*igWyg&8KEBpw;`1JtU=c=wRj@TOhEhz$2;XW!+**531cWmWWN)z zceq%P?==rtgXjk(dx{^xJ_2KS&Hw{B-~iN2t6CDL#~yYqY;k8SSGHEa%Qxd}PILh; zye%L7J$sD?M2i`%VbbKR{q4r|%b?jKTT%|VJ%X5MxQCTqZAB&=3qvywFr2TvDpQKi zV*hcO0070RZhqiZRF0r9$hyBMhXtCL^WL0ls4%Zl73Dy}GNA~F-aq!l%;Jx?Br-u) zr)BJe#Oz<1|D7AT`t*6k)uMS=I@J0M?Wey|f^q@I0T|g#VRo^q03|rGL1@F{fb>Dq zME{>|1r2ykW8!NMm7*(K-P54aJjCy)Ht46ej!oO#&U`!{Q@7hp(@c0_1ALLlMTZ+v zYZ3u?xd2Zr+5|-(rz=uhSNv5RBt(uf-KyL&Q`=KaKmr#;3iC}$ zz_>|oyA|a`ND%=e2IK;n_byr>;`4)e@Iwr02lq}^38g#UDMGo-m*ajAEsrGeP;f@9 zU)U6UN~8zL-7~swJjgd;ZibHS&Zn+j)8E(hs{h!TXqz@*o>fc39m&9g-tmdX*2$?8 zsZ%$Qs$9D&&j*kWgl9Qoc#4e{Cz~hQ1B=70M`Di_An5|t0TuL_DgD;vgN2eLD5N}C zhQtjSSU0FR321x3fLuhFl}zU7r3eAKhgvH1vyc~q`L=~hdQ6Ruxor_fn8rX7ACx^B zMnD+?834fZ8RaK`)^%7d?NiK%{FY->LTV@b-iIW~+6Qmp8O(zs0nwp`$1(sPA;I zP>0oIPmB-BC;uZ>RR9!|saGRnhktZ8d5&B{wGR9s@oF)4A2}pSsi1RVX~h7Ml;-5j zWm^!l%#co~iGl2xB+eS<5V3EaC3wv+Q9uor&15J{dII_@5&IA*V7w|NI|3($=7-L8 z2JhZ!tMf~keEh0*M6T^_Db-v9J7}eE)}t=X;9T^RcTMl_*pID&ZCN$I7Y;xHi5G^N zQQMsIUC0`7I(E6`Bj#!&5hH;9W>k_lM%LPf(t<7az#H5VFiEJsjhsHiKgI{L=w&pA z$*_zL8+_uaA|$=wqjLlBYB?uWZ-A2qDgu~>6u~HErdE*AtX4oxo@A65?^n9A^K z&!KjidST+AR+9)?TCAiGLv!kDR*a@-pX6(&ZVYNx4XpAz1Q}h3toFMxyW-~e-@Brc zfl!Qe=SDJ{81h8XIt=CvGA}UaP%HJQ0$NFYW9I&&;*?TcOx_*#p^eTNszBZFZkRmY zB(Vo{SmCDT#?nXuqR)iPoEzC zb7J%4y)Bxl%QY7!e;aol8yuMIy@08w8@5m1vbmG>(6n{rCTgUb@gZygiX(o~v;Vvs z9R60|l+SMk(^mzc+(1+UBL!1Mu2{91t_wUMwhD+Q?m|}-Zb(-&yf*;*7 zm97sXWr%nfK?R!Yw_?&MkQ9jgRf5M`wgmz4DFccVKr>VXdQh%9Xy(Kin;Cu%=#Iq22aPQRUG@y`jjlAj38aF;JfdebF;E47n*b7FLue+k z4xnNO(~im*=;82C#FT^wLHrDy1|F8a9se{_T;Kpf1NMS4UnB%1iSP@W!6!t?+lQ$g ziIctB2R$pbg6C0K`oat5I;=CIn6wD++iH!>KgFP{q8y9_ut?*BBn*yfyrEj;wh`b! 
z^qM>%p@_g76j7+kZbeCetk$UFEUpm)7$LHu>)_V+32~l@yYFQ{S*Ii3PJGKMQm35Y21+ngacq$6$t~u1cKNsAvO^JN1en2k6xGW?23J)c|bR%@Sii3 zV-J6OQ+k=3(mr!jKsk&T6JAJyFR1xP+@`#a;t6n!7zXx?zBtufsG13GG0HOo5pFY0 z4h*<&jw>?TxQ5&X1Ojs>ToZg52xy+*cu?QXEFo)#k;5S5fHmvT@sG8FO=Bu?ENZqx z)JNn=Vu~{uSQm+QjsFU)Wdc~sES^7v&={Mdr_&LU%f_$onm7?LwgR3-2U0EQ`YSLg zPW9q$E@s7YK_5WO%L|hdRSFfG{TMG|rCdVNRRPGN;K1o@fs&}JI^tD{Ja7TvEL;4y z0_KUJ{Fhi<;P#AecZq5mCFY)pLvF=8sWIwXTLh-lS`--ov5`5D{i5gPMtM)c&MAszF-Lxh%};E*%m za0tCXs)@yqMl?xu5GUYW<&g;%;kY%0fQl*bZSv#s{gINAK0<>Ak7nl^g5duqIJUT@ZKK;f_6fA+f+#UOEF zghn{UEc5+Ux5c+~O;}RGpFVB-oPFV!mjk{~cLc{49%C9@x$+PphR(lOxFJ2dojR`1%C4*RXPzxDo#8mI7@71K-X zR6tn}9@4Ud@f%O*o*VvuF)Fvzhtvd(XzCUf;SGqIh2VlFEx00baNWw0l08SVm(I8% z7$PV|Vjc%d;+SkiD};|l9gjpI6i|%>3WVE~ww6R#XluJ?wiYsmB*cxaZ3Jgn5mnH|Wj?nvvhJ+SI+td;iT7V3;42gMCv=kUO zap4Hj=7*u3iO!@b9zfq#>D*?iZ7;_HCK;y})C2WSF_!?f{!D7DFK(Beh18Z%{8kBa z3Z#R$_)4%x?7`IQgDY};Vx$FDlM_nemueXrjzH4^JPp={9Jmrvfbu|bqzA^FTw8PTbN zhB(PdfU6D*4V`Z%fW}nV!^-H%Uh}EJpePapCWsL;n8yghBw_)8`4kike4uxM1c$gd zxTGm)5WrXl48wtb7lARjt^tg5qfdnkBD=W-n!sTsB!t#B?5DTr251)ZvK6(7v5oVF*Vwlx2R zJPeHT(goTAzCbsyaTpcsO$iiy(P(ZIY6mZ)dG9jXR+xPlY}Vc#oYyn)?Wknx8}|*{ABMG#4#YAS;s2FA zD29a`kIr|B#$9od;M%2Rf+DC4Mx$rQsbHjE|AFdg43C4}%`Kc$vp4&ojO&oj#RN8#JmGuywkuuZWQA1)3QqoxjBh@oUpz-O&JHf1ErK+B zyKwtU`y+4PJ$X(#UIuau0Q@6`;IuE@}0q`E{s=8^u0wj*=Hg>-GtiFq5we}Mo` zabjZ2#97JID22|L+9G8pL*$$!`y?HdAp(O)TEE-YnK=jXBt(V-OvBY^43BQxGIYhv zj7m!|MfDEFpRxHKNPo;Fk+>)o#iW^sJ4CqmadrkwLTp@hPto6e~{Wb^6XWZr=vQjtDq;Fe+Aqn>U`#e zVSalWETlkJ)Q?2C$TS`{kB@djz~QJGLzElkB1FT8*GP;VYNJxJnE+yt-K<{YvIf;L z;H4ZGAQHc}NQbzm$o8fJW%ppSL}v~k!4U;xQy|hqdhjzZ-iY+e>CMV3IX(s>*eDbH0CM88nW6) zTc_^-a^skUq*kgKA%|YjwtQj{#Mc6w>s0N9V?aHkxDtFP939jf_XGb%ui9z`q(mb$ zOgnT;P!ehlM71kLnTX;Gpl52=!vrJGJZUO|GF3C3HQWbV)XFOj5egw{GDfn2LrRrr zDoGNSZ`&D-RxiIGH5ff#HZ05x_1s&Q6H$G< z+Q46je;K_a)Jp-lGDcUR-wcySjP_lyvpD>bLA0Q>1tyS@BbNpP+|~#cMN_0D#dRl} z7Ezhf_7sh9p()EQFkG0L3(bi8fc@Vk*#S|>ypu{WR^%Lc_(ZxUQAMZiE z?C1maP}pc1K|lL`ZmC9-F3%derauJ7(C5C;_ovzQ|D9`S*f4$5S2IcEiEu6q9XT`9 z*6}6~@80Bkyiwuh5VA;@0{<-*3?nlD{RL zdApMT1p1GlgFQ!$1deHO44rAs}dKYXTEu{ClT= zwociMb&}07|CFP{%`I`L^}vX-C4K^7vSpai7$z1MvP}j#Nei8{MQD&9RU@pv6AL%l zQ)YYF5+Zk4Hhc}ZT*Hf0RyxcL$2h`9`iV&Y;G6?WngVga3oI#CQGPkL4qLVLQ=y8# z7`?^_zaWqKuDT=+1P1bh=rd75dVtRb81Rkz_^ut$MLE`>o(#8X)joz?BGiOu%#p8h z8WTDm|B%|M?Muv7U*D2A+~O|ZJ~8r34T4f8S7c_rGIQsoFg`Ph$QroUcKANT7+5X( zgP;@1T(cvZlT?l<(>VdXgpJx-ydGFxu51Wb37I0Qq|lE(X$gD@Rv$1w6=sAa)=fI8 zKpttnG4t7i&MHh_ob9MG!p30`6lo2hQOhjl41DNMv2fG>xDv~IX0OBvR+R(v+2(ID zujM0+Li(BJc)}!Nd>wrLaVUgP-@Ot#8<^n1D{KNs8SOj3>m#et!Bf0s6UE4I$w*3Y zib4yRGCcl(v1k3>jPymM`0|6YR0K|&lyGfLPmj<%sgd7CMhUY;gq5g=|G8BG38n?QZ%FyAGwBM>rjDBW6O_XBrpUB zJHNizjVZaHs&UFdiALI;h`D|`-he}@FyRdavl5jkSs7mqJ_zzgL_j<=DZ0dXmQa}y ze<1DA+Yu;W0o|fXG0p;X*AXbG*6HT$41m!E&!eL(02F6@<7b$4gW)kOl5BM*|t?8O=cQ7SYKXv-2Nng#tpnK~x zhu+EiCys^JX!e6Tpxn>F`DWDYQKdrE8J!cFK!l;0tN~sK9uDm{=<#$}gGkT~YvqH% z1!Uc6hL;Io;~^6O>r7eqyaL>4R6v=fwL@nib>3t5W}E};GDs3LK)r~XdS={Ar|LY> zph=3q`_RtednuUZ(O)@F4_@KPKx1$7_UUi?w@@+w8ic4ojnn@UVYG&AhRHbXScCvu zK4RG;)Oxn`y(p78vs?heZ#?=xfxE`+Q=U%UP6!+U&+7jZ#Ba*aHhm8MHGW5!8UR1n z3;r|4B`7M^LpBCObs%5B2C;QUXc_1uLi`xenTb-wWhQ+xdI>aT+>Sl41tcUFjL0e^ zN?s!7*r*Ffr`va(uvj=fJ(NA!Q$MdqGNrzn=#%|-^4Pb>yPm3ACPz19J?;pXpv`7# z;{NwF)9M-&dyAb&oDOrH(0bv6TaV}>p>4)up=6DjnN%(nD$nAafaD89bU>&J_YJ_L zIfSMiOa;Q>5Yh;*!vJBpAz<+TV>DNQTjn9gQP(#G7D*#eo%aLOKjTaQz~stMTx`WB zVXMz0*oCP=+s#D(HU0m-7t^!%VtJ)Q0P+Gis~4r5v|^~;0i^}|87-flNZXSHBGK_2 z&m<6^V8Ebjd{BkCoWi{vCrAJRC4tmdWrH5(B+hT7*TnCIAmyM(cR><7Gzb#S&OB0~ zg`g9Kl00J~90Lzil8FiwLeLIo7}G^?#FDT`Ld=5V#|RRObDA7=?cogz;a0}hd8h{V zOG^Gol3-n$cH0;i^w-L1dwNc?l=#4*H9 
z4CaJ*6DbW`ESOf<68Kvh+~h=0j9*=#CStt+tZYGaj&>UC2lcZAsnNU1F!*BuSAfdK zqEQh{FMApvj27}7NLq=gru|6kbna=^14$Gx>a1-c_;7MGz zc)*yazHfo6C+9;N>0fuC%>+^Yt^RdJ11B-nwVF;452-eRm58vK9nbJvLr@!_)J-fv znURJb*3)cK#HbN5APrDn^BIt3sH6s0WBi>-H#8ulz}D8Sr;J*%`SC5;@`V1Z#fNu1W!iOzkk2{%PtM|@e#S_}lu#DFcmh1i=0ToNG}x!90N zm~;zn58_>15N~Iwy5%+5o(il+fnI8rhMxbYa!VH;DcD@xGy<8^B$G;@8&51z2l+#AXsy*0Wp>#+uN ziZm+q##!TY&SL0}eDppfmM2cwOsl$M8#KR`YDdNg6MdYwVnzd&82Doq*8)+Uc^FlP zDcK~N379Y-mWpW<5yM18J@zGdB08cB(+8we_+4Nb>zM`>rZPce1Az~qj1J`vW^giC z!I&?F32nF=NX60Tz$^_QzTP#u(3mDdDx&r#vLbE>l{{3pyk{Kr^etp@GuzOh`IeIC z@pC$9TsZRIAYQYKeRp_7gcH$}3EZ1wNVdH|rn< zNt2Pu?&rqMN3=~DZ~@YSVq0A@^qAR|u^22XDNUgUdJGAz+S@*{s>{$;daBhz5!BXov$^0{2s z*7+E9WhmVQ7aU+9orClMYmb@nhNoFLcD5EM2SAG(u?Tqb$nTJ(*Lp3Ho|U(1FNJG< zA~$gU58XiG^*_oMBok9ly0m6o%>PF4NI{|na0k+?fT>4DO_3;u!BY!9JaVG~h-&{W zl=x~;0EeSc@1s6KpfRsfj7tax{SDpxvea0ZHvGb>hJWVcY4$wxJ;f zKXLglbKNhz_Y^^??=+h6V@oxux$=qju0%~u_O_>2)=RPdFizNkhZ)a8#0G$@Aj<1U(symkbeiX*eFJcX)H0XdCR>t;@L_@KZc|AweLU>N=f1Bj!+ z7JnlGFp$vJc!Du(1a^RZ0Gfe5s(NI(o{xH7Pw+LljDwGQ&rckTO`Jd(LEEKO%cmqz zsCT$K?-52_Tm+1*`CdD=Ks#9z%QAK;;H9lDILWF?(M zafg6 zj=k&sRmQ`o?o*oe7LvZ*!3Cj8j%YFH6*>Q;Wj5~-SDLHs*nZrKZFA%Zp4mpDeK4pS`#XBH4I$Qc9L>?uiSm5MMF#WgP{xWtAAWz626uzt&el5DZ0Z6+DFZc@U7nc^T(~(=Pz+5O~5pX*cC_&piEo!E# z7+8$T{#21ex4C>3o}7fr&{YhkF8&!{A=1ONXbdZIGsf6GPPHpK6G_-Xxq$cpS#mMdH>Gxo(t38r>@wk*|YM(<94U;Lq6kGyJQYsB{=U-PGz_U2l^iQlC^*K_ zMkD~`SP2xlQ&mer*A1kD&}n=vCjq3Y1SiC^Dm0&(JMmX?Nv3uXEs+ZM`qEy>V5xlc zON^n&f)4>AN$n7|HlMOg6>R{rse|UByd}r?GYq{Z?br=HG+L(#Keh}6YY~jQ|i&y zCu6(Pz02y0yEkYr^=8rHk4ML_1<_KOH7L7QFShwrx0i)^R8_L7UO?Dp4^p}LH*sn9 z`bZ5+yiB~Eh1iXfq_$V$;S7mMDiQ)O)&0nbKtE{OBAVJA#$CyZG|F6uaYdHqwQ$-5 zyfPh|DF6jL*I;Ob)gLUrA_Z>sbRNWQ;{e2Q(deZ-I;L7Ad7>xZ`QX?eS@pC1op)k~ zTGc;2Y{-6c?@5CXN~SeaQ|*=E=pUn+%|KLw%<^N#Dwx$M3w4A5mCW^`cnlq7|Fr0Pq+uBJu{Qz3 zsNtRxEW(WKdCwPp3{mSCMA<9n^$Z9I#ZqwaD7CYkqhU z(SsnRu80>qx(WqB;t*riHkLtI7&cX!(}eAVMTawj)+nF1xEeEy-S~t`IX(iVO#ICZkrr9@1Ff|ed73)M)wVx zi9+qvy{;A6ZjhBw0LHAebfMT@Us+KPFHV(ZgdaFOZr-1HA=;<|vLK34Z*eJrmB1!5``FC#qmwRjpTKP{LZWE&%UU3X0YC6j6D= z3*(kCd3R!A?TT#e9p8zb^cCZG|HHWd=W+S?75Q+D{ORp24U&mGnNYXiWs`H)VIBN{ z4#fEJs9$3Dk;^17CpG#*mk&iellLC)N1^77DwwbcbRve4n*}$t&7G5ifD}%!!c}#X zFqV^Sj+HnDPs)+eN~^tdb=!`zKq3|M|7F=R#|3Ja7*mYN*Lvb8u4}D(_C@!WyZchJ zM*hf}pZsdu==bMSFJw-h=-oLsW+pOp99delwvOS23S|(r~kuRtD=-;FW|J)%j^*UkZ=dlFB2G2 zrzOr8?iS+Ocx6LC%j0Gk^C#D|8VHDKjrG(1@}O7Jw_H=)NN0VpX5^P+!A9PQ^$V;8 zI2Eoa8n+iEyh)YIm1j_-fL8&fz75qBfvd6qVmvptSaH4z#=3& z-u8_g^EVJDBt=`pkh0zHsJFh&43Dxjn7?*TDDHbzyILb$G6;j zv(#I9Cpwn5W~qm}LGr&`^XHwbz|bRUNN3a+0MjCwq#`t9)nS^D?>K>CMGeW677j@TvJ z*-GgiGgXwKu1;)f%9zre_Z{!m94ys7wh{AIc4;v-bFAhVrar^L-iAirD-dz@c^|UBXxyk7OxgYIA*Y2onk7rJy(%37M3nw)e68SV)j-Tuv3) z?$hOe-L&|W)PbjuGT~w>l*hrd?zL8|wf0mGaS}9J^aKs>nK`-ryGZQ&yV&^jy98tSs+<>`*#6M%j2Ib5y9;2IEqk1 zVGc{d2)aOI4s6hhN)?2;SruF!t`odNywcsFeoQIz+73B#_VI&^$h@TmR=2sVa1z>64r0zr zQF5#xqskOAp`*O)3d0^n5D6d}C3;sMl|FN-kcz0^zg^{Q8B*TwW=Gm-8V}c#R{0>wgoFIouk;x#c zQ)xU#zyRS(Esdw?R#w1XoCTcbvld}gAOIg_!+eOopw$;@9u$ZdkoVe!D}lIzVXGi$ zF+wYmc?5Y(#LdQ7{L?s&${RA<6tN>}x=2kcSByW_PW5Xwg%mrgSNcwjT+?37^?9iJ za(V0H$yCJY+w}yonA>&8(UmWZknNhYrxH@NR;T+vL&ygV)(-98NZ2W9ECaHHXaq5B z*)C)e<*yOV!34|-hZ;K$kmRy3tZ%FW_Hr9bgoaaumStVKid+`0x5APe0J`1FeWVxw zXkaCas}IbLZi54~KUbq2h>h6ZQmtH{{mb7V+{y$jt=ebPJ4Um2YFEreid_d^O;2o@ zymGKnKJjf~mwfnmZ^p>a?Uhb79|#rYpqLFr2_YRmgEFhV@x0bR)K-J=gs_oXBHlmU zFNA4Cu0tISE18al6V%ZVMH6QbnqV9O#4_PHaYeC9M5r(Eq5(g!{tuGcb73=cQSyD> zVG@t}DOH!dkNh?`Hy=KS7)Ab+S>UfI)r>H*0wiNZ;3Phb!?E3KU6f;XBF-fM838!Q zcEtL$3JWsI{1d3+!>g3#=m>(?QABl{xfsM0TmqyC`wfr>z{p>r$^xZ9c zTpU$m_zECugnlT|P%{wRx*I6500%|2ai^=|Y(4S>fT69e?6&{XcxbYvx4CKTvdz@v 
z^M4Shr2b2sGT{p8l|I<+1f&|z$?%)ZIOwj@aY3{~)CD*c6nUvVZwr;6n1B(-mfXKI z5u}@>ws@EbSQ3z#1obS@hm_{;wT9wYG%9-pui)S8sML+)PKTjGz(ZRd1uqC5aafp> z1F)1h@fe&59xB>{bgKw#d7c;je(7^C>{=GhH;T`MD?BS93`L9@; z@Y$S=AWQhsE3HBG;Xlz_UTmJ?fy=}NqU8`1Y;CTq2>!6d{kCd(;$!8YWa8HJ&!zuz zw$DC4+h(Wd@aN2O0iK_8i-mul81Jgt>4KmH)E-9dP6zF6=Cz3O&@{;Slo?77c=9yr zk&-0AjM*6bo`SGyf(&W4BPb^z9}AeAB2ssz02zWXGoUDpi>t}|2Oo?1e2QIgv%EPB zo;%woLfSTv)m1yy`cSl}VWP#?Pn8{mMY&>;t)&>f6mEvgMf2f_3f%Lh*hd7h6)$*~ zyMmgZ2F`6HNzIBeA|4DOmI=JJ9maB~f;Xb7iRZTlN^}~IoWu>`jVcg8Qqcu%O*w7L zP`*Hq)qZOXpOix0j8D;D3r3eDYr1msxamNA

Y=H9WW6V}Po(Ah((-x*mKxNVql3WgIpZZY}V7Hu?$Jl_z%hLK0+HAU_{|I)r`XZjw6;?`nE+#QgqfzZ#S`9P+@s|5@)3wxvI3_jBXRdHs8E76U)hT%-@ z$9poe8UmVfJ8(RS8FXI<59#3j+1RFJvu+%4kTOsuVsL^8HRerHn4)&CocdnU_d~Dd z%3Qcj0OjcnP%at3nxDW>gr5$kZkYOZxu(#4>XC1z=EL7EdJbmhbvYNX(s(KPfCZzi zKgG-n#x(7Lm&PG<-J1jJ1a*8s$|z~U31I5gN0{-VA$+$oV%FX7z(f9T)6EQZ zM}4@Y@0wG1wx)mW3cLSb*7>~TmwUYvTWqq2HLucCOo?_yY}*a0}-&i^dnYc}pDvj0sMI^~FAxcvafUg-8-G zceTvO%3~>jzxs15#hfCZ&vRX@Tn2D$BMRTJeptjN$4&q~9xiLpJ`PbB zKY_xK{C>#`R?rky9HkiCJ?od|-pP*#4^8f3LX;I5zk^cb(mxDQo|FPK6pO)r+k@;I znO|ijGFGU0L+p>jhkeNuAG@AHrUuHs>zZWc`!Li1&+%G-U%`PT_^JcVyh!v{VqP6O znO|dQN0AA@9z64;>;WOu4Ms%d96*!Q-I#0^j;>^i;qjm)8-V~5wOF-;Mp?TJhY z+PS&<$cAAV#H*{p#NHTP4b00U2tecS*ZC<0+t0SSuh8D;%^ucju8sD#N``t$hk6HR za`?dX*|N{GSo!$l@z{k~6PM~&GUpk-qa#vT4kHaL9LcN+GJuK~n2L#NGhnz_;P~hQ zGMEcf|BImsoF3x}J zQ(rlE!9&MI?;Wy;f7EJs^xTbXaP|wlArw_`=be8&v1)AY^l*Wp(YLCNh@oQk|oD-yN~f zF|_v+uFLr@I_8qS0Wa!E-ej9-^Sy=Tk!-=DbKg}h!*}U$Ic4*CQ6gTRxj9?Me{Hg|1{QA?j^@E-A3Dvv}i9VBe!dEnB_Wjhl zDw?Ou*<2^6&pW(|bzJy%S~NZ9k$lG&oDBlwofa*1zJ4ic&K{EkCoAX6qArP!+M8Jg z&$ZiJcU;KEtJv#Aulp!?ZtaWA&i7V$My|7TcD2o6=N*lx>Z7LVwO3 zaW;g`wo9E~jChe{>t-Nj2eWzn^23K#>{)Vf!Qi2>k^Ehw7|+|$`Rv)sS6}`6TW4?n zn%;i%^SSHyEl$mTu;*a$VCsLYcz?3<;>C-$FJN0m>jWKwIzc6l;Q?!NJfr#Mx^`ix zf8-A#b63mGePJfV3OzWIf4J(ENUZ*g*DnRH6S#0#cnd7ye8=mo<0tLk3_OZ2$UB^K zjw94FIWY5~INW$@6X7Nk>`|~Yed;A%PDk*%x1+F?mETpl%hFf;d1`9k=S5UurSYr0#6K=Fcg{UaY-$J|_fl{!M|f0^CUh9o;tn{US%bsN zU$oSm{pOJzT?2mJVP4pqya(7Nqh$LRjs_dJ^7*(yNdnI@2X5Ib*S-jH_AK9VB!qRi z>g%)NsnyymS5!8=Swo*cywlk5>+9$Lx+S&rCXV&=V(%|A|N5l)^KFBPS!(?T_e0Z< zuGZYXAl_zkR1e25q9SX6 z(!=9K8dWhiA~=Y@NcgskLmlh_-i`StUX;;lM;#GQHzM9ZME&S7&!Ty?b`vh3PGiOc%Z*R)2}%;=S`N6781iRlLFt_K(Qr zd>w%;V+EUD3iS$=z3HrLz~Nzu>hj*JE>X-0NeMY;!{AmgPu;ao%z2lRvJ?a|~PuYjLR}Cr7s^igmc;lJE!TMMBHA;1@&7O&&!G z^YY619GMY5*M8N~laaDj{R{maANg4}X2-;J0s` z?R_n!aq&B^e|2H}^VqG656Ryj>wkDlUthnFBgCl+T<2eoYvkgJv$^cLi7c%;3X6w( zMrYjB(_nR`{n5NnqD**82i|19{B6&OP?#9DomJ7fIav2R{tL5K8Q&D_v2?#K2PbEf zgD%JUI^GLrr@Y+fz&TwV9T=TWCI_a9^&b7o*ovOsmW~(l^w%d`_{;t99-wA0rdQ!PjtJScRo_JiopIES zaOUUr9WgL$ixSYOT6IM9VOi3gJ^a-{7gM_4sn%Xrs&=LK?%1+p^r7pH?o)SuytNU= z_$~Q~HSZp}vvcc%+vBl`Qx_i;r)-!QtJ&Jqlo^YckMLfz&dYxtEmUokf8`|JDDSXO zMHIP!$TdgsBJdKrWg%~e&Xr2Z{N}vEmP*U$q*r5b2W$(hM93VRBm=>&?~de`lTGvo zIZ=R}c{wv@j!T(tMCM0igN(2(iB0zDD=tiohf6NVCynQE7QUk%K6*nkaYdza(09u> z@vk%qW&7#*SNi!!^4@gD(pg(93jRZB_p=SKlTro0#QD!vHHwXb>w zKCR9`#~&YP>FiJ!9MZHBfk{UZE>{A(mh~ntP}kopG)A|Ac6+IFR0WS;@m(_C)T)R} z4=d;p+vXVJSYJ;zI}kdTga6rWvVU;ocv}>^b}4+rs;c=i&k9G+vN>nWhJU@`t=+v_ zss9J@!dvhCE?jtW{i^ma53YQhb=0}?3;0b$G&<2ip14$VR|~BimO7{9!M>hsv&Us} zd|`iH`OI-$7z76dTbWmW-M?~v40+wUcCKex2#hL6sOvw+O7MXGA=>fHBUKSqCjO=o zKj8W;S_<7gM|?C+eD?bGD?x|id``=EJcS;r`&RAny9;?f|J&64b+2g~clX5`hI_X* zkDt44#u(L5fx&8-k-x!eJriND@I^!WH+Qo_;tazOVlCZlphGSQcE{n(!{=}S9Lxrq zu!on(o4rzD_k3CDbmlpl?r#g$qXmZFu)uA6`#SD*-KS)^a1eshN|tiVk8*^YJM7K& z3l^DevJY%t$qM$)a!EPV(X2e?Zhb*AJsA7fzcu>)c=JuV_GkNke^5W2IjDzMUBUy< zz2sW8Q>6Gd`sLt`?2b5?Z3=Y6qw`=+5j}kZFXK~kF2}9NY%~6x@3`os;DyaP{*`vg zyvi>QV_VpHZL%|qm%^-WG6@RB4MhmLX|>dBGmJs~E0Nd@6WQ>s&@e(R&PKRCzMx5OYo08P;!ZT#&PKYJKmtKjmru(7%eZN6K|ETU-n7K z5&NZ^(>!VA^p50z0<(z=*WRxC0qgoDVJ=56+CR>O1GZ^OP3L&(J8mn$DQf&%dT;BQ z^oKuw-tuQ$`FH;G>ZAYq>E^eCf9w2M@>FSap*7>D^EQ&SCs%f7l~+WV!Hz`2=^qUa zqxfD3J7wjd6a7OlMOm;>pPWPpiDSu_f!uTNReP3E=*nBV3DE+b>=^$!o9jNcc3PBI zl;g=>y-r{lJhv`nZtd_JSYiQA68sf@PSrA4u61#Sh=a+HmhA$ndrSNHr;qQPNcYXs zK7Q7ane$jsyf1OGqo(z=SZ#8+AmWmcmuKR=Soakp9RH*Co?LdFRoR@nkm7Sf0RnqR zmaWh~600UyR25uq@}XfMMIT}WbV2vn@ummT@=oSO^t17m$RU`V;L;1hYlEUp4lMQF zPgfk__gcp&BnsF2lM4_ez7nyHgHs<8v2IO`>hG1_jeiiezVqyjSx4XewBuXllSbp3 
z3t5wQA6>LHK(b+Pb`(p3OYFFaykTyF>>`4(mx%u_|QVbi@c@pVj7S!LmM4lw^V?W z!UKFpzfsjU7raLTJN0_;>C@?oiQ3v6T4E$q{vR$?);rH!W@;?I#@WELql#}yv!Wy# zxq*^EQuz{|@>bN1Xzrb;N+Jtnw13h8YJ%-?6FknPn>H+Xh*Q_7PQ*ziN8(~|7oHbE zUM3=Y;~=2{FC!OY!Z1#h&ve^DUpCi;wZGEQ*OyVb?s&zE!#r}`a=8; z_E&~^_rEFL6%sOb=@J0lB0c7R}I63pC8^b44bNXw%D}q6mB0R45dO}4oWh& z!^AD>BFfFAV+Fw}JZ)!Qg9Im~hNIjtenC~Awnu*?KDXj76V}pzqsCu_eMQye$FaL4SZO=Qww(Rfhsp^tm z^h8M`u2GVbj((yTAMsxj}$*?%E;;^YW08;clT6}MkwbX( z)YO*oqHokOhTtIG;p-+(OHBs#D1M8NcTMBftiWfnVWkkkTfIU<1AZV$N4w^h&rsGE z$TTha7sw1T|9ACRQ0MnC*9ve++&t79piS^C>1;g~P7;Txp_xHc{3&n`WKkP*-vvBU z8={ItSg8YGBua3Hs|^%X5>7GV_Tep}M-%5o5aCz_ z#R*a2%Fd2( zk~{KmyLj0PlYLDmA8;;m$z@49*#I$8qk(wP7qAy-M{65cA!03N@sZRY41AoZ3Q7IKHi~kS zn-|GtYND6LeOoV!00L?NJx^dq7NysYZ!YECy{Jb1JniW+W`sXl*aNPnstEAYQ227gW ziHHcds&5(abFMbT!qIeCR0r{2T;yFC#!^o6U~QasCN9`P-C%tRH)M~yiNet<_-fFh zwTz_{@@lLRm^fvc;J%oDfy6`q3W;aU{^Zyv3;oQ8n>@mfv^{!Oria7l8c{TX6F4`* zI5@YBj8-o}ZVke_i?Bx2&)9TcpTM3jPnGs~m$!9GfY&tR5xzVJau{OR=2@YB!ocet_i?6=EBLf*jm z+e1A|X+g!Xs3%<%;mSeH4f2U0$Om$WZ*b%u6+dyCqj7;d;KD-Az)b@DN^xS{;96I3 zCAXZ4gC)2L_oM{NEicKlLo>lNA&6RTBF6)2+Pm6#UNp%d;Rk)xZu3sP;}cu{sJqeW z9?Pv(jC@&M$U>{N`fPwiP~aNN2^_P7z)Jp;v?|vBja!*cN1`Cr-*W6 zR?#;wPhCsVkVw*$xQzG>9!S$$8>$;s3DD!afnveRQF9Yua{#2TVd+!_HK&(Vu^3egrlff4_vCI9e|n3mbF*!<4G*`W6=?h_TliARqfzWjQ0Pz=5|hXc7G ze+mzYTjVSWNr!kIY8s~-Nu%4@i^1@d>;j2y#O)%jjW#Gp)^_Dg8AS>SQT34iGZ*U7 zmoJMFSw#FFQZ*~^#{oJp0W|YNC0d z4!)7}BT@F_|1a6kEyG#`e&EPx?V&6);qNao5sK~HGx0*#)bHKh#hLhrOGG7zv#gio zrr66x(ws@IxQbMp3>W%k6$%lVI{Y2iMKO_RJ}IpnJGl%H5JzL9w>A~kK%|VuMoAzM z;TOdWwB%NNkhIghA#HoleMt|&yqnlV$|aVeXCEttE1XU@I5)E?-E>doFSlCSuiO&2 zihoOmjE_Rb`0oypWiLY~2^n${s48Didw`u-YU+TC6o;5lq$|i#6cd%wtXxTW!Bv_E z-<>0AE$VV3R}!upB_^GSpnyxnxAf_Bfi3)* zTYnBDU#GWyY>X;!NH;CnQ+gw4=EMEXfrr$VELnnoU<6DxwEw2fPKpzAZ3bGkRZE~{ zvLhGGP>qB_aR(6*D1oh8Y*3&ocJ|~3yzGH9=vGBCWFTpIgD4SlAQf`K4Y?3z9rk(v z%*ast7W&+2?-Y@S_rbpq|D_7!cH%T@`D; zMv9ReT~aH8Mwj7j6eZ}KBzd~HCJjjkp;d&ER??T3D2u?*Q+}>v1pNY+Fp%hcVPL7Y zL{zXgqievf=nGt^l=UeHd?YLxlD-lR5|T8zwKk_Bv^;r=DhzQR1m(i&27d^_LPI6B zsqE3_cgJQ%_S^sg=B|;i&tmf!2X)Y|}9@4YvBGtA!mWzP~`3=Ay za_R*!imA{Al9F6|8oU=qI!iP+GVHg@Cu(TsS!XKdaU##n!e6`x|L}jI;jr;9366hH z>f*(9E#{$L(2tgh*_rtnR8j*^OHr7y{%&wXk5T`pP~pAB8cf>Z1nN&>t8~m#wYb;)BgSW0F$1aN^5*Ia*0J5QyJR%2U*YmuV9FAC0mT`^)2Q8zEk^(gU2*F^{y5ko4N%D@PI80YH(J1Buh> zc@%L8q=Bd-6fkQ$#fff$Ub1opwr`4tl|Tq2Z(bg#|4^{U=L~9jXjF*gx^P><{Zb zmbI=dc%-zTWu~tuedhV%19f|50<)XP-=D@D&IyL*wh?9Hjs=JyF#{r~FB(f!yY0WJ zwUHMU>n4vw)4hW53nig5M|7An5|ErZBzaSS>bwwo#f6L8!F-5v;YhMuRWcsIZ5JHu zV@CTZ+oz2`Q+K{u*p(&hV9&G(Cp~5uILjwH*`Ea^Gm~o9D~`;~}+WrKl%4 za4dX-3o1^4nDgHqT}{6*#A7O5_&k5q^0*Iv1O}#ixQuxh@Q-x-hj8G*=WW+76DF^p z6+zl$8o`ZZQe-c4p(w#WS|nG2+0v&eS;fX#?MGD#q+q=mo_`AhKZ_tVMKqvpz=>io zO6F0Hys^A%J578Sz}>twAl5WT60 zb8#i3rM!BtRFTjj*8qGa)QO9sZWJXx%gL|Fy+q(6$($8<1%u&2WDFB+HP?$Vd#BJ+ z#5nz7$5rijp3#GxMzV3qoN7;R{g}nSJ5VDG7v=^QdhjbtN*){PP2Fs)EUOsYbp1O2 zh_+$*FDuU?DYJ3_Q%2HQTAahl%aDk6^oFB{2!m_24A8-GEoggynD$*>C>gZ zoTp!wpmgQ#?UNrHX5#CFAK&!V);=yPn~l(}-YPAwINI@hhO)=x)NkWv(LN&yUniI& z6(Mj2m6$2x#W2~Sy&Qe2jZ-1$PNpl03+zrTL;oPEAa_QDklvgfQY5Dk<&xwhpaT)5 zFw@ANT9IV{Vy2++h%jJwBoP=uJIk~73sMpf3mczzS~l^=YqPS;i-k3lQ*Kp;OFLH& zS2a#7ZFxI3BimSc>nr5k%RMh5v%!TZ(Q~T*M?F`bC#p-a?;`+l>X^Tzc_9bKiPC^- zn_1i4zO`r0%dy^SZ_ngKX#fk$cS6c&TXiB)8g?1?#bTZSoawVfP6??pk zQcac6U|}Z7v)`XA-RVF6_P}D(=E>g*vd;Q>ye$i#?rT<^wT{1^n}xz*!-|>F@c#JX zmden{-)g6G=cEGuQ(|uyVIp#I88#4fhG;-#Y;z!~7N~anmaGbhV zmcei<0uRZ&;wo{sN0Jev^Ec>0J#Ei3_oU9xu4IoIw!Gnw_qF!QCiI0pGSm)TpD+AP z;5VIMxjW>uKzY=BjWr`j+i{DHeFW!X15Rtlf=ss)+#-BqV;b|TM7-q-pr}fYvw|7I zNg=QM0$$HmeHy%z=T>4l9r5GAZv}Gu&lPb?bN%;!foKfk!8aMHXLaUHDHGv 
zY}hwS2f(@lPdve@09r49KvX5MwYoeb<#`vx(76`c^} zauaWR;6H&;QpB zD*1L5T_UnSd7MJ%#{ zMs{WJYivY_bv!3xNi#JWnb*SEgnwbp2{I&IK#?YMApjMI)l#B0zX*XF=+j_h z`Q#;G6K#`t4RFNr_-J!)PxR>M*rF;^&*`$ao2FY|Kyd{5KJf{65Kn>q(Z*gA53wNg zR1SxrwOz^v66At$kp+;SiTt3kyf%PUl9Y(a3yQ%v6a=6VLk@NEmqX)G?N(s828ryT zC$Prm)`{DgUDBz@W{3pVrqi7uhc_ynUzh6dbHuZ^F>iWtwy^6j4B6A0ar&)tNukB` z`va3c*)5izKUL1RRaGa?SBDy7j-4|W^(rDPG8khKsZU@_G*&Wl6N$AXGVw&3z6|kt z5nY2Y&Me9&)9G}?035moQRJ~m#~3CZH3Ks~GgJAKrY#l2DW;>A!P$k&Llur3^6!l* z_`I#~8e8~%<*j%L5D0c0>LF+FUoVW2>BeN_Dy0Rk0Vqgu$U2GnGUQ{3Jh;gN5jX{{ zWi{!p$YVr@#M-_<;xh4d+cCq@ARI1-IJZ^H_s9v1a4B|}ah-^<+fTJqz$&L!ObnTy zw^x5%6Ub(yJVawAiOh3p%yw)$ zeOYfr%aGkOuKj*Np>_5B{Qj0{b~6~U_6sAvRH|HiprxsKZ01Grw(Hm2k25+~t6cu$ zz@&2||9oKL@<>ncfMk&9wR)0a3t6xM*jn@EQk*JJpu8Xy5js{qKz@&Pz=g4P(nJfR zQ?a{gf8rVfAw^&GBsrzD2*tS&#}E~i3l>N9JL}r3-V``Ynpy~-7dK^PB?Ja$kj(U> z>Eo5kC5DGQ)(?xy%5u%o?}S|g)taK6q`#4)*p5$&xj7bU6iAea=Qisz1-oX9o3v9B_B1)l9A7v> zhPbsg=H6-2ok#xAS7e;9l4XllL(?lgIT2Ffe z76M(jC&tQP)dHV%VKv!Hfb%G#)0Z#NMbwfEqLbEGj!aY9E`{W`Wb#izwu?%_s8VC2 z5;_=DKu@|;8U`8>*hM@=D$2R@zp1ES|58ycma`Rw?k~hb7FjRl8fucJLTQNTTM9Zt zB&$SVk0cY_2qAh?k!%m-YQ+Gqz*krwfJfVwk_NCs@8TjKAHLC0S-BY+{yxk{sFiM7`eAuVMl8}}JnU|5!4_!vqXmGDpE06#Z)$0V; zoI{8&4O6tCJZr0ku}m>_wD^Yo8W5J8XglPzKmrNU|4^Tdn;OVEQZ~$P9J+90p^YwK znE@=UdD^o@UtaKC=>c}Jy5|v0Rwn(YpW48^f{Z*+Cub)-wD-A2}ut&zH z<5$kNjgD6RF;0|(!!8hIgW)k8u_HD(s<3MBDtgioeWwULlTC3{k`C6J3353DK)1r3 z{!EHNEY%QaRh5h?^oZiF;*T}n*F_+9tE9|-w9UQJwe{?s8>PX=5A^I}ANFpOEmWP3 zemmO#W09dW@#a(~Y@FqA7PKw|H-JH^q{OLMpCDsMazfaVKv0BT9T@`Mv$pi|GW5+& zD;PkC2(tf+T{3dGc%gum=^2HO6qmD#RTTFfiU3UQjQB^Cf9X`!f5&m-|Kzy3%Be?> zqMl$C@|~HvONO1af=HpORBubsKv+chEJeviK;BDeU#L>;_C+^gC6D+z8+`|C!l3B^ zIx=1`q-29ErAX^B3C_)_>YmGOEhh)3*~4`tsA2a{v^K5B=?t#L|GYShg|}IZ=mqud%`Eizl{8x2rFgXkAF6K^@X)J_Ipg1B@X6JR+kz25l^p;FLp@Cd2AX7o!i%HtrIBJ8hiFH-^pOb9KyAH@HQ6gwK zqBQY@H0ZjJo45$@y{E=TEHR1XJAtB&iSwbitMrbv?l#~T-TAStLbmx`>G()-MbFFr zzAtmp7K6X~x$xE@pV9tzmgYvrWT^;62qkomhtNz1PX&b{+NIk`Jb7vgFRDoE0yHl$ zNe-Dii3b#eVcnq9v2-}Q9k&2e3PdW%qDZM_qH5H8){cEL?3)@Zm{hCq51Rb8zv=x7 zO2V%b{fDwAa<2lJcZ7tW_F0{^)cd@m2pFa;FitF-UA)5bNwwyz zF9YzB%_|z5q=?kWiDL1@*$s)?ZX)fI$$HJ6e!Jo8#&Waug~`6g-YMob-u+lJh$$KV^^P#DC7 z*qUrVHelO3nV1R>`!6f1x;j|J_(tTq8lNqZ{$m#a4y?^o)?(aW_13^x$GZ=2q<8fQ z2SOIN2sc!g72riB#A0?;dLR|NJw@5zO(w*1tLUClct#osiXKv|;or5VzTzs8Jpp(z z(nQ`2=5~zHfP)y}lf@gC_A^+iajQZesJE9XK_OQfb1F_U**wr7-LT-`)C;6VrT>~1 z@fZJje-Y2|3mx12nU@8aVF9vU4OiEb`?xKHUkx45Or;&b%2mmU?#4u*qI zbxvHqjm-({*W+~=uzZp$2E7X?CpinSWqt}VkfJ6HQz7nvsmsoVuvchHP>v&FE6)u~ zQYCYOP1bH;QkG*lrMHyF5xp9=(~BUn*aDUAI=^qG-}3Jj9qSp&D}KNpZvJzI9auLj ze8c+GraJ4wCy8QF3qKyzQ!7W^2uF$6yrAz0!|Gv}H@g}tiV(>br(FijIZ-Igooft4 zBfYKI1cQ}=4IR?PQXSU)vE0s=@UWlChd$i@c$5;h>#*iN*ER>beHc3@dC3c^nE1425#LB&={X9IUoohgW zM7RmGa;mN80;zBe`;FZ&UA%x8)RRYsWUw~?-+Hd@GHPs%$a>eD{U_PPQZ_Y$Y%D+9 zZhN_>qAG8B-Jy%sRFpb-wFkGFqXlD3rrO$=`x8!9iHdU3*tk{)nKLE@DLMfC8a>e% z_0*7^8#FO?FogW|cc%*;l=qvk!|D*ctY#?P$b1zmuT)m_-cCi~)km?KzS;@dSG=ZR zSD$-^U6#wAo?Wf7x4usz#mVF0{X5_6U9tW9@3jZNEebd{Kfic4>*Dgk)^i8?CS5w0 z#HOX(RTNon=Jwdf9DI?E#re{JEN5iyuH^4=(`6kg!<)4X9b9hmnH;w zvht*9AH1x(t#sXk5=naCe#Z4X`9&FcQPw7dk`<7X!pNiMOI$X|p|)~kOd6t`??pHw z@UCa56=}7#z0EzDttR}vzi+19(78D@N!Dd-ulK=y1FxU83dcS+Wd9(0{>>52K3@}& zP068VIqCrlIa%w%p5b&iJ3#z|$q}{gsx4=*tM(n&;HjQ} zj_U_2-FQci4YN;yBp93Z6YK49Fw`tuuYP=v0&v(DXjCg#%B~ux&mpFWRT;k%6qDnlVpQmo3Ip|qnw$(@{2l_byrz;NphX<)z>&$ z<%Wt$Xnj3T*YtWzShpGeWsbMcdta_EubHnGx^SMyyPY#FCD9(i#|FBencrJ+$AR33 z(|#*m1#%1O9NGtDg*F+|v~_E>-S;a(eVXe9uoU`q-6QDnQ}aBXAp%|R8Rb5W!UtuU z7|SinNW`__cikc-+qZOabprHhCt^%~a>nq`mq*P}NDM0KUmT+QGCXMPWsAjl_bQYH 
zcgX08a-sS0j?&b+u%mZpO3#)3Bg7^1aGJ5YJ4{{c1~5}KquVMs0Ixs9_@3KEo64Qt@bfZ=MR1S-SkfMri0%+*b#X_Jv`P=5L;$JtkYU588tHLD>?r@>Bnowtr-br?^2r6}l{6T|d!{tl(fOTkmq zm8EfQbgb;wjCU93_OmCwMx&k=dJON)cEi1!Ki%scb~oI&qv56MbVIg*N4dU&)Mdvg zu_LTS-SUgpev210^Cax|^IS9R_bVn+eZj zeoji{8ddOBT5Wmhm$>TnlJhJ&~E(g)#ZJ5Fy?tr&VI=&W45 zo6k)>s_2p-CEm_8Y0x$~@)Pn9*4*TJyIf;DTDn35iHE62SYCK8DdcxkF`gv7kcm@~ zqW}^><2aFedrkkG6oUX`b+tGzqiUna^MfaSZ241X-&)QLKR^L|D{o=G@0YTgdci<(`qQyI3mv zEYBXF4;bN$;oD*Ti0Zg=>h4|K()yY(D_x)JILGQ}jT*W`=TDuR4((=7>b6u)o>u(7X|~N6SDgP4WzdBMB6{ zEi*1Fd_~&n_!{(X6+SapWp_i+o58l0T<7R2in@59kQ-O8M!rI1LJC<7$ zz)x}ZwNj{hKtND3`S4ExIE&31v~}IUmsZncYPST(8NB3duc;uaw+yKq1()xt&MnH6 z1V}@Oni#9cn=>V~0vhHbt;JFKTIHLpT~*!n;y`rE+qKQp_q$#e1{ZB2Wk0ia-qM~2 z(&<|b!w&ZTPCDOs(MDHau8yZoca;iGKF=g4h3aj9Kw`7_mV08XSo(wx;1G)%6c{JuVr26g`O>x2^eYsBs8T zM*#m9odb*Y0h@omWvd`P6h1=^{F-EQZ(wl^CqUdPACEepmlR9zCf6|vUw@^i^H)w- z79y<%z_(GQvlP~KTVcD}L`u)8%Vg=3-P_~+TFS?7y=;~?ml>@}(X-TC<0Dc9R@Z5U6 zl~f6dhA13|D$m4I!$jcN#WgU2QJh?^H8{{%bH4`!S5{^i*4h5(sD*Krt$pL_q4p0A z+s?bswDsQaC>XSxi^vd3*g4#Yy>hL~r7G()!1YUbOw=qBAnQteLdXH*d?tn7V3&f2 zdRX5?pwfUxYOlPmn`;n2jOiAcy}Tp@KYC6Q){7uFanT0XJG$$PCc29cn3?e>*n1XM z+~~Y-f#=K`PhWKHnxXsM+cQz@;clPv9`}Me8lPu9cy2HGLl-zb580r-n77pjkPI~* zpaTBfZBWuh9Js>kp{^c1PoQOz%QjgPSV=_AwsFRUdS(3XM0ReSitoLEhhdJ?Yp@7lWiXS zk)T5s(;0Dlnwpwk2k#3^Cmc6rVLp;kQ1`<@Lyz*~J!7@c2c{a{+{{NMAXEk}foGVu?p3_}-BNs%7EnWJrO4otrId=qGb* zOCHS+Aqa0~Io>yT_T_jCRc$>GKJ=LX=cU4)M{1Uqcgv3&eb~QD0re_J@pPthoEL$( zpwv8BlJX#Vro6Re6^E!v->L^u&KWfHFnEQqsR`yU>L#96s?N2PEHHjI*XBE(w7(&)LTxwtZ*Q_NG@$u3b}0uzm2D|EcTOmfe*> z8x>ve4H)UWN7ToXn*&)7&{IoiB?s`sW$jX&woR~~AcT_T$Z%TYFdh-*Rb;Z=f_@?!^fiCu7JZo!;)0xERl<36G67o8K+~*~& z0Kt`iBnBP!|Mcj<?RGI zz$6fVz9ZhUorPt1dYllC{(^lxC`nhco9>dR7|Uj{({{i4eqEN(4ZZlkzZ8?VU@A@ z5ZxarE)AG)2>S4>fuX zbbY$6F8sqd!DG{w_gR6I%jL>2O(yndTvbV4isAeM9iDv`;IP%5N6mU?8%euFFvtfA zOu-mRd^4OOe2}jDngxrzSsRm$G@T-T@%r>TI$UoZYW4VOr>RqAjQ3(l;@7%Gt zRe@&w{9)HS_cu_PHE(|UMm7hOu0ze2Y~L`uR}fDyRsEeNJnmD;b_;lqJc-sVce5kP z9lV^0qSjStumiVNU7^$SG4(alR8AY4`aX2>L-T>crZI49n4ixlfq{r-itz9zx_+%I zaG!V+C4F|s4$Fn{Uw@-OOVtsVkYFj}2xn(9G7q?9+Gh(|JL!Dslj;V(ZImI${SA}TaS%xfAea4)4Qi{9<|Di z_m^xHdW9Sw&q$Hm_Yf%J4yoj=AF(i#w$9fJJwSAWQ;;Ew2|>7PrX(c`<}Q(r*RbO? 
zM7Yjm-m93aZ9~VC0|3D+o^idz)j=W+!w{U3p-}J+=>84~GWy1hM@B0<#lh2L^9{dRXwn^0lgY@loSO zThn=)4t;lU4!%nkZbYz!x57j{prP8ai*Sicrja@-M0D8XzH7Jv8!kevU_PV+lOPhq zgzXgVVBco#B3w1IN=)|9&+OvBqoKnsVLG|njs+e&b}TTEC`Ey7jgQ5x;^N{8pzM?A z{;6LZavw-%tDV~wXug%WLc)>c5Z$|w1bEtHxQT=TXmSI3Geg>w1-=wnB%}QDy7$kY z4c@O?e{WR;Y;b62N55*z-P*@4V?ly9j|Og?1!49KTq*Ex2?{?P?BL3i^ZtVuC{A4@ z0bq84DN=5sLY^|*!6nvKxNWF1m`%OS;1eKwpQfyCma(RfeTpQ_`03tNZQBC-XF6(g z3;V*uSE+vN@9pjF?-%uELh_3j@$vD`?h7V*2dhtSQ=1;jUTzuwFh$|BcU# zx3;T#+cY0Esy-Yu-!y?;XWLu8Yn!{*{(ZFhi$*T%jZvZr3-%;A6(j&@_cV_ z@4Jc4n>Q`wJHkn}3U_Ob(<1&90uC=q!ryT=Bhu<9Nr96^Cooo00eBWDIf|`L((NXK zGJFjq-Tn>XnHnY z-U84oHR~zo7QXRGJTez#A(}q;iAZ4+k=cOHMHB-_7DEiBPJ6&5xm99)&5-=ntAoFN zFe~hhZ>cPuu(K)r*gPX^7H!!~ppv2HlQ7l#!t(CWyP=0m18wgf_w3m0Q7xFLdA>jN z?^d}Yb4xf+BF2%pjaro0k!%=kRKHbjIZ3?zhg91~HDNl&MtW46PMmvpHyo{W?p;vQ zro@4o6K7Pq{~pS_z0snVSWy9I)Y z?OQENK3^$3M3Au75<_r*N1r(y5+-FJEHiMt@-zx@Z9J~Pm8s(eG(>qbGpPRfi6l!x zH%|?wo%5lou099$k3%1h?hOq%5HNW(;m{CO-l85U&z3JG}@)DUR~>V=);Y8 zGh5Y!E=9BpX&)Gfq|yk~6K<<tifB3|NFjC+^@hlZGhhkZuu%P&R)rf64rAcj1Wn zE7@!7Gj%PpP}bgN`xk*T-L->F3MR12#SS}mB$$Y9hnD!K z+d=9~Zb#!^Zb#$f{k?0J91rN(&-M!qN+GppM%>{u<#CQw_HI%917f~;qqul{YUH;o z1=;+E1!e(@nAT9t3L^ItKsXo}AlAx_kcE;QQMJ_LZUX$;B4WtmO)`Ev zAGhcA`iTUy+--r89qOpSy~5s#ini%lDQRzF&c88q<=M4s2|wGkCH-UQ;nmLU@I&2$ zSJyoH$pHQxnQkgywQkv8)iBiXyC%tXT3V(X zn>;+`JhCmZDe)K`(8+f3T=ByX2M-=1f!6fY@$I6f21{I&z!$7h!FMS{lEmRSmv-)LK|=N0cWYAde2`(NRNQnRoTT(Z zbonzM8Xhi{E$(r#gViCPknL8?&QIxWwxSgezG2(Z0hRz z+2mbtaaPp}1nZ7`4~Xu}7Tt%wfPn~=!+;ATS%K1A5qK?2+*bF1DqF)A3 zMsE$L$DhY5N6Hv4{CfSpqvd?ldu4SJl+)jjUnv;&XdyabiOtiF8`8mlhm5D_xg!v3 zZ3T)&p;16&#OE*S@T>}JU)G&0Hpl$#+L?~?B_kOWm5O^IpVR^YSj0MWhnmw2_pBFw z@^?t^aAkTG-K_ld3QCq1Wt4A>y1?1aXE=fBOOg?JGH}&XTO;#2?h7=(mG!HbmJNB_ z|4~<_THXN)oDXhQYZ?nta@)S;WO00}0^$0TProv-yfyW?bN;4->hzTmiYXBR#@e%h z9uEK)Y`N~lAE6FqTS6qXC~00%iF_ulo^HoJ*D`oxp?J${SiK4oYU@;4dNDuZ8CzGiUqXKMDGt3yW7C_Z9gau5fjyDu3dYGQafGSgPF5 zzLIm+272Bd8NXjDeIX{okxJ^9?@BUHJh0%)^;$-|Rj*yclO7+j{P+#uU%`%Ie&^1o zZrxh7mY^T|=bPc8xv?&)+~Roi(u~KegX$YPe705&PT`6l>z!E0!>jBj^Avd^0%-N* zNz*snqpgdSq?{B#sh*n-dGw;T#-Z$;g2erk(}RYiUwBu*K6_sfl}Au7T_v(()AZG0 zkEvU5F9OPAjcP7|)U~N3>xK`6Q=`T-Xfw}9b7ZP>4?I!M-oCza@O`svWidbSE4%$^ zX*ch!TcXxn|BGeE2>wo&q>-`wRgH|$cLX;9RK{x z4XR}76|I1q#(Tr&%8m$qNv08gre2J?@~6Z^@dx5isr^%;3kMB#9{1TSCMFqVZ@pm) zBQLZlO-(p7ZaAa%B*7y};Twm2UA)cC?yv3+*q&vZn3v!m6tXF4#+KaQ=>aFOoRTbK zHScTld%*!AegU@&-=&DPo@d%s??amK$88)N89m0}yXx`5t1{U{{WMfFN_9<)Zw-A~ zc|dq9Bq$g&Y~403$(1&8cJt;)#07<>8oIZ)cQiF+`BmH~d$MJX<~b>9bFzM_`IM*_+$TAZycK0QOuUHqEq<3PD zTjdCB-`|a%v*(M?^-O%KG)k|2)FrZ_5##2lu9t z6@hma!`qW10w6a*6-D$LdE6*Df=KvXY0cRWc2ly^oBgw8aW1oewqxP{|3v@)jPm|p d*HIf^qJhG^Z_GX@E`}fW8=bAMTkSpZKLLpmoKye+ literal 0 HcmV?d00001 diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/Contents.json b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/Contents.json new file mode 100644 index 000000000..73c00596a --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/ContentView.swift b/transformers/llm/engine/ios/mnn-llm/mnn-llm/ContentView.swift new file mode 100644 index 000000000..11ac631b1 --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/ContentView.swift @@ -0,0 +1,152 @@ +// +// ContentView.swift +// mnn-llm +// +// Created by wangzhaode on 2023/12/14. 
+// + +import Combine +import SwiftUI + +class ChatViewModel: ObservableObject { + @Published var messages: [Message] = [] + @Published var isModelLoaded = false // 模型是否加载完成 + @Published var isProcessing: Bool = false // 标志表示是否有正在处理的LLM响应 + private var llm: LLMInferenceEngineWrapper? + + init() { + self.messages.append(Message(id: UUID(), text: " 模型加载中, 请稍等 ...", isUser: false)) + llm = LLMInferenceEngineWrapper { [weak self] success in + DispatchQueue.main.async { + self?.isModelLoaded = success + var loadresult = "模型加载完毕!" + if !success { + loadresult = "模型加载失败!" + } + self?.messages.append(Message(id: UUID(), text: loadresult, isUser: false)) + } + } + } + + func sendInput(_ input: String) { + // 将用户输入作为新消息添加 + let userMessage = Message(id: UUID(), text: input, isUser: true) + DispatchQueue.main.async { + self.messages.append(userMessage) + } + isProcessing = true + // 在后台线程处理耗时的输入 + DispatchQueue.global(qos: .userInitiated).async { + self.llm?.processInput(input) { [weak self] output in + // 切换回主线程来更新UI + DispatchQueue.main.async { + if (output.contains("")) { + self?.isProcessing = false + } else { + self?.appendResponse(output) + } + } + } + } + } + + private func appendResponse(_ output: String) { + if let lastMessage = messages.last, !lastMessage.isUser { + // 创建一个更新后的消息 + var updatedMessage = messages[messages.count - 1] + updatedMessage.text += output + // 替换数组中的旧消息 + self.messages[messages.count - 1] = updatedMessage + } else { + let newMessage = Message(id: UUID(), text: output, isUser: false) + self.messages.append(newMessage) + } + } +} + + +struct Message: Identifiable, Equatable { + let id: UUID + var text: String + let isUser: Bool +} + +struct ChatBubble: View { + let message: Message + + var body: some View { + HStack { + if message.isUser { + Spacer() + } + + Text(message.text) + .padding(10) + .foregroundColor(message.isUser ? .white : .black) + .background(message.isUser ? Color.blue : Color.gray.opacity(0.2)) + .cornerRadius(10) + .frame(maxWidth: 400, alignment: message.isUser ? .trailing : .leading) + + if !message.isUser { + Spacer() + } + } + .transition(.scale(scale: 0, anchor: message.isUser ? 
.bottomTrailing : .bottomLeading)) + } +} + +struct ChatView: View { + @StateObject var viewModel = ChatViewModel() + @State private var inputText: String = "" + + var body: some View { + NavigationView { // 包裹在 NavigationView 中 + VStack { + ScrollView { + ScrollViewReader { scrollView in + VStack(alignment: .leading, spacing: 10) { + ForEach(viewModel.messages) { message in + ChatBubble(message: message) + } + } + .padding(.horizontal) + .onChange(of: viewModel.messages) { _ in + scrollView.scrollTo(viewModel.messages.last?.id, anchor: .bottom) + } + } + } + + HStack { + TextField("Type a message...", text: $inputText) + .textFieldStyle(RoundedBorderTextFieldStyle()) + .frame(minHeight: 44) + + Button(action: { + viewModel.sendInput(inputText) + inputText = "" + }) { + Image(systemName: "arrow.up.circle.fill") + .resizable() + .aspectRatio(contentMode: .fit) + .frame(width: 44, height: 44) + } + .disabled(inputText.isEmpty || viewModel.isProcessing || !viewModel.isModelLoaded) + } + .padding() + } + .navigationBarTitle("mnn-llm", displayMode: .inline) // 设置标题 + } + } +} + +extension String { + var isBlank: Bool { + return allSatisfy({ $0.isWhitespace }) + } +} + +struct ChatView_Previews: PreviewProvider { + static var previews: some View { + ChatView() + } +} diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.h b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.h new file mode 100644 index 000000000..28374c06d --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.h @@ -0,0 +1,29 @@ +// +// LLMInferenceEngineWrapper.h +// mnn-llm +// +// Created by wangzhaode on 2023/12/14. +// + +#ifndef LLMInferenceEngineWrapper_h +#define LLMInferenceEngineWrapper_h + + +// LLMInferenceEngineWrapper.h +#import + +NS_ASSUME_NONNULL_BEGIN + +typedef void(^ModelLoadingCompletionHandler)(BOOL success); +typedef void (^StreamOutputHandler)(NSString * _Nonnull output); + +@interface LLMInferenceEngineWrapper : NSObject + +- (instancetype)initWithCompletionHandler:(ModelLoadingCompletionHandler)completionHandler; +- (void)processInput:(NSString *)input withStreamHandler:(StreamOutputHandler)handler; + +@end + +NS_ASSUME_NONNULL_END + +#endif /* LLMInferenceEngineWrapper_h */ diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm new file mode 100644 index 000000000..4d05379a4 --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm @@ -0,0 +1,106 @@ +// +// LLMInferenceEngineWrapper.m +// mnn-llm +// +// Created by wangzhaode on 2023/12/14. 
+// + +#import "LLMInferenceEngineWrapper.h" +#include +using namespace MNN::Transformer; + +const char* GetMainBundleDirectory() { + NSString *bundleDirectory = [[NSBundle mainBundle] bundlePath]; + return [bundleDirectory UTF8String]; +} + +@implementation LLMInferenceEngineWrapper { + std::shared_ptr llm; +} + +- (instancetype)initWithCompletionHandler:(ModelLoadingCompletionHandler)completionHandler { + self = [super init]; + if (self) { + // 在后台线程异步加载模型 + dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{ + BOOL success = [self loadModel]; // 假设loadModel方法加载模型并返回加载的成功或失败 + // 切回主线程回调 + dispatch_async(dispatch_get_main_queue(), ^{ + completionHandler(success); + }); + }); + } + return self; +} + +- (BOOL)loadModel { + if (!llm) { + std::string model_dir = GetMainBundleDirectory(); + std::string config_path = model_dir + "/config.json"; + llm.reset(Llm::createLLM(config_path)); + NSString *tempDirectory = NSTemporaryDirectory(); + llm->set_config("{\"tmp_path\":\"" + std::string([tempDirectory UTF8String]) + "\", \"use_mmap\":true}"); + llm->load(); + } + return YES; +} + +- (void)processInput:(NSString *)input withStreamHandler:(StreamOutputHandler)handler { + LlmStreamBuffer::CallBack callback = [handler](const char* str, size_t len) { + if (handler) { + NSString *nsOutput = [NSString stringWithUTF8String:str]; + handler(nsOutput); + } + }; + LlmStreamBuffer streambuf(callback); + std::ostream os(&streambuf); + if (std::string([input UTF8String]) == "benchmark") { + // do benchmark + std::string model_dir = GetMainBundleDirectory(); + std::string prompt_file = model_dir + "/bench.txt"; + std::ifstream prompt_fs(prompt_file); + std::vector prompts; + std::string prompt; + while (std::getline(prompt_fs, prompt)) { + // prompt start with '#' will be ignored + if (prompt.substr(0, 1) == "#") { + continue; + } + std::string::size_type pos = 0; + while ((pos = prompt.find("\\n", pos)) != std::string::npos) { + prompt.replace(pos, 2, "\n"); + pos += 1; + } + prompts.push_back(prompt); + } + int prompt_len = 0; + int decode_len = 0; + int64_t prefill_time = 0; + int64_t decode_time = 0; + for (int i = 0; i < prompts.size(); i++) { + llm->response(prompts[i], &os, "\n"); + prompt_len += llm->prompt_len_; + decode_len += llm->gen_seq_len_; + prefill_time += llm->prefill_us_; + decode_time += llm->decode_us_; + } + float prefill_s = prefill_time / 1e6; + float decode_s = decode_time / 1e6; + os << "\n#################################\n" + << "prompt tokens num = " << prompt_len << "\n" + << "decode tokens num = " << decode_len << "\n" + << "prefill time = " << std::fixed << std::setprecision(2) << prefill_s << " s\n" + << " decode time = " << std::fixed << std::setprecision(2) << decode_s << " s\n" + << "prefill speed = " << std::fixed << std::setprecision(2) << prompt_len / prefill_s << " tok/s\n" + << " decode speed = " << std::fixed << std::setprecision(2) << decode_len / decode_s << " tok/s\n" + << "##################################\n"; + os << ""; + } else { + llm->response([input UTF8String], &os, ""); + } +} + +- (void)dealloc { + llm.reset(); +} +@end diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/Preview Content/Preview Assets.xcassets/Contents.json b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Preview Content/Preview Assets.xcassets/Contents.json new file mode 100644 index 000000000..73c00596a --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Preview Content/Preview Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + 
"author" : "xcode", + "version" : 1 + } +} diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn-llm-Bridging-Header.h b/transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn-llm-Bridging-Header.h new file mode 100644 index 000000000..208d3edbb --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn-llm-Bridging-Header.h @@ -0,0 +1,5 @@ +// +// Use this file to import your target's public headers that you would like to expose to Swift. +// + +#import "LLMInferenceEngineWrapper.h" diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn_llmApp.swift b/transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn_llmApp.swift new file mode 100644 index 000000000..c0585da9f --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/mnn_llmApp.swift @@ -0,0 +1,17 @@ +// +// mnn_llmApp.swift +// mnn-llm +// +// Created by wangzhaode on 2023/12/14. +// + +import SwiftUI + +@main +struct mnn_llmApp: App { + var body: some Scene { + WindowGroup { + ChatView() + } + } +} diff --git a/transformers/llm/engine/llm_demo.cpp b/transformers/llm/engine/llm_demo.cpp index 3e41b2eb0..1200957c0 100644 --- a/transformers/llm/engine/llm_demo.cpp +++ b/transformers/llm/engine/llm_demo.cpp @@ -8,6 +8,7 @@ #include "llm/llm.hpp" #define MNN_OPEN_TIME_TRACE #include +#include #include #include #include @@ -160,14 +161,19 @@ int main(int argc, const char* argv[]) { std::cout << "Usage: " << argv[0] << " config.json " << std::endl; return 0; } + MNN::BackendConfig backendConfig; + auto executor = MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, backendConfig, 1); + MNN::Express::ExecutorScope s(executor); + std::string config_path = argv[1]; std::cout << "config path is " << config_path << std::endl; std::unique_ptr llm(Llm::createLLM(config_path)); + llm->set_config("{\"tmp_path\":\"tmp\"}"); { AUTOTIME; llm->load(); } - if (false) { + if (true) { AUTOTIME; trace_prepare(llm.get()); } diff --git a/transformers/llm/engine/model/bench.txt b/transformers/llm/engine/model/bench.txt new file mode 100644 index 000000000..87e49b7f1 --- /dev/null +++ b/transformers/llm/engine/model/bench.txt @@ -0,0 +1,4 @@ +计算8乘以12 +将下面的句子翻译成中文:It's a beautiful day to learn something new. 
+描述优秀的领导者应具备的五个特质,并解释每个特质为什么重要 +近年来,随着技术的快速发展和全球化的深入推进,数字经济已成为推动世界经济增长的新引擎。数字经济不仅改变了人们的生活方式,促进了信息和资源的快速流通,还重塑了传统行业的业务模式和竞争格局。尽管数字经济的发展为全球经济增长提供了新的动能,但同时也带来了数据安全、隐私保护、数字鸿沟和市场垄断等一系列挑战。考虑到这些背景,请详细分析数字经济在促进世界经济增长方面的作用,包括但不限于数字经济对提高生产效率、创造就业机会和促进可持续发展的贡献。同时,探讨如何应对数字经济发展过程中出现的挑战,具体包括如何保护个人数据安全和隐私、缩小数字鸿沟以确保数字经济的包容性和公平性,以及如何制定有效政策以避免市场垄断情况的出现,最终实现数字经济的健康和可持续发展。 \ No newline at end of file diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp index e01350eb9..d11254110 100644 --- a/transformers/llm/engine/src/llm.cpp +++ b/transformers/llm/engine/src/llm.cpp @@ -85,10 +85,19 @@ void Llm::init_runtime() { BackendConfig cpuBackendConfig; config.type = backend_type_convert(config_->backend_type()); config.numThread = config_->thread_num(); - if (config_->memory() == "low") { + if (config_->power() == "high") { + cpuBackendConfig.power = BackendConfig::Power_High; + } else if (config_->power() == "low") { + cpuBackendConfig.power = BackendConfig::Power_Low; + } + if (config_->memory() == "high") { + cpuBackendConfig.memory = BackendConfig::Memory_High; + } else if (config_->memory() == "low") { cpuBackendConfig.memory = BackendConfig::Memory_Low; } - if (config_->precision() == "low") { + if (config_->precision() == "high") { + cpuBackendConfig.precision = BackendConfig::Precision_High; + } else if (config_->precision() == "low") { cpuBackendConfig.precision = BackendConfig::Precision_Low; } config.backendConfig = &cpuBackendConfig; @@ -97,10 +106,16 @@ void Llm::init_runtime() { runtime_manager_.reset(Executor::RuntimeManager::createRuntimeManager(config)); runtime_manager_->setHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 0); runtime_manager_->setHint(MNN::Interpreter::DYNAMIC_QUANT_OPTIONS, 1); // 1: per batch quant, 2: per tensor quant - runtime_manager_->setHint(MNN::Interpreter::KVCACHE_QUANT_OPTIONS, config_->quant_kv()); + runtime_manager_->setHint(MNN::Interpreter::QKV_QUANT_OPTIONS, config_->quant_qkv()); runtime_manager_->setHint(MNN::Interpreter::KVCACHE_SIZE_LIMIT, config_->kvcache_limit()); - runtime_manager_->setExternalPath("/tmp/.kvcache", MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR); - + std::string tmpPath = config_->tmp_path(); + if (config_->kvcache_mmap()) { + runtime_manager_->setExternalPath(tmpPath, MNN::Interpreter::EXTERNAL_PATH_KVCACHE_DIR); + } + if (config_->use_mmap()) { + runtime_manager_->setExternalPath(tmpPath, MNN::Interpreter::EXTERNAL_WEIGHT_DIR); + } + #if DEBUG_MODE==1 runtime_manager_->setMode(MNN::Interpreter::Session_Debug); _initTimeTrace(); @@ -154,7 +169,7 @@ void Llm::load() { {"input_ids", "attention_mask", "position_ids", "past_key_values"}, {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config)); } - MNN_PRINT("Done!\n"); + MNN_PRINT("Load Module Done!\n"); } else { MNN_ERROR("Split version is depercerate\n"); } @@ -162,6 +177,8 @@ void Llm::load() { for (int v=0; vreuse_kv()) { + response(user_str); + } else { + history.emplace_back(std::make_pair("user", user_str)); + auto assistant_str = response(history); + history.emplace_back(std::make_pair("assistant", assistant_str)); + } std::cout << std::endl; } } @@ -777,10 +798,12 @@ float Embedding::dist(VARP var0, VARP var1) { return dist; } -Embedding* Embedding::createEmbedding(const std::string& config_path) { +Embedding* Embedding::createEmbedding(const std::string& config_path, bool load) { std::shared_ptr config(new LlmConfig(config_path)); Embedding* embedding = new Embedding(config); - embedding->load(); + if (load) { + embedding->load(); + } 
return embedding; } @@ -808,10 +831,9 @@ void Embedding::load() { MNN_PRINT("Done!\n"); } -VARP Embedding::embedding(const std::string& txt) { - auto ids = tokenizer(txt); +VARP Embedding::ids_embedding(const std::vector& ids) { int prompt_len = ids.size(); - auto inputs_ids = _Const(ids.data(), {prompt_len}, NCHW, halide_type_of()); + auto inputs_ids = embedding(ids); auto attention_mask = gen_attention_mask(prompt_len); auto position_ids = gen_position_ids(prompt_len); auto outputs = modules_[0]->onForward({inputs_ids, attention_mask, position_ids}); @@ -819,12 +841,12 @@ VARP Embedding::embedding(const std::string& txt) { return sentence_embeddings; } +VARP Embedding::txt_embedding(const std::string& txt) { + return ids_embedding(tokenizer(txt)); +} + std::vector Embedding::tokenizer(const std::string& query) { - auto prompt = query; - if (query.size() <= 256) { - prompt = "为这个句子生成表示以用于检索相关文章:" + query; - } - prompt = apply_prompt_template(prompt); + auto prompt = apply_prompt_template(query); auto ids = tokenizer_->encode(prompt); return ids; } diff --git a/transformers/llm/engine/src/llmconfig.hpp b/transformers/llm/engine/src/llmconfig.hpp index 57cc924a8..22b66c895 100644 --- a/transformers/llm/engine/src/llmconfig.hpp +++ b/transformers/llm/engine/src/llmconfig.hpp @@ -241,13 +241,16 @@ class LlmConfig { std::string precision() const { return config_.value("precision", "low"); } + std::string power() const { + return config_.value("power", "normal"); + } std::string memory() const { return config_.value("memory", "low"); } - int quant_kv() const { - return config_.value("quant_kv", 0); + int quant_qkv() const { + return config_.value("quant_qkv", 0); } int kvcache_limit() const { @@ -264,6 +267,16 @@ class LlmConfig { return llm_config_.value("is_visual", false); } + bool use_mmap() const { + return config_.value("use_mmap", false); + } + bool kvcache_mmap() const { + return config_.value("kvcache_mmap", false); + } + std::string tmp_path() const { + return config_.value("tmp_path", ""); + } + int hidden_size() const { return llm_config_.value("hidden_size", 4096); } diff --git a/transformers/llm/export/README.md b/transformers/llm/export/README.md index bc72b39da..bdd38a9de 100644 --- a/transformers/llm/export/README.md +++ b/transformers/llm/export/README.md @@ -4,156 +4,85 @@ llm-export是一个llm模型导出工具,能够将llm模型导出为onnx和mnn模型。 -- 🚀 均完成`onnxruntime`正确性测试 - 🚀 优化原始代码,支持动态形状 - 🚀 优化原始代码,减少常量部分 -- 🚀 使用[OnnxSlim](https://github.com/WeLoveAI/OnnxSlim)优化onnx模型,性能提升约5%; by [@inisis](https://github.com/inisis) +- 🚀 使用[OnnxSlim](https://github.com/inisis/OnnxSlim)优化onnx模型,性能提升约5%; by [@inisis](https://github.com/inisis) - 🚀 支持将lora权重导出为onnx和mnn +- 🚀 Onnx推理代码[OnnxLLM](https://github.com/inisis/OnnxLLM) -## 模型支持与下载 -- [![Download][download-chatglm-6b-onnx]][release-chatglm-6b-onnx] -- [![Download][download-chatglm2-6b-onnx]][release-chatglm2-6b-onnx] -- [![Download][download-chatglm3-6b-onnx]][release-chatglm3-6b-onnx] -- [![Download][download-codegeex2-6b-onnx]][release-codegeex2-6b-onnx] -- [![Download][download-qwen-7b-chat-onnx]][release-qwen-7b-chat-onnx] -- [![Download][download-baichuan2-7b-chat-onnx]][release-baichuan2-7b-chat-onnx] -- [![Download][download-llama2-7b-chat-onnx]][release-llama2-7b-chat-onnx] -- [![Download][download-qwen-1.8b-chat-onnx]][release-qwen-1.8b-chat-onnx] -- [![Download][download-phi-2-onnx]][release-phi-2-onnx] -- [![Download][download-internlm-7b-onnx]][release-internlm-7b-onnx] -- [![Download][download-qwen-vl-onnx]][release-qwen-vl-onnx] -- 
[![Download][download-bge-large-zh-onnx]][release-bge-large-zh-onnx] -- [![Download][download-tinyllama-1.1b-chat-onnx]][release-tinyllama-1.1b-chat-onnx] -- [![Download][download-yi-6b-chat-onnx]][release-yi-6b-chat-onnx] -- [![Download][download-deepseek-7b-chat-onnx]][release-deepseek-7b-chat-onnx] -- [![Download][download-qwen1.5-0.5b-chat-onnx]][release-qwen1.5-0.5b-chat-onnx] -- [![Download][download-qwen1.5-1.8b-chat-onnx]][release-qwen1.5-1.8b-chat-onnx] -- [![Download][download-qwen1.5-4b-chat-onnx]][release-qwen1.5-4b-chat-onnx] -- [![Download][download-qwen1.5-7b-chat-onnx]][release-qwen1.5-7b-chat-onnx] -- [![Download][download-llama3-8b-instruct-onnx]][release-llama3-8b-instruct-onnx] +## 安装 +```sh +# pip install +pip install llmexport -[download-chatglm-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm-6b-onnx/total -[download-chatglm2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm2-6b-onnx/total -[download-chatglm3-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/chatglm3-6b-onnx/total -[download-codegeex2-6b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/codegeex2-6b-onnx/total -[download-qwen-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen-7b-chat-onnx/total -[download-baichuan2-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/baichuan2-7b-chat-onnx/total -[download-llama2-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/llama2-7b-chat-onnx/total -[download-qwen-1.8b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen-1.8b-onnx/total -[download-phi-2-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/phi-2-onnx/total -[download-internlm-7b-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/internlm-7b-onnx/total -[download-qwen-vl-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen-vl-onnx/total -[download-bge-large-zh-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/bge-large-zh-onnx/total -[download-tinyllama-1.1b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/tinyllama-1.1b-chat-onnx/total -[download-yi-6b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/yi-6b-chat-onnx/total -[download-deepseek-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/deepseek-7b-chat-onnx/total -[download-qwen1.5-0.5b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-0.5b-chat-onnx/total -[download-qwen1.5-1.8b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-1.8b-chat-onnx/total -[download-qwen1.5-4b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-4b-chat-onnx/total -[download-qwen1.5-7b-chat-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/qwen1.5-7b-chat-onnx/total -[download-llama3-8b-instruct-onnx]: https://img.shields.io/github/downloads/wangzhaode/llm-export/llama3-8b-instruct-onnx/total -[release-chatglm-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm-6b-onnx -[release-chatglm2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm2-6b-onnx -[release-chatglm3-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/chatglm3-6b-onnx -[release-codegeex2-6b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/codegeex2-6b-onnx 
-[release-qwen-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen-7b-chat-onnx -[release-baichuan2-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/baichuan2-7b-chat-onnx -[release-llama2-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/llama2-7b-chat-onnx -[release-qwen-1.8b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen-1.8b-onnx -[release-phi-2-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/phi-2-onnx -[release-internlm-7b-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/internlm-7b-onnx -[release-qwen-vl-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen-vl-onnx -[release-bge-large-zh-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/bge-large-zh-onnx -[release-tinyllama-1.1b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/tinyllama-1.1b-chat-onnx -[release-yi-6b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/yi-6b-chat-onnx -[release-deepseek-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/deepseek-7b-chat-onnx -[release-qwen1.5-0.5b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-0.5b-chat-onnx -[release-qwen1.5-1.8b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-1.8b-chat-onnx -[release-qwen1.5-4b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-4b-chat-onnx -[release-qwen1.5-7b-chat-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/qwen1.5-7b-chat-onnx -[release-llama3-8b-instruct-onnx]: https://github.com/wangzhaode/llm-export/releases/tag/llama3-8b-instruct-onnx +# git install +pip install git+https://github.com/wangzhaode/llm-export@master -## 用法 -1. 将该项目clone到本地 -```sh -git clone git@github.com:wangzhaode/llm-export.git +# local install +git clone https://github.com/wangzhaode/llm-export && cd llm-export/ +pip install . ``` -2. 将需要导出的LLM项目clone到本地,如:chatglm2-6b + +## 用法 + +1. 将需要导出的LLM项目clone到本地,如:chatglm2-6b ```sh git clone https://huggingface.co/THUDM/chatglm2-6b # 如果huggingface下载慢可以使用modelscope git clone https://modelscope.cn/ZhipuAI/chatglm2-6b.git ``` -3. 执行LLMExporter导出模型 +2. 
导出模型 ```sh -cd mnn-llm -# 将chatglm2-6b分为embedding, blocks, lm分别导出为onnx并转换为mnn, 并导出tokenizer.txt -python llm_export.py \ - --path ../chatglm2-6b \ - --export_split \ - --export_token \ - --export_mnn \ - --onnx_path ./chatglm2-6b-onnx \ - --mnn_path ./chatglm2-6b-mnn +# 将chatglm2-6b导出为onnx模型 +llmexport --path ../chatglm2-6b --export onnx +# 将chatglm2-6b导出为mnn模型, 量化参数为4bit, block-wise = 128 +llmexport --path ../chatglm2-6b --export mnn --quant_bit 4 --quant_block 128 ``` ## 功能 -- 支持将模型完整导出为一个onnx模型,使用`--export` -- 支持将模型分段导出为多个模型,使用`--export_split` -- 支持导出模型的词表到一个文本文件,每行代表一个token;其中token使用base64编码;使用`--export_verbose` -- 支持导出模型的Embedding层为一个onnx模型,使用`--export_embed`,同时支持bf16格式,使用`--embed_bf16` -- 支持分层导出模型的block,使用`--export_blocks`导出全部层;使用`--export_block $id`导出指定层 -- 支持导出模型的lm_head层为一个onnx模型,使用`--export_lm` -- 支持导出多模态模型的visual模型为一个onnx模型,使用`--export_visual` - 支持对模型进行对话测试,使用`--test $query`会返回llm的回复内容 -- 支持在导出onnx模型后使用onnxruntime对结果一致性进行校验,使用`--export_test` -- 支持将tokenizer导出为文本文件,使用`--export_token` -- 支持将导出的onnx模型转换为mnn模型,默认转换为非对称4bit量化,使用`--export_mnn` -- 指定导出路径使用`--onnx_path`和`--mnn_path` - 默认会使用onnx-slim对onnx模型进行优化,跳过该步骤使用`--skip_slim` - 支持合并lora权重后导出,指定lora权重的目录使用`--lora_path` +- 指定量化bit数使用`--quant_bit`;量化的block大小使用`--quant_block` +- 使用`--lm_quant_bit`来指定lm_head层权重的量化bit数,不指定则使用`--quant_bit`的量化bit数 +- 支持使用自己编译的`MNNConvert`,使用`--mnnconvert` ## 参数 ``` -usage: llm_export.py [-h] --path PATH - [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora}] - [--lora_path LORA_PATH] [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] - [--export_split] [--export_token] [--export_embed] [--export_visual] [--export_lm] [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bin] - [--embed_bf16] [--skip_slim] +usage: llmexport.py [-h] --path PATH [--type TYPE] [--lora_path LORA_PATH] [--dst_path DST_PATH] [--test TEST] [--export EXPORT] + [--skip_slim] [--quant_bit QUANT_BIT] [--quant_block QUANT_BLOCK] [--lm_quant_bit LM_QUANT_BIT] + [--mnnconvert MNNCONVERT] llm_exporter -optional arguments: +options: -h, --help show this help message and exit --path PATH path(`str` or `os.PathLike`): Can be either: - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO] - A path to a *directory* clone from repo like `../chatglm-6b`. - --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-1_8B,Qwen-VL-Chat,Qwen1_5-0_5B-Chat,Qwen1_5-1_8B-Chat,Qwen1_5-4B-Chat,Qwen1_5-7B-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,Llama-3-8B-Instruct,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh,lora} - type(`str`, *optional*): + --type TYPE type(`str`, *optional*): The pretrain llm model type. --lora_path LORA_PATH lora path, defaut is `None` mean not apply lora. - --onnx_path ONNX_PATH - export onnx model path, defaut is `./onnx`. - --mnn_path MNN_PATH export mnn model path, defaut is `./mnn`. - --export_mnn Whether or not to export mnn model after onnx. - --export_verbose Whether or not to export onnx with verbose. - --export_test Whether or not to export onnx with test using onnxruntime. + --dst_path DST_PATH export onnx/mnn model to path, default is `./model`.
--test TEST test model inference with query `TEST`. - --export export model to an `onnx` model. - --export_split export model split to some `onnx` models: - - embedding model. - - block models. - - lm_head model. - --export_token export llm tokenizer to a txt file. - --export_embed export llm embedding to an `onnx` model. - --export_visual export llm visual model to an `onnx` model. - --export_lm export llm lm_head to an `onnx` model. - --export_block EXPORT_BLOCK - export llm block [id] to an `onnx` model. - --export_blocks export llm all blocks to `onnx` models. - --embed_bin export embedding weight as bin file with dtype `bfloat16` - --embed_bf16 using `bfloat16` replace `float32` in embedding. + --export EXPORT export model to an onnx/mnn model. --skip_slim Whether or not to skip onnx-slim. + --quant_bit QUANT_BIT + mnn quant bit, 4 or 8, default is 4. + --quant_block QUANT_BLOCK + mnn quant block, default is 0 mean channel-wise. + --lm_quant_bit LM_QUANT_BIT + mnn lm_head quant bit, 4 or 8, default is `quant_bit`. + --mnnconvert MNNCONVERT + local mnnconvert path, if invalid, using pymnn. ``` + +## 支持模型 + +- llama/llama2/llama3/tinyllama +- qwen/qwen1.5/qwen2/qwen-vl +- baichuan2/phi-2/internlm/yi/deepseek +- chatglm/codegeex/chatglm2/chatglm3 +- phi-2/gemma-2 \ No newline at end of file diff --git a/transformers/llm/export/README_en.md b/transformers/llm/export/README_en.md deleted file mode 100644 index 9942c23f1..000000000 --- a/transformers/llm/export/README_en.md +++ /dev/null @@ -1,92 +0,0 @@ -# llm-export - -[中文](./README_en.md) - -llm-export is a tool for exporting llm models, capable of converting llm models into ONNX or MNN models. -- 🚀 All passed `onnxruntime` correctness tests -- 🚀 Optimized the original code to support dynamic shapes -- 🚀 Optimized the original code to reduce the constant portion -- 🚀 Using [OnnxSlim](https://github.com/WeLoveAI/OnnxSlim) slim onnx model,speed up 5%; by [@inisis](https://github.com/inisis) -- 🚀 Support export lora weight to onnx or MNN model - -## Model Support and Downloads - -## Usage -1. Clone this project locally -```sh -git clnoe git@github.com:wangzhaode/llm-export.git -``` -2. Clone the LLM project that you want to export locally, such as: chatglm2-6b -```sh -git clone https://huggingface.co/THUDM/chatglm2-6b -# If downloading from Hugging Face is slow, you can use ModelScope -git clone https://modelscope.cn/ZhipuAI/chatglm2-6b.git -``` -3. 
Execute LLMExporter to export the model -```sh -cd mnn-llm -# Divide chatglm2-6b into embedding, blocks, lm, export each as ONNX and convert to MNN, and also export tokenizer.txt -python llm_export.py \ - --path ../chatglm2-6b \ - --export_split \ - --export_token \ - --export_mnn \ - --onnx_path ./chatglm2-6b-onnx \ - --mnn_path ./chatglm2-6b-mnn -``` - -## Features -- Supports exporting the entire model as a single ONNX model, use --export -- Supports exporting the model in segments as multiple models, use --export_split -- Supports exporting the model's vocabulary to a text file, each line representing a token; tokens are encoded using base64, use --export_verbose -- Supports exporting the model's Embedding layer as an ONNX model, use --export_embed, also supports bf16 format, use --embed_bf16 -- Supports layered export of the model's blocks, use --export_blocks to export all layers; use --export_block $id to export a specified layer -- Supports exporting the model's lm_head layer as an ONNX model, use --export_lm -- Supports exporting the VL model's visual model as an ONNX model, use --export_visual -- Supports conducting a dialogue test on the model, using --test $query will return the llm's response -- Supports verifying the consistency of results using onnxruntime after exporting the ONNX model, use --export_test -- Supports exporting the tokenizer as a text file, use --export_token -- Supports converting the exported ONNX model to an MNN model, with default conversion to non-symmetric 4bit quantization, use --export_mnn -- Specify export paths using --onnx_path and --mnn_path -- Default using onnx-slim, skip using --skip_slim - -## Commad Args -``` -usage: llm_export.py [-h] --path PATH - [--type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh}] - [--onnx_path ONNX_PATH] [--mnn_path MNN_PATH] [--export_mnn] [--export_verbose] [--export_test] [--test TEST] [--export] [--export_split] [--export_token] [--export_embed] [--export_visual] [--export_lm] - [--export_block EXPORT_BLOCK] [--export_blocks] [--embed_bf16] [--skip_slim] - -llm_exporter - -optional arguments: - -h, --help show this help message and exit - --path PATH path(`str` or `os.PathLike`): - Can be either: - - A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO] - - A path to a *directory* clone from repo like `../chatglm-6b`. - --type {chatglm-6b,chatglm2-6b,chatglm3-6b,codegeex2-6b,Qwen-7B-Chat,Qwen-1_8B-Chat,Qwen-VL-Chat,Baichuan2-7B-Chat,Llama-2-7b-chat-ms,internlm-chat-7b,TinyLlama-1_1B-Chat,Yi-6B-Chat,deepseek-llm-7b-chat,phi-2,bge-large-zh} - type(`str`, *optional*): - The pretrain llm model type. - --onnx_path ONNX_PATH - export onnx model path, defaut is `./onnx`. - --mnn_path MNN_PATH export mnn model path, defaut is `./mnn`. - --export_mnn Whether or not to export mnn model after onnx. - --export_verbose Whether or not to export onnx with verbose. - --export_test Whether or not to export onnx with test using onnxruntime. - --test TEST test model inference with query `TEST`. - --export export model to an `onnx` model. - --export_split export model split to some `onnx` models: - - embedding model. - - block models. - - lm_head model. - --export_token export llm tokenizer to a txt file. - --export_embed export llm embedding to an `onnx` model. - --export_visual export llm visual model to an `onnx` model. 
- --export_lm export llm lm_head to an `onnx` model. - --export_block EXPORT_BLOCK - export llm block [id] to an `onnx` model. - --export_blocks export llm all blocks to `onnx` models. - --embed_bf16 using `bfloat16` replace `float32` in embedding. - --skip_slim Whether or not to skip onnx-slim. -``` diff --git a/transformers/llm/export/llm_export.py b/transformers/llm/export/llm_export.py deleted file mode 100644 index 4b541b247..000000000 --- a/transformers/llm/export/llm_export.py +++ /dev/null @@ -1,1430 +0,0 @@ -import os -import base64 -import glob -import json -import shutil -import argparse -import torch -import numpy as np -from onnxslim import slim -import onnxruntime as ort -import sentencepiece as spm -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer -from peft import LoraConfig, TaskType, get_peft_model, PeftModel -try: - import _tools as MNNTools -except: - MNNTools = None - -def onnx2mnn(onnx_path, mnn_dir, quant_bit = 4, asymmetric = True, external_data = False, bizCode : str= None): - model_name, model_extension = os.path.splitext(os.path.basename(onnx_path)) - if model_extension != '.onnx': - return - mnn_name = model_name + '.mnn' - mnn_path = os.path.join(mnn_dir, mnn_name) - convert_args = [ - '', - '-f', - 'ONNX', - '--modelFile', - str(onnx_path), - '--MNNModel', - str(mnn_path), - '--weightQuantBits', - str(quant_bit), - ] - if asymmetric: - convert_args.append("--weightQuantAsymmetric") - if external_data: - convert_args.append("--saveExternalData") - if bizCode is not None: - convert_args.append("--bizCode") - convert_args.append(str(bizCode)) - MNNTools.mnnconvert(convert_args) - -# some wrapper class for export -class Embedding(torch.nn.Module): - def __init__(self, embed, using_bf16: bool = False): - super().__init__() - self.bf16 = using_bf16 - self.embed_dim = embed.weight.shape[-1] - if using_bf16: - # using bf16 embedding weight - self.embed = embed.bfloat16() - else: - self.embed = embed - - def forward(self, input_ids): - res = self.embed(input_ids) - if self.bf16: - res = res.float() - return res.view(-1, 1, self.embed_dim) - -class Lm(torch.nn.Module): - def __init__(self, lm): - super().__init__() - self.lm = lm - - def forward(self, hidden_states): - m_logits = self.lm(hidden_states) - # token = torch.argmax(m_logits) - return m_logits - -class LLM(torch.nn.Module): - ''' - Base class for all llm model. Inherits from [`torch.nn.Module`]. 
- ''' - - def __init__(self, args): - super().__init__() - self.quant_bit = 4 - self.asymmetric = True - self.onnx_path = args.onnx_path - self.mnn_path = args.mnn_path - if not os.path.exists(self.onnx_path): - os.makedirs(self.onnx_path) - if not os.path.exists(self.mnn_path): - os.makedirs(self.mnn_path) - self.export_mnn = args.export_mnn - self.export_verbose = args.export_verbose - self.export_test = args.export_test - # default is False, just set True when using below command: - # `python llm_export ../path --export --embed_bin` to export single model without embedding - self.without_embed = False - self.embed_bin = True - self.embed_bf16 = args.embed_bf16 - self.skip_slim = args.skip_slim - tokenizer_model = os.path.join(args.path, 'tokenizer.model') - ice_text_model = os.path.join(args.path, 'ice_text.model') - try: - if os.path.exists(tokenizer_model): - self.sp_model = spm.SentencePieceProcessor(tokenizer_model) - elif os.path.exists(ice_text_model): - self.sp_model = spm.SentencePieceProcessor(ice_text_model) - else: - self.sp_model = None - except: - self.sp_model = None - merge_file = os.path.join(args.path, 'merges.txt') - if os.path.exists(merge_file): - self.merge_txt = merge_file - else: - self.merge_txt = None - self.stop_ids = [] - self.max_length = 1024 - self.hidden_size = 4096 - self.visual = None # defualt is not visual - self.lora_path = args.lora_path - self.load_hf(args.path) - self.load_model() - self.llm_config = { - 'hidden_size' : self.hidden_size, - 'layer_nums' : self.block_nums, - 'attention_mask': self.attention_mask_type, - 'key_value_shape': self.past_kv_shape[1:], - "prompt_template": self.build_prompt('%s'), - 'is_visual': False - } - - def load_hf(self, model_path: str): - self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - try: - self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval() - except: - self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() - self.config = self.model.config - if self.lora_path is not None: - adapter = PeftModel.from_pretrained(self.model, model_id=self.lora_path) - self.model = adapter.merge_and_unload(progressbar=True) - - def load_model(self): - raise NotImplementedError - - def get_attention_mask(self) -> torch.Tensor: - raise NotImplementedError - - def get_position_ids(self) -> torch.Tensor: - raise NotImplementedError - - def export_vocab(self): - raise NotImplementedError - - def visual_embed(self, input_ids): - raise NotImplementedError - - def __embedding(self, input_ids): - if self.visual is not None and self.token_len == 0: - input_embeds = self.visual_embed(input_ids) - else: - input_embeds = self.embed(input_ids) - return input_embeds - - def __decode(self, hidden_states, attention_mask, position_ids, past_key_values): - presents = [] - for i in range(self.block_nums): - hidden_states, kv = self.blocks[i](hidden_states, attention_mask, position_ids, past_key_values[i]) - presents.append(kv) - logits = self.lm(hidden_states).reshape(-1) - presents = torch.stack(presents) - self.seq_len += 1 - self.token_len += 1 - return logits, presents - - def forward(self, input_ids, attention_mask, position_ids, past_key_values): - if self.without_embed: - return self.__decode(input_ids, attention_mask, position_ids, past_key_values) - return self.__decode(self.__embedding(input_ids), attention_mask, position_ids, past_key_values) - - # some test functions - def build_prompt(self, query): - if 
hasattr(self.tokenizer, 'build_prompt'): - prompt = self.tokenizer.build_prompt(query) - else: - prompt = query - return prompt - - def str_to_ids(self, prompt): - input_ids = self.tokenizer(prompt, return_tensors="pt")['input_ids'] - return input_ids - - def id_to_str(self, token_id): - word = self.tokenizer._convert_id_to_token(int(token_id)) - word = self.tokenizer.convert_tokens_to_string([word]) - return word - - def response(self, query): - prompt = self.build_prompt(query) - input_ids = self.str_to_ids(prompt) - self.seq_len = input_ids.numel() - self.context_len = self.seq_len - 2 - self.token_len = 0 - past_key_values = [None for i in range(self.block_nums)] - token_id = input_ids - while self.token_len < self.max_length: - attention_mask = self.get_attention_mask() - position_ids = self.get_position_ids() - logits, past_key_values = self.forward(token_id, attention_mask, position_ids, past_key_values) - token_id = torch.argmax(logits) - if token_id in self.stop_ids: - print("", end='\n') - break - word = self.id_to_str(token_id) - print(word, end="", flush=True) - - # some export functions - def assert_equal(self, torch_outs, onnx_outs): - if type(torch_outs) not in (list, tuple): - torch_outs = (torch_outs, ) - onnx_outs = (onnx_outs, ) - same = True - for orig, onnx in zip(torch_outs, onnx_outs): - orig = orig.detach().numpy() - if not np.allclose(orig, onnx, rtol=1e-3, atol=1e-3): - print('Error: onnx outputs dont match original. [shape = {}] onnx: {}, original: {}'.format(onnx.shape, onnx, orig)) - same = False - break - if same: - print('onnx test SUCCESS') - - def export_lm(self): - model = self.lm - hidden_states = torch.randn(1, self.hidden_size) - onnx_model = f'./{self.onnx_path}/lm.onnx' - torch.onnx.export(model, (hidden_states), - onnx_model, - verbose=self.export_verbose, - input_names=['hidden_states'], - output_names=['logits'], - do_constant_folding=True, - opset_version=15) - if not self.skip_slim: - slim(onnx_model, output_model=onnx_model) - # test lm - if self.export_test: - original_outs = model(hidden_states) - ort_session = ort.InferenceSession(onnx_model, providers=['CPUExecutionProvider']) - inputs = { - 'hidden_states' : hidden_states.numpy(), - } - onnx_outs = ort_session.run(None, inputs) - self.assert_equal(original_outs, onnx_outs) - if self.export_mnn: - onnx2mnn(onnx_model, self.mnn_path, self.quant_bit, self.asymmetric) - - def export_visual(self): - if self.visual is None: - return - input_images = torch.randn((1, 3, self.image_size, self.image_size)) - model = self.visual - onnx_model = f'./{self.onnx_path}/visual.onnx' - torch.onnx.export(model, (input_images), - onnx_model, - verbose=self.export_verbose, - input_names=['input_images'], - output_names=['image_embeds'], - dynamic_axes={"input_images": { - 0: "size" - }}, - do_constant_folding=True, - opset_version=15) - if not self.skip_slim: - slim(onnx_model, output_model=onnx_model) - # test - if self.export_test: - original_outs = model(input_images) - ort_session = ort.InferenceSession(onnx_model, providers=['CPUExecutionProvider']) - inputs = { - 'input_images' : input_images.numpy(), - } - onnx_outs = ort_session.run(None, inputs)[0] - self.assert_equal(original_outs, onnx_outs) - if self.export_mnn: - onnx2mnn(onnx_model, self.mnn_path) - - def export_embed(self): - model = self.embed - if self.embed_bin: - import ctypes - tensor_data = model.embed.weight.data - data_ptr = tensor_data.untyped_storage().data_ptr() - buffer = (ctypes.c_byte * (tensor_data.numel() * 
2)).from_address(data_ptr) - with open(f'./{self.onnx_path}/embeddings_bf16.bin', 'wb') as f: - f.write(buffer) - return - input_ids = torch.arange(3, dtype=torch.long) - onnx_model = f'./{self.onnx_path}/embedding.onnx' - torch.onnx.export(model, (input_ids), - onnx_model, - verbose=self.export_verbose, - input_names=['input_ids'], - output_names=['inputs_embeds'], - dynamic_axes={"input_ids": { - 0: "length" - }}, - do_constant_folding=True, - opset_version=15) - if not self.skip_slim: - slim(onnx_model, output_model=onnx_model) - # test - if self.export_test: - original_outs = model(input_ids) - ort_session = ort.InferenceSession(onnx_model, providers=['CPUExecutionProvider']) - inputs = { - 'input_ids' : input_ids.numpy(), - } - onnx_outs = ort_session.run(None, inputs) - self.assert_equal(original_outs, onnx_outs) - if self.export_mnn: - onnx2mnn(onnx_model, self.mnn_path) - - def export_block(self, block_id: int): - self.seq_len = 3 - self.token_len = 0 - inputs_embeds = torch.randn((self.seq_len, 1, self.hidden_size)) - attention_mask = self.get_attention_mask() - position_ids = self.get_position_ids() - past_key_values = torch.zeros(self.past_kv_shape[1:]) - model = self.blocks[block_id] - onnx_model = f'./{self.onnx_path}/block_{block_id}.onnx' - torch.onnx.export( - model, (inputs_embeds, attention_mask, position_ids, past_key_values), - onnx_model, - verbose=self.export_verbose, - input_names=[ - 'inputs_embeds', 'attention_mask', 'position_ids', 'past_key_values' - ], - output_names=['hidden_states', 'presents'], - dynamic_axes=self.block_dynamic_axes, - do_constant_folding=True, - opset_version=15) - if not self.skip_slim: - slim(onnx_model, output_model=onnx_model) - if self.export_test: - original_outs = model(inputs_embeds, attention_mask, position_ids, past_key_values) - ort_session = ort.InferenceSession(onnx_model, providers=['CPUExecutionProvider']) - inputs = { - 'inputs_embeds' : inputs_embeds.detach().numpy(), - 'attention_mask' : attention_mask.numpy(), - 'position_ids' : position_ids.numpy(), - 'past_key_values' : past_key_values.numpy() - } - onnx_outs = ort_session.run(None, inputs) - self.assert_equal(original_outs, onnx_outs) - if self.export_mnn: - onnx2mnn(onnx_model, self.mnn_path, self.quant_bit, self.asymmetric) - - def export_blocks(self): - for i in range(self.block_nums): - self.export_block(i) - - def export_config(self, is_single = True): - self.llm_config['is_single'] = is_single - with open(f'./{self.onnx_path}/llm_config.json', 'w', encoding='utf-8') as f: - json.dump(self.llm_config, f, ensure_ascii=False, indent=4) - - def export(self): - model = self - self.seq_len = 3 - self.token_len = 0 - input_ids = torch.arange(3, dtype=torch.long) - attention_mask = self.get_attention_mask() - position_ids = self.get_position_ids() - past_key_values = torch.zeros(self.past_kv_shape) - onnx_model = f'./{self.onnx_path}/llm.onnx' - if self.embed_bin: - self.without_embed = True - input_ids = self.__embedding(input_ids) - print('export start ...') - torch.onnx.export( - model, (input_ids, attention_mask, position_ids, past_key_values), - onnx_model, - verbose=self.export_verbose, - input_names=[ - 'input_ids', 'attention_mask', 'position_ids', 'past_key_values' - ], - output_names=['logits', 'presents'], - dynamic_axes=self.model_dynamic_axes, - do_constant_folding=True, - opset_version=15) - print('export done!') - if not self.skip_slim: - slim(onnx_model, output_model=onnx_model) - for file_path in glob.glob(f'./{self.onnx_path}/onnx__*'): - try: - 
os.remove(file_path) - except FileNotFoundError: - pass - for file_path in glob.glob(f'./{self.onnx_path}/model.*'): - try: - os.remove(file_path) - except FileNotFoundError: - pass - if self.export_test: - # test - original_outs = model(input_ids, attention_mask, position_ids, past_key_values) - ort_session = ort.InferenceSession(onnx_model, providers=['CPUExecutionProvider']) - inputs = { - 'input_ids' : input_ids.detach().numpy(), - 'attention_mask' : attention_mask.numpy(), - 'position_ids' : position_ids.numpy(), - 'past_key_values' : past_key_values.numpy() - } - onnx_outs = ort_session.run(None, inputs) - self.assert_equal(original_outs, onnx_outs) - if self.export_mnn: - # single model is > 2G, using external_data - onnx2mnn(onnx_model, self.mnn_path, self.quant_bit, self.asymmetric, True) - if self.without_embed: - self.without_embed = False - - def export_tokenizer(self): - # TOKENIZER MAGIC NUMBER - MAGIC_NUMBER = 430 - # TOKENIZER TYPE - SENTENCEPIECE = 0; TIKTOIKEN = 1; BERT = 2; HUGGINGFACE = 3 - def write_line(fp, *args): - for arg in args: - for token in arg: - fp.write(str(token) + ' ') - fp.write('\n') - def write_header(fp, type, speicals, prefix = []): - fp.write(f'{MAGIC_NUMBER} {type}\n') - fp.write(f'{len(speicals)} {len(self.stop_ids)} {len(prefix)}\n') - write_line(fp, speicals, self.stop_ids, prefix) - - file_path = os.path.join(self.onnx_path, "tokenizer.txt") - special_list = list(self.tokenizer.added_tokens_decoder.keys()) - if hasattr(self.tokenizer, 'special_tokens'): - for k, v in self.tokenizer.special_tokens.items(): - special_list.append(v) - if hasattr(self.tokenizer, 'gmask_token_id'): - special_list.append(self.tokenizer.gmask_token_id) - vocab_list = [] - prefix_list = [] - if hasattr(self.tokenizer, 'get_prefix_tokens'): - prefix_list = self.tokenizer.get_prefix_tokens() - if self.sp_model is not None: - # senetencepiece - print('# senetencepiece tokenier') - NORMAL = 1; UNKNOWN = 2; CONTROL = 3 - USER_DEFINED = 4; UNUSED = 5; BYTE = 6 - for i in range(self.sp_model.GetPieceSize()): - token = self.sp_model.IdToPiece(i) - score = self.sp_model.GetScore(i) - type = NORMAL - if self.sp_model.IsUnknown(i): - type = UNKNOWN - elif self.sp_model.IsControl(i): - type = CONTROL - elif self.sp_model.IsUnused(i): - type = UNUSED - elif self.sp_model.IsByte(i): - type = BYTE - if self.model_name == 'Chatglm_6b': - if '' in token: token = '\n' - if '<|tab|>' in token: token = '\t' - if '<|blank_' in token: token = ' ' * int(token[8:token.find('|>')]) - if '▁' in token: token = token.replace('▁', ' ') - token_encode = base64.b64encode(token.encode("utf-8")).decode("utf8") - vocab_list.append(f'{token_encode} {score} {type}\n') - with open(file_path, "w", encoding="utf8") as fp: - write_header(fp, SENTENCEPIECE, special_list, prefix_list) - fp.write(f'{len(vocab_list)}\n') - for vocab in vocab_list: - fp.write(vocab) - elif hasattr(self.tokenizer, 'mergeable_ranks'): - print('# tiktoken tokenier') - # tikton - vocab_list = [] - for k, v in self.tokenizer.mergeable_ranks.items(): - line = base64.b64encode(k).decode("utf8") + "\n" - vocab_list.append(line) - if hasattr(self.tokenizer, 'special_tokens'): - for k, v in self.tokenizer.special_tokens.items(): - line = base64.b64encode(k.encode("utf-8")).decode("utf8") + "\n" - vocab_list.append(line) - if hasattr(self.tokenizer, 'added_tokens_decoder'): - for k, v in self.tokenizer.added_tokens_decoder.items(): - line = base64.b64encode(v.__str__().encode("utf-8")).decode("utf8") + "\n" - vocab_list.append(line) - with 
open(file_path, "w", encoding="utf8") as fp: - write_header(fp, TIKTOIKEN, special_list, prefix_list) - fp.write(f'{len(vocab_list)}\n') - for vocab in vocab_list: - fp.write(vocab) - elif self.merge_txt is not None: - # huggingface tokenizer - merge_list = [] - vocab = self.tokenizer.get_vocab() - special_list = list(self.tokenizer.added_tokens_decoder.keys()) - vocab_list = ['' for i in range(len(vocab))] - # load vocab - for k, v in vocab.items(): - vocab_list[int(v)] = k - # load merge - with open(self.merge_txt, 'rt') as merge: - for line in merge.readlines(): - merge_list.append(line) - # write to tokenizer.txt - with open(file_path, "w", encoding="utf8") as fp: - write_header(fp, HUGGINGFACE, special_list) - fp.write(f'{len(vocab_list)} {len(merge_list)}\n') - for v in vocab_list: - fp.write(v + '\n') - for m in merge_list: - fp.write(m) - else: - print('# other tiktoken tokenier') - # other tikton - def unicode_to_byte(u: int): - if u >= 256 and u <= 288: - return u - 256 - if u >= 289 and u <= 322: - return u - 162 - if u == 323: - return 173 - if u == 65372: # | - return 124 - if u == 9601: # _ - return 95 - return u - vocab = self.tokenizer.get_vocab() - vocab_list = ['' for i in range(len(vocab))] - for k, v in vocab.items(): - try: - vocab_list[int(v)] = bytes([unicode_to_byte(ord(c)) for c in k]).decode('utf-8', errors='ignore') - except: - vocab_list[int(v)] = k - special_list = list(self.tokenizer.added_tokens_decoder.keys()) - with open(file_path, "w", encoding="utf8") as fp: - write_header(fp, TIKTOIKEN, special_list) - fp.write(f'{len(vocab_list)}\n') - for v in vocab_list: - line = base64.b64encode(v.encode('utf-8')).decode("utf8") + "\n" - fp.write(line) - -# chatglm -class GLMBlock(torch.nn.Module): - def __init__(self, block, block_id, final_layernorm = None): - super().__init__() - self.block = block - self.block_id = block_id - self.hidden_size = 4096 - self.final_layernorm = final_layernorm - - def forward(self, hidden_states, attention_mask, position_ids, past_kv): - hidden_states, presents = self.block(hidden_states, - position_ids, - attention_mask, - self.block_id, - past_kv, - use_cache=True) - if self.final_layernorm is not None: - hidden_states = self.final_layernorm(hidden_states) - hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) - if isinstance(presents, tuple): - presents = torch.stack(presents) - return hidden_states, presents - -class Chatglm_6b(LLM): - def __init__(self, args): - self.attention_mask_type = 'glm' - self.model_name = 'Chatglm_6b' - super().__init__(args) - - def load_model(self): - transformer = self.model.transformer - self.lm_ = self.model.lm_head - self.embed_ = transformer.word_embeddings - self.blocks_ = transformer.layers - self.final_layernorm_ = transformer.final_layernorm - # some wrapper - self.stop_ids.append(self.tokenizer._convert_token_to_id(self.tokenizer.eos_token)) - self.block_nums = len(self.blocks_) - self.lm = Lm(self.lm_) - # chatglm embedding and lm using same param, copy embedding when using bf16 - if self.embed_bf16: - import copy - embed_copy = copy.deepcopy(self.embed_) - self.embed = Embedding(embed_copy, self.embed_bf16) - else: - self.embed = Embedding(self.embed_, self.embed_bf16) - self.blocks = [GLMBlock(self.blocks_[i], i, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] - # some config for export - self.past_kv_shape = [28, 2, 0, 1, 32, 128] - self.block_dynamic_axes = { - "inputs_embeds" : { 0: "seq_len" }, - 
"attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 2: "seq_len" }, - "past_key_values" : { 1: "history_len" } - } - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 2: "seq_len" }, - "past_key_values" : { 2: "history_len" } - } - - def get_attention_mask(self) -> torch.Tensor: - if self.token_len: - return torch.zeros([1]).bool().reshape([1, 1, 1, 1]) - attention_mask = torch.zeros([self.seq_len, self.seq_len], dtype=torch.bool) - for i in range(self.seq_len - 1): - attention_mask[i][-1] = True - attention_mask = attention_mask.reshape([1, 1, self.seq_len, self.seq_len]) - return attention_mask - - def get_position_ids(self) -> torch.Tensor: - if self.token_len: - return torch.tensor([self.context_len, self.token_len + 1]).reshape([1, 2, 1]) - position_ids_0 = torch.arange(self.seq_len, dtype=torch.long) - position_ids_1 = torch.zeros(self.seq_len, dtype=torch.long) - position_ids_0[-1] = position_ids_0[-2] - position_ids_1[-1] = 1 - position_ids = torch.stack([position_ids_0, position_ids_1]).view(1, 2, -1) - return position_ids - - def build_prompt(self, query): - return f'{query}[gMASK]' - -# chatglm2 -class GLM2Block(torch.nn.Module): - def __init__(self, block, block_id, config, final_layernorm = None): - super().__init__() - self.block = block - self.block_id = block_id - self.final_layernorm = final_layernorm - self.config = config - self.hidden_size = 4096 - - def forward(self, hidden_states, attention_mask, position_ids, past_kv): - rope_ratio = self.config.rope_ratio - base = 10000 * rope_ratio - theta = 1.0 / (base ** (torch.arange(0, 64, 2, dtype=torch.float32) / 64)) - position_ids = position_ids.float().reshape(-1, 1) - idx_theta = position_ids * theta - rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1).unsqueeze(0).contiguous() - hidden_states, presents = self.block(hidden_states, - attention_mask, - kv_cache=past_kv, - rotary_pos_emb=rotary_pos_emb) - if self.final_layernorm is not None: - hidden_states = self.final_layernorm(hidden_states) - hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) - if isinstance(presents, tuple): - presents = torch.stack(presents) - return hidden_states, presents - -class Chatglm2_6b(LLM): - def __init__(self, args): - self.attention_mask_type = 'glm2' - super().__init__(args) - self.model_name = 'Chatglm2_6b' - if 'codegeex2-6b' in args.path: - self.model_name = 'Codegeex2_6b' - - def load_model(self): - transformer = self.model.transformer - self.lm_ = transformer.output_layer - self.embed_ = transformer.embedding.word_embeddings - self.blocks_ = transformer.encoder.layers - self.final_layernorm_ = transformer.encoder.final_layernorm - # some wrapper - if self.tokenizer.eos_token_id is None: - # codegeex2-6b - self.stop_ids.append(self.tokenizer.tokenizer.eos_id) - else: - self.stop_ids.append(self.tokenizer.eos_token_id) - if hasattr(self.config, 'eos_token_id'): - if type(self.config.eos_token_id) is list: - for eos_id in self.config.eos_token_id: - self.stop_ids.append(eos_id) - elif type(self.config.eos_token_id) is int: - self.stop_ids.append(self.config.eos_token_id) - self.block_nums = len(self.blocks_) - self.embed = Embedding(self.embed_, self.embed_bf16) - self.lm = Lm(self.lm_) - self.blocks = [GLM2Block(self.blocks_[i], i, self.config, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] - # some config for 
export - self.past_kv_shape = [28, 2, 0, 1, 2, 128] - self.block_dynamic_axes = { - "inputs_embeds" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 1: "history_len" } - } - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 2: "history_len" } - } - num_layers = self.config.num_layers - if num_layers > 28: - self.past_kv_shape = [num_layers, 2, 1, 2, 0, 128] - self.block_dynamic_axes = { - "inputs_embeds" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 3: "history_len" } - } - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 4: "history_len" } - } - - def get_attention_mask(self) -> torch.Tensor: - if self.token_len: - return torch.zeros([1, 1, 1, 1]).bool() - attention_mask = ~torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]).bool()) - return attention_mask - - def get_position_ids(self) -> torch.Tensor: - if self.token_len: - return torch.tensor([self.token_len], dtype=torch.long) - return torch.arange(self.seq_len, dtype=torch.long) - -# chatglm3 -class Chatglm3_6b(Chatglm2_6b): - def __init__(self, args): - super().__init__(args) - self.model_name = 'Chatglm3_6b' - - def build_prompt(self, query): - return f'<|user|>\n{query}\n<|assistant|>\n' - -# qwen -class QWENBlock(torch.nn.Module): - def __init__(self, name, block, block_id, hidden_size, final_layernorm = None): - super().__init__() - self.name = name - self.block = block - self.block_id = block_id - self.final_layernorm = final_layernorm - self.hidden_size = hidden_size - - def forward(self, hidden_states, attention_mask, position_ids, past_kv): - theta = 1.0 / (10000.0 ** (torch.arange(0, 128, 2, dtype=torch.float32) / 128)) - position_ids = position_ids.float().reshape(-1, 1) - idx_theta = position_ids * theta - rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1) - rotary_pos_emb = rotary_pos_emb.unsqueeze(1).unsqueeze(0) - if self.name != 'Qwen-7B': - rotary_pos_emb = torch.stack([torch.cos(rotary_pos_emb), torch.sin(rotary_pos_emb)]) - hidden_states = hidden_states.view(1, -1, self.hidden_size) - hidden_states, presents = self.block(hidden_states=hidden_states, - layer_past=past_kv, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - use_cache=True) - if self.final_layernorm is not None: - hidden_states = self.final_layernorm(hidden_states) - hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) - if isinstance(presents, tuple): - presents = torch.stack(presents) - return hidden_states, presents - -class QWEN18Block(torch.nn.Module): - def __init__(self, block, block_id, hidden_size, final_layernorm = None): - super().__init__() - self.block = block - self.block_id = block_id - self.final_layernorm = final_layernorm - self.hidden_size = hidden_size - - def forward(self, hidden_states, attention_mask, position_ids, past_kv): - theta = 1.0 / (10000.0 ** (torch.arange(0, 128, 2, dtype=torch.float32) / 128)) - position_ids = position_ids.float().reshape(-1, 1) - idx_theta = position_ids * theta - rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1).unsqueeze(1).unsqueeze(0) - rotary_pos_emb = torch.stack([torch.cos(rotary_pos_emb), 
torch.sin(rotary_pos_emb)]) - hidden_states = hidden_states.view(1, -1, self.hidden_size) - hidden_states, presents = self.block(hidden_states, - rotary_pos_emb, - past_kv, - attention_mask, - use_cache=True) - if self.final_layernorm is not None: - hidden_states = self.final_layernorm(hidden_states) - hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) - if isinstance(presents, tuple): - presents = torch.stack(presents) - return hidden_states, presents - -class Qwen_Chat(LLM): - def __init__(self, args): - self.attention_mask_type = 'int' - super().__init__(args) - if 'VL' in self.model_name: - self.llm_config['is_visual'] = True - self.llm_config['attention_mask'] = 'float' - self.llm_config['img_size'] = 448 - self.llm_config['imgpad_len'] = 256 - self.llm_config['img_start'] = self.tokenizer.img_start_id - self.llm_config['img_end'] = self.tokenizer.img_end_id - self.llm_config['img_pad'] = self.tokenizer.img_pad_id - - - def load_model(self): - # Qwen models - self.model_name = 'Qwen-7B' - if '1_8' in model_path: - self.model_name = 'Qwen-1_8b' - if 'VL' in model_path: - self.model_name = 'Qwen-VL' - transformer = self.model.transformer - self.lm_ = self.model.lm_head - self.embed_ = transformer.wte - self.blocks_ = transformer.h - self.final_layernorm_ = transformer.ln_f - if hasattr(transformer, 'visual'): - self.visual = transformer.visual - self.image_start_id = transformer.config.visual['image_start_id'] - self.image_size = transformer.config.visual['image_size'] - # some wrapper - self.stop_ids.append(self.tokenizer.im_end_id) - self.block_nums = len(self.blocks_) - self.hidden_size = transformer.embed_dim - self.embed = Embedding(self.embed_, self.embed_bf16) - self.lm = Lm(self.lm_) - self.blocks = [QWENBlock(self.model_name, self.blocks_[i], i, self.hidden_size, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] - if self.block_nums == 32: - # qwen-7b, qwen-vl - self.past_kv_shape = [32, 2, 1, 0, 32, 128] - elif self.block_nums == 24: - # qwen-1.8b - self.past_kv_shape = [24, 2, 1, 0, 16, 128] - # some config for export - self.block_dynamic_axes = { - "inputs_embeds" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 2: "history_len" } - } - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 3: "history_len" } - } - - def build_prompt(self, query): - return f'\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n' - - def get_attention_mask(self) -> torch.Tensor: - if self.model_name == 'Qwen-VL': - if self.token_len: - return torch.zeros([1, 1, 1, 1], dtype=torch.float32) - return (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min - if self.token_len: - return torch.ones([1, 1, 1, 1]).bool() - return torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]).bool()) - - def get_position_ids(self) -> torch.Tensor: - if self.token_len: - return torch.tensor([self.seq_len - 1], dtype=torch.long) - return torch.arange(self.seq_len, dtype=torch.long) - - def visual_embed(self, input_ids): - if not torch.any(input_ids == self.image_start_id): - return self.embed(input_ids) - bos_pos = torch.where(input_ids == self.image_start_id) - eos_pos = torch.where(input_ids == self.image_start_id + 1) - img_pos = torch.stack((bos_pos[0], bos_pos[1], 
eos_pos[1]), dim=1) - images = [] - for i, a, b in img_pos: - image = input_ids[i][a + 1 : b - 1].tolist() - image = image[ : image.index(self.image_start_id + 2)] - images.append(bytes(image).decode('utf-8')) - images = self.visual.encode(images) - hidden_states = self.embed(input_ids).view(1, -1, self.hidden_size) - for idx, (i, a, b) in enumerate(img_pos): - hidden_states[i][a + 1 : b] = images[idx] - return hidden_states.view(-1, 1, self.hidden_size) - -class QWEN2Block(torch.nn.Module): - def __init__(self, name, block, block_id, config, final_layernorm = None): - super().__init__() - self.name = name - self.block = block - self.block_id = block_id - self.final_layernorm = final_layernorm - self.hidden_size = config.hidden_size - self.head_dim = config.hidden_size // config.num_attention_heads - self.rope_theta = config.rope_theta - - def forward(self, hidden_states, attention_mask, position_ids, past_kv): - theta = 1.0 / (self.rope_theta ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim)) - position_ids = position_ids.float().reshape(-1, 1) - idx_theta = position_ids * theta - rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1) - rotary_pos_emb = rotary_pos_emb.unsqueeze(1).unsqueeze(0) - rotary_pos_emb = torch.stack([torch.cos(rotary_pos_emb), torch.sin(rotary_pos_emb)]) - hidden_states = hidden_states.view(1, -1, self.hidden_size) - hidden_states, presents = self.block(hidden_states=hidden_states, - attention_mask=attention_mask, - past_key_value=past_kv, - rotary_pos_emb=rotary_pos_emb, - use_cache=True) - - if self.final_layernorm is not None: - hidden_states = self.final_layernorm(hidden_states) - hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) - if isinstance(presents, tuple): - presents = torch.stack(presents) - # print('###', presents.shape) - return hidden_states, presents - -class Qwen2_Chat(LLM): - def __init__(self, args): - self.attention_mask_type = 'float' - super().__init__(args) - - def load_model(self): - # Qwen2 models - self.model_name = 'Qwen2' - transformer = self.model.model - self.lm_ = self.model.lm_head - self.embed_ = transformer.embed_tokens - self.blocks_ = transformer.layers - self.final_layernorm_ = transformer.norm - # some wrapper - self.stop_ids.append(self.tokenizer.eos_token_id) - if hasattr(self.model, 'generation_config'): - for id in self.model.generation_config.eos_token_id: - self.stop_ids.append(id) - self.block_nums = self.config.num_hidden_layers - self.hidden_size = self.config.hidden_size - self.num_heads = self.config.num_attention_heads - self.kv_heads = self.config.num_key_value_heads - self.rope_theta = self.config.rope_theta - self.head_dim = self.hidden_size // self.num_heads - if self.embed_.weight is self.lm_.weight: - import copy - embed_copy = copy.deepcopy(self.embed_) - self.embed = Embedding(embed_copy, self.embed_bf16) - else: - self.embed = Embedding(self.embed_, self.embed_bf16) - self.lm = Lm(self.lm_) - self.past_kv_shape = [self.block_nums, 2, 1, 0, self.kv_heads, self.head_dim] - self.blocks = [QWEN2Block(self.model_name, self.blocks_[i], i, self.config, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] - # some config for export - self.block_dynamic_axes = { - "inputs_embeds" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 1: "history_len" } - } - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - 
"attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 2: "history_len" } - } - - def build_prompt(self, query): - return f'<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n' - - def get_attention_mask(self) -> torch.Tensor: - if self.token_len: - return torch.zeros([1, 1, 1, self.seq_len], dtype=torch.float32) - return (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min - - - def get_position_ids(self) -> torch.Tensor: - if self.token_len: - return torch.tensor([[self.seq_len - 1]], dtype=torch.long) - return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) - - def visual_embed(self, input_ids): - if not torch.any(input_ids == self.image_start_id): - return self.embed(input_ids) - bos_pos = torch.where(input_ids == self.image_start_id) - eos_pos = torch.where(input_ids == self.image_start_id + 1) - img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1) - images = [] - for i, a, b in img_pos: - image = input_ids[i][a + 1 : b - 1].tolist() - image = image[ : image.index(self.image_start_id + 2)] - images.append(bytes(image).decode('utf-8')) - images = self.visual.encode(images) - hidden_states = self.embed(input_ids).view(1, -1, self.hidden_size) - for idx, (i, a, b) in enumerate(img_pos): - hidden_states[i][a + 1 : b] = images[idx] - return hidden_states.view(-1, 1, self.hidden_size) - -# llama2 -class LLAMA2Block(torch.nn.Module): - def __init__(self, block, block_id, hidden_size, head_dim, final_layernorm = None): - super().__init__() - self.block = block - self.block_id = block_id - self.head_dim = head_dim - self.final_layernorm = final_layernorm - self.hidden_size = hidden_size - - def forward(self, hidden_states, attention_mask, position_ids, past_kv): - theta = 1.0 / (10000.0 ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim)) - position_ids = position_ids.float().reshape(-1, 1) - idx_theta = position_ids * theta - rotary_pos_emb = torch.cat((idx_theta, idx_theta), dim=-1) - rotary_pos_emb = rotary_pos_emb.unsqueeze(1).unsqueeze(0) - rotary_pos_emb = torch.stack([torch.cos(rotary_pos_emb), torch.sin(rotary_pos_emb)]) - hidden_states = hidden_states.view(1, -1, self.hidden_size) - position_ids = position_ids.view(1, -1) - hidden_states, presents = self.block(hidden_states, - attention_mask, - position_ids, - past_kv, - rotary_pos_emb=rotary_pos_emb, - use_cache=True) - if self.final_layernorm is not None: - hidden_states = self.final_layernorm(hidden_states) - hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) - if isinstance(presents, tuple): - presents = torch.stack(presents) - return hidden_states, presents - -class Llama2_7b_Chat(LLM): - def __init__(self, args): - self.attention_mask_type = 'float' - self.model_name = 'Llama2_7b' - if 'Baichuan2' in args.path: - self.model_name = 'Baichuan2_7B' - if 'internlm' in args.path: - self.model_name = 'Internlm_7b' - if 'TinyLlama' in args.path: - self.model_name = 'TinyLlama' - if 'Yi' in args.path: - self.model_name = 'Yi' - if 'deepseek' in args.path: - self.model_name = 'deepseek' - if 'Llama-3' in args.path: - self.model_name = 'Llama3_8B' - super().__init__(args) - - def load_model(self): - self.config = self.model.config - transformer = self.model.model - self.lm_ = self.model.lm_head - self.embed_ = transformer.embed_tokens - self.blocks_ = transformer.layers - self.final_layernorm_ = transformer.norm - # some wrapper - 
self.hidden_size = self.embed_.weight.shape[-1] - self.stop_ids.append(self.tokenizer.eos_token_id) - if hasattr(self.model, 'generation_config'): - self.stop_ids.append(self.model.generation_config.eos_token_id) - if self.model_name == 'Llama3_8B': - self.stop_ids.append(self.tokenizer.convert_tokens_to_ids("<|eot_id|>")) - self.block_nums = len(self.blocks_) - self.embed = Embedding(self.embed_, self.embed_bf16) - self.lm = Lm(self.lm_) - self.block_nums = self.config.num_hidden_layers - self.hidden_size = self.config.hidden_size - self.num_attention_heads = self.config.num_attention_heads - self.head_dim = self.hidden_size // self.num_attention_heads - if hasattr(self.config, 'num_key_value_heads'): - self.num_key_value_heads = self.config.num_key_value_heads - else: - self.num_key_value_heads = self.config.num_attention_heads - self.blocks = [LLAMA2Block(self.blocks_[i], i, self.hidden_size, self.head_dim, self.final_layernorm_ if i == len(self.blocks_) - 1 else None) for i in range(self.block_nums)] - self.past_kv_shape = [self.block_nums, 2, 1, 0, self.num_key_value_heads, self.head_dim] - self.block_dynamic_axes = { - "inputs_embeds" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 1: "seq_len" }, - "past_key_values" : { 2: "history_len" } - } - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 1: "seq_len" }, - "past_key_values" : { 3: "history_len" } - } - - def build_prompt(self, query): - if 'Baichuan2' in self.model_name: - return f'{query}' - if 'Internlm_7b' in self.model_name: - return f'<|User|>:{query}\n<|Bot|>:' - if 'TinyLlama' in self.model_name: - return f'<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate\n<|user|>\n{query}\n<|assistant|>\n' - if 'Yi' in self.model_name: - return f'<|im_start|> user\n{query}<|im_end|>\n<|im_start|> assistant\n' - if 'deepseek' in self.model_name: - return f'<|begin_of_sentence|>User: {query}\n\nAssistant:' - if 'Llama3' in self.model_name: - return f'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' - return f'[INST]{query}[/INST]' - - def get_attention_mask(self) -> torch.Tensor: - if self.token_len: - return torch.zeros([1, 1, 1, self.seq_len], dtype=torch.float32) - return (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min - - def get_position_ids(self) -> torch.Tensor: - if self.token_len: - return torch.tensor([[self.seq_len - 1]], dtype=torch.long) - return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) - -# phi-2 -class PHI2Block(torch.nn.Module): - def __init__(self, block, block_id, hidden_size): - super().__init__() - self.block = block - self.block_id = block_id - self.hidden_size = hidden_size - - def forward(self, hidden_states, attention_mask, position_ids, past_kv): - theta = 1.0 / (10000 ** (torch.arange(0, 32, 2, dtype=torch.float32) / 32)) - position_ids = position_ids.float().reshape(-1, 1) - idx_theta = position_ids * theta - rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=0).contiguous() - hidden_states = hidden_states.view(1, -1, self.hidden_size) - hidden_states, presents = self.block(hidden_states, - past_kv, - rotary_pos_emb=rotary_pos_emb, - causal_mask=attention_mask - ) - if self.block_id == 31: - hidden_states = hidden_states[:, -1, :] - return hidden_states, presents - -class 
phi_2(LLM): - def __init__(self, args): - self.attention_mask_type = 'glm' - super().__init__(args) - self.model_name = 'phi-2' - self.asymmetric = False # TODO: some precision bug when using asymmetric - - def load_model(self): - transformer = self.model.transformer - self.lm_ = self.model.lm_head - self.embed_ = transformer.embd.wte - self.hidden_size = self.embed_.weight.shape[-1] - self.blocks_ = transformer.h - # self.final_layernorm_ = transformer.final_layernorm - # some wrapper - self.stop_ids.append(self.tokenizer.eos_token_id) - self.block_nums = len(self.blocks_) - self.embed = Embedding(self.embed_, self.embed_bf16) - self.lm = Lm(self.lm_) - self.blocks = [PHI2Block(self.blocks_[i], i, self.hidden_size) for i in range(self.block_nums)] - # some config for export - self.past_kv_shape = [len(self.blocks), 1, 0, 2, 32, 80] - self.block_dynamic_axes = { - "inputs_embeds" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 1: "history_len" } - } - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 0: "seq_len" }, - "past_key_values" : { 2: "history_len" } - } - - def build_prompt(self, query): - return f'Instruct: {query}\nOutput:' - - def get_attention_mask(self) -> torch.Tensor: - if self.token_len: - return torch.zeros([1, 1, 1, 1]).bool() - attention_mask = ~torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]).bool()) - return attention_mask - - def get_position_ids(self) -> torch.Tensor: - if self.token_len: - return torch.tensor([[self.seq_len - 1]], dtype=torch.long) - return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) - -# BGE is Embedding Model based Bert -class BGEBlock(torch.nn.Module): - def __init__(self, block, block_id, hidden_size): - super().__init__() - self.block = block - self.block_id = block_id - self.hidden_size = hidden_size - - def forward(self, hidden_states, attention_mask): - hidden_states = self.block(hidden_states, attention_mask)[0] - return hidden_states - -class bge(LLM): - def __init__(self, args): - self.attention_mask_type = 'int' - self.past_kv_shape = [] - super().__init__(args) - self.model_name = 'bge-large-zh' - - def forward(self, input_ids, position_ids, attention_mask): - input_ids = input_ids.view(1, -1) - token_type_ids = (1 - attention_mask).view(1, -1) - hidden_states = self.embed(input_ids, token_type_ids, position_ids)[0].unsqueeze(0) - for i in range(self.block_nums): - hidden_states = self.blocks[i](hidden_states, attention_mask) - # hidden_states = self.lm(hidden_states) # sentence_embeddings not need - sentence_embeddings = hidden_states[:, 0] - sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) - return sentence_embeddings - - def response(self, query): - self.eval() - input_ids = self.tokenizer(query)['input_ids'] - self.seq_len = len(input_ids) - input_ids = torch.tensor(input_ids) - position_ids = self.get_position_ids() - attention_mask = self.get_attention_mask() - res = self.forward(input_ids, position_ids, attention_mask) - return res - - def load_model(self): - self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() - transformer = self.model.encoder - self.lm_ = self.model.pooler - self.embed_ = self.model.embeddings - self.hidden_size = self.embed_.word_embeddings.weight.shape[-1] - self.blocks_ = transformer.layer - # some wrapper - self.stop_ids = [] - 
self.block_nums = len(self.blocks_) - self.embed = self.embed_ - self.lm = self.lm_ - self.blocks = [BGEBlock(self.blocks_[i], i, self.hidden_size) for i in range(self.block_nums)] - # some config for export - self.model_dynamic_axes = { - "input_ids" : { 0: "seq_len" }, - "position_ids" : { 1: "seq_len" }, - "attention_mask" : { 3: "seq_len" } - } - - def export(self): - model = self.eval() - self.seq_len = 3 - input_ids = torch.arange(3, dtype=torch.long) - position_ids = self.get_position_ids() - attention_mask = self.get_attention_mask() - onnx_model = f'./{self.onnx_path}/bge.onnx' - torch.onnx.export( - model, (input_ids, position_ids, attention_mask), - onnx_model, - verbose=self.export_verbose, - input_names=[ - 'input_ids', - 'position_ids', - 'attention_mask' - ], - output_names=['sentence_embeddings'], - dynamic_axes=self.model_dynamic_axes, - do_constant_folding=True, - opset_version=15) - if not self.skip_slim: - slim(onnx_model, output_model=onnx_model) - if self.export_test: - self.seq_len = 4 - position_ids = self.get_position_ids() - input_ids = torch.tensor([ 101, 872, 1962, 102 ], dtype=torch.long) - attention_mask = self.get_attention_mask() - # test - original_outs = model(input_ids, position_ids, attention_mask) - ort_session = ort.InferenceSession(onnx_model, providers=['CPUExecutionProvider']) - inputs = { - 'input_ids' : input_ids.detach().numpy(), - 'position_ids' : position_ids.detach().numpy(), - 'attention_mask' : attention_mask.detach().numpy() - } - onnx_outs = ort_session.run(None, inputs)[0] - self.assert_equal(original_outs, onnx_outs) - - token_str = None - if False: # save tokenizer in mnn - self.export_tokenizer() - token_path = os.path.join(self.onnx_path, "tokenizer.txt") - token_str = open(token_path, 'rt').read() - - if self.export_mnn: - onnx2mnn(onnx_model, self.mnn_path, 8, True, bizCode=token_str) - - def build_prompt(self, query): - return f'[CLS]{query}[SEP]' - - def get_position_ids(self) -> torch.Tensor: - return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) - - def get_attention_mask(self) -> torch.Tensor: - return torch.ones([1, 1, 1, self.seq_len], dtype=torch.long) - -class LoraModule(torch.nn.Module): - def __init__(self, args): - super().__init__() - self.onnx_path = args.onnx_path - self.mnn_path = args.mnn_path - self.export_mnn = args.export_mnn - import peft - lora_weight = peft.load_peft_weights(args.path) - for k, v in lora_weight.items(): - k = k.replace('.', '/') - self.register_buffer(k, v.cpu()) - - def forward(self, dummpy): - return self._buffers - - def export(self): - onnx_model = f'./{self.onnx_path}/lora.onnx' - torch.onnx.export(self.eval(), torch.tensor([]), onnx_model) - if self.export_mnn: - onnx2mnn(onnx_model, self.mnn_path) - - -if __name__ == '__main__': - llm_models = { - 'chatglm-6b': Chatglm_6b, - 'chatglm2-6b': Chatglm2_6b, - 'codegeex2-6b': Chatglm2_6b, - 'chatglm3-6b': Chatglm3_6b, - 'glm-4-9b-chat': Chatglm3_6b, - 'Qwen-7B-Chat': Qwen_Chat, - 'Qwen-1_8B-Chat': Qwen_Chat, - 'Qwen-1_8B': Qwen_Chat, - 'Qwen-VL-Chat': Qwen_Chat, - 'Qwen1_5-0_5B-Chat': Qwen2_Chat, - 'Qwen1_5-1_8B-Chat': Qwen2_Chat, - 'Qwen1_5-4B-Chat': Qwen2_Chat, - 'Qwen1_5-7B-Chat': Qwen2_Chat, - 'Qwen2-0_5B-Instruct': Qwen2_Chat, - 'Qwen2-1_5B-Instruct': Qwen2_Chat, - 'Qwen2-7B-Instruct': Qwen2_Chat, - 'Baichuan2-7B-Chat': Llama2_7b_Chat, - 'Llama-2-7b-chat-ms': Llama2_7b_Chat, - 'Llama-3-8B-Instruct': Llama2_7b_Chat, - 'internlm-chat-7b': Llama2_7b_Chat, - 'TinyLlama-1_1B-Chat': Llama2_7b_Chat, - 'Yi-6B-Chat': Llama2_7b_Chat, - 
'deepseek-llm-7b-chat': Llama2_7b_Chat, - 'MiniCPM-1.2b': Llama2_7b_Chat, - 'MiniCPM-2.4b': Llama2_7b_Chat, - 'phi-2': phi_2, - 'bge-large-zh': bge, - 'lora': LoraModule - } - parser = argparse.ArgumentParser(description='llm_exporter', formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--path', type=str, default='THUDM/chatglm-6b', required=True, - help='path(`str` or `os.PathLike`):\nCan be either:' - '\n\t- A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO]' - '\n\t- A path to a *directory* clone from repo like `../chatglm-6b`.') - parser.add_argument('--type', type=str, choices=llm_models.keys(), default=None, - help='type(`str`, *optional*):' - '\n\tThe pretrain llm model type.' - ) - parser.add_argument('--lora_path', type=str, default=None, help='lora path, defaut is `None` mean not apply lora.') - parser.add_argument('--onnx_path', type=str, default='./onnx', help='export onnx model path, defaut is `./onnx`.') - parser.add_argument('--mnn_path', type=str, default='./mnn', help='export mnn model path, defaut is `./mnn`.') - parser.add_argument('--export_mnn', action='store_true', default=False, help='Whether or not to export mnn model after onnx.') - parser.add_argument('--export_verbose', action='store_true', default=False, help='Whether or not to export onnx with verbose.') - parser.add_argument('--export_test', action='store_true', help='Whether or not to export onnx with test using onnxruntime.') - parser.add_argument('--test', type=str, help='test model inference with query `TEST`.') - parser.add_argument('--export', action='store_true', help='export model to an `onnx` model.') - parser.add_argument('--export_split', action='store_true', - help='export model split to some `onnx` models:' - '\n\t- embedding model.' - '\n\t- block models.' - '\n\t- lm_head model.' 
- ) - parser.add_argument('--export_visual', action='store_true', help='export llm visual model to an `onnx` model.') - parser.add_argument('--export_lm', action='store_true', help='export llm lm_head to an `onnx` model.') - parser.add_argument('--export_block', type=int, help='export llm block [id] to an `onnx` model.') - parser.add_argument('--export_blocks', action='store_true', help='export llm all blocks to `onnx` models.') - parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.') - - # No use now, add invoid of call error - parser.add_argument('--export_token', action='store_true', help='export llm tokenizer to a txt file.') - parser.add_argument('--export_embed', action='store_true', help='export llm embedding to an `onnx` model.') - parser.add_argument('--embed_bf16', default=True, action='store_true', help='using `bfloat16` replace `float32` in embedding.') - parser.add_argument('--embed_bin', action='store_true', help='export embedding weight as bin file with dtype `bfloat16`') - - args = parser.parse_args() - model_path = args.path - model_type = args.type - # not sepcify model type, using path - if model_type is None: - for model in llm_models: - if model in model_path: - model_type = model - if model_type is None: - raise RuntimeError('Please specify model type.') - - # copy modeling py file to pretrain model for export - for file in glob.glob(f'./llm_models/{model_type}/*'): - shutil.copy2(file, model_path) - - llm_exporter = llm_models[model_type](args) - - # some actions - if args.test is not None: - llm_exporter.response(args.test) - - if args.export or args.export_split: - llm_exporter.export_config(args.export) - - if args.export: - llm_exporter.export() - - llm_exporter.export_tokenizer() - - llm_exporter.export_embed() - - if args.export_visual or args.export_split: - llm_exporter.export_visual() - - if args.export_lm or args.export_split: - llm_exporter.export_lm() - - if args.export_blocks or args.export_split: - llm_exporter.export_blocks() - - if args.export_block is not None: - llm_exporter.export_block(args.export_block) \ No newline at end of file diff --git a/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py b/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py deleted file mode 100755 index 5a0b69e83..000000000 --- a/transformers/llm/export/llm_models/Baichuan2-7B-Chat/modeling_baichuan.py +++ /dev/null @@ -1,825 +0,0 @@ -# Copyright 2023 Baichuan Inc. All Rights Reserved. - -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
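For orientation, a minimal usage sketch of the exporter CLI removed above. The script name `llm_export.py` and the local model directories are assumptions for illustration only; the flags and the path-based type inference mirror the argparse block and `__main__` logic in the deleted code.

    # Hedged sketch; `llm_export.py` and the model paths are placeholders, not from the patch.
    #   python llm_export.py --path ../Qwen2-1_5B-Instruct --export_split --export_mnn
    #   python llm_export.py --path ../Llama-2-7b-chat-ms --test "hello"
    #
    # When --type is omitted, the type is inferred by substring-matching --path
    # against the registered model names (last match wins, as in the original loop):
    def infer_model_type(model_path, llm_models):
        model_type = None
        for name in llm_models:
            if name in model_path:
                model_type = name
        return model_type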
- - -from .configuration_baichuan import BaichuanConfig -from .generation_utils import build_chat_input, TextIterStreamer - -import math -from typing import List, Optional, Tuple, Union -from threading import Thread - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from torch.nn import functional as F -from transformers import PreTrainedModel, PretrainedConfig -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast -from transformers.generation.utils import GenerationConfig -from transformers.utils import logging, ContextManagers - -import os -from contextlib import contextmanager -logger = logging.get_logger(__name__) - -try: - from xformers import ops as xops -except ImportError: - xops = None - logger.warning( - "Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers\npip install xformers." - ) - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
- """ - if len(mask.size()) == 3: - bsz, src_len, _ = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - expanded_mask = mask[:,None,:,:].expand(bsz, 1, tgt_len, src_len).to(dtype) - else: - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) - self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32) - freqs = torch.outer(t, self.inv_freq) - emb = torch.cat((freqs, freqs), dim=-1) - self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32) - self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32) - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. 
- if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32) - freqs = torch.outer(t, self.inv_freq) - emb = torch.cat((freqs, freqs), dim=-1) - self.cos_cached = emb.cos()[None, None, :, :].to(torch.float32).to(x.device) - self.sin_cached = emb.sin()[None, None, :, :].to(torch.float32).to(x.device) - elif self.cos_cached.device != x.device: - self.cos_cached = self.cos_cached.to(x.device) - self.sin_cached = self.sin_cached.to(x.device) - return ( - self.cos_cached[:, :, :seq_len, ...], - self.sin_cached[:, :, :seq_len, ...], - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids): - cos = torch.squeeze(cos_) # [seq_len, dim] - sin = torch.squeeze(sin_) # [seq_len, dim] - # print(f'### cos.shape = {cos.shape}, position_ids.shape = {position_ids.shape}, cos[position_ids].shape = {cos[position_ids].shape}') - # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # print(f'### q.shape = {q.shape}, cos.shape = {cos.shape}') - # cos = cos[position_ids] - # sin = sin[position_ids] - q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin) - k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin) - return q_embed.to(q.dtype), k_embed.to(k.dtype) - - -class MLP(nn.Module): - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: BaichuanConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.W_pack = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.rotary_emb = RotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def raw_atten(self, query_layer, key_layer, value_layer, attention_mask): - attn_weight = torch.softmax((query_layer @ key_layer.transpose(-2, -1) / math.sqrt(query_layer.size(-1))) + attention_mask, dim=-1) - return attn_weight @ value_layer - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - proj = self.W_pack(hidden_states) - proj = proj.reshape([1, -1, 3, 4096]).permute([2, 0, 1, 3]) - ''' - # proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose(0, -2).squeeze(-2) - query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - else: - cos, sin = rotary_pos_emb - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - if xops is not None and self.training: - attn_weights = None - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - attn_output = xops.memory_efficient_attention( - query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask() - ) - else: - attn_output = self.raw_atten(query_states, key_states, value_states, attention_mask) - attn_output = attn_output.transpose(1, 2) - ''' - #--------------- - query_states = proj[0].view(bsz, q_len, self.num_heads, self.head_dim) - key_states = proj[1].view(bsz, q_len, self.num_heads, self.head_dim) - value_states = proj[2].view(bsz, q_len, self.num_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = 
query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - attn_weights = attn_weights + attention_mask - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - #--------------- - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class DecoderLayer(nn.Module): - def __init__(self, config: BaichuanConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = Attention(config=config) - self.mlp = MLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - rotary_pos_emb=rotary_pos_emb, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -class BaichuanPreTrainedModel(PreTrainedModel): - config_class = BaichuanConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["DecoderLayer"] - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, BaichuanModel): - module.gradient_checkpointing = value - - -class BaichuanModel(BaichuanPreTrainedModel): - def __init__(self, config: BaichuanConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = 
nn.ModuleList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = 
self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class NormHead(nn.Module): - def __init__(self, hidden_size, vocab_size, bias=False): - super().__init__() - self.weight = nn.Parameter(torch.empty((vocab_size, hidden_size))) - nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) - self.first_flag = True - - def forward(self, hidden_states): - if self.training: - norm_weight = nn.functional.normalize(self.weight) - elif self.first_flag: - self.first_flag = False - self.weight = nn.Parameter(nn.functional.normalize(self.weight)) - norm_weight = self.weight - else: - norm_weight = self.weight - return nn.functional.linear(hidden_states, norm_weight) - -_init_weights = True -@contextmanager -def no_init_weights(_enable=True): - global _init_weights - old_init_weights = _init_weights - if _enable: - _init_weights = False - try: - yield - finally: - _init_weights = old_init_weights - -class BaichuanForCausalLM(BaichuanPreTrainedModel): - def __init__(self, config, *model_args, **model_kwargs): - super().__init__(config, *model_args, **model_kwargs) - self.model = BaichuanModel(config) - - self.lm_head = NormHead(config.hidden_size, config.vocab_size, bias=False) - if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']: - try: - from .quantizer import quantize_offline, init_model_weight_int4 - except ImportError: - raise ImportError(f"Needs QLinear to run quantize.") - quantize_offline(self, 4) - # Initialize weights and apply final processing - self.post_init() - - def 
get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], - *model_args, - config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None, - cache_dir: Optional[Union[str, os.PathLike]] = None, - ignore_mismatched_sizes: bool = False, - force_download: bool = False, - local_files_only: bool = False, - token: Optional[Union[str, bool]] = None, - revision: str = "main", - use_safetensors: bool = None, - **kwargs, - ): - # Load config if we don't provide a configuration - if not isinstance(config, PretrainedConfig): - config_path = config if config is not None else pretrained_model_name_or_path - config, model_kwargs = cls.config_class.from_pretrained( - config_path, - cache_dir=cache_dir, - return_unused_kwargs=True, - force_download=force_download, - resume_download=False, - proxies=None, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder="", - _from_auto=False, - _from_pipeline=None, - **kwargs, - ) - else: - model_kwargs = kwargs - - if hasattr(config, "quantization_config") and config.quantization_config['load_in_4bit']: - try: - from .quantizer import init_model_weight_int4 - from accelerate import init_empty_weights, dispatch_model, infer_auto_device_map - from accelerate.utils import CustomDtype - from accelerate.utils import get_balanced_memory - except ImportError: - raise ImportError(f"Needs import model weight init func to run quantize.") - # Instantiate model. - init_contexts = [no_init_weights(_enable=True)] - init_contexts.append(init_empty_weights()) - with ContextManagers(init_contexts): - model = cls(config) - - model_file = os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin') - state_dict = torch.load(model_file, map_location="cpu") - model.is_quantized = True - - device_map = kwargs.pop("device_map", None) - torch_dtype = kwargs.pop("torch_dtype", None) - - kwargs = {"no_split_module_classes": model._no_split_modules} - target_dtype = CustomDtype.INT4 - max_memory = get_balanced_memory( - model, - dtype=target_dtype, - low_zero=(device_map == "balanced_low_0"), - max_memory=None, - **kwargs, - ) - kwargs["max_memory"] = max_memory - - device_map = infer_auto_device_map(model, dtype=target_dtype, **kwargs) - model = init_model_weight_int4(config, model, state_dict) - - # Set model in evaluation mode to deactivate DropOut modules by default - model.eval() - # If it is a model with generation capabilities, attempt to load the generation config - if model.can_generate(): - try: - model.generation_config = GenerationConfig.from_pretrained( - pretrained_model_name_or_path, - cache_dir=cache_dir, - force_download=force_download, - resume_download=False, - proxies=None, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder="", - _from_auto=False, - _from_pipeline=None, - **kwargs, - ) - except (OSError, TypeError): - logger.info( - "Generation config file not found, using a generation config created from the model config." 
- ) - pass - - if device_map is not None: - dispatch_model(model, device_map=device_map) - - return model - return super(BaichuanForCausalLM, cls).from_pretrained(pretrained_model_name_or_path, *model_args, - config=config, cache_dir=cache_dir, ignore_mismatched_sizes=ignore_mismatched_sizes, - force_download=force_download, local_files_only=local_files_only, token=token, revision=revision, - use_safetensors=use_safetensors, **kwargs) - - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - softmax_normalizer = shift_logits.max(-1).values ** 2 - z_loss = self.config.z_loss_weight * softmax_normalizer.mean() - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + z_loss - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": 
kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) - return reordered_past - - def quantize(self, bits: int): - try: - from .quantizer import quantize_online - except ImportError: - raise ImportError(f"Needs QLinear to run quantize.") - return quantize_online(self, bits) - - def chat(self, tokenizer, messages: List[dict], stream=False, - generation_config: Optional[GenerationConfig]=None): - generation_config = generation_config or self.generation_config - input_ids = build_chat_input(self, tokenizer, messages, generation_config.max_new_tokens) - if stream: - streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) - Thread(target=self.generate, kwargs=dict( - inputs=input_ids, streamer=streamer, - generation_config=generation_config, - )).start() - return streamer - else: - outputs = self.generate(input_ids, generation_config=generation_config) - print(outputs[0]) - response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True) - return response diff --git a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/config.json b/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/config.json deleted file mode 100755 index e2ba91313..000000000 --- a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "auto_map": { - "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM" - }, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 32, - "pad_token_id": 0, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "float16", - "transformers_version": "4.31.0.dev0", - "use_cache": true, - "vocab_size": 32000 -} diff --git a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/configuration_llama.py b/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/configuration_llama.py deleted file mode 100644 index 1b0e9c357..000000000 --- a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/configuration_llama.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" LLaMA model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class LlamaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the LLaMA-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`LlamaModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - pretraining_tp (`int`, *optional*, defaults to `1`): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. 
Currently supports three scaling - strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format - is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - - Example: - - ```python - >>> from transformers import LlamaModel, LlamaConfig - - >>> # Initializing a LLaMA llama-7b style configuration - >>> configuration = LlamaConfig() - - >>> # Initializing a model from the llama-7b style configuration - >>> model = LlamaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "llama" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_scaling=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") diff --git a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py b/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py deleted file mode 100644 index 493b040b7..000000000 --- a/transformers/llm/export/llm_models/Llama-2-7b-chat-ms/modeling_llama.py +++ /dev/null @@ -1,1040 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_llama import LlamaConfig - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. 
- """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
- # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - else: - cos, sin = rotary_pos_emb - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = 
(key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: 
Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - rotary_pos_emb=rotary_pos_emb, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": 
kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/config.json b/transformers/llm/export/llm_models/Llama-3-8B-Instruct/config.json deleted file mode 100755 index d9c36dfca..000000000 --- a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct", - "architectures": [ - "LlamaForCausalLM" - ], - "auto_map": { - "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM" - }, - "attention_bias": false, - "attention_dropout": 0.0, - "bos_token_id": 128000, - "eos_token_id": 128001, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 14336, - "max_position_embeddings": 8192, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 8, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 500000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.38.2", - "use_cache": true, - "vocab_size": 128256 -} diff --git a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/configuration_llama.py 
b/transformers/llm/export/llm_models/Llama-3-8B-Instruct/configuration_llama.py deleted file mode 100644 index 1b0e9c357..000000000 --- a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/configuration_llama.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" LLaMA model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class LlamaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the LLaMA-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`LlamaModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - pretraining_tp (`int`, *optional*, defaults to `1`): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. 
This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format - is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - - Example: - - ```python - >>> from transformers import LlamaModel, LlamaConfig - - >>> # Initializing a LLaMA llama-7b style configuration - >>> configuration = LlamaConfig() - - >>> # Initializing a model from the llama-7b style configuration - >>> model = LlamaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "llama" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_scaling=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the 
`rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") diff --git a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py b/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py deleted file mode 100644 index 493b040b7..000000000 --- a/transformers/llm/export/llm_models/Llama-3-8B-Instruct/modeling_llama.py +++ /dev/null @@ -1,1040 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_llama import LlamaConfig - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. 
- """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
- # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - else: - cos, sin = rotary_pos_emb - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = 
(key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: 
Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - rotary_pos_emb=rotary_pos_emb, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": 
kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/MiniCPM-1.2b/config.json b/transformers/llm/export/llm_models/MiniCPM-1.2b/config.json deleted file mode 100644 index 0bfa72faa..000000000 --- a/transformers/llm/export/llm_models/MiniCPM-1.2b/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "auto_map": { - "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM" - }, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.1, - "intermediate_size": 3840, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 24, - "num_hidden_layers": 52, - "num_key_value_heads": 8, - "pad_token_id": 0, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.31.0.dev0", - "use_cache": true, - "vocab_size": 73440 -} \ No newline at end of file diff --git a/transformers/llm/export/llm_models/MiniCPM-1.2b/configuration_llama.py b/transformers/llm/export/llm_models/MiniCPM-1.2b/configuration_llama.py deleted file mode 100644 index 1b0e9c357..000000000 --- 
a/transformers/llm/export/llm_models/MiniCPM-1.2b/configuration_llama.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" LLaMA model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class LlamaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the LLaMA-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`LlamaModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - pretraining_tp (`int`, *optional*, defaults to `1`): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). 
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format - is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - - Example: - - ```python - >>> from transformers import LlamaModel, LlamaConfig - - >>> # Initializing a LLaMA llama-7b style configuration - >>> configuration = LlamaConfig() - - >>> # Initializing a model from the llama-7b style configuration - >>> model = LlamaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "llama" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_scaling=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") diff --git a/transformers/llm/export/llm_models/MiniCPM-1.2b/convert_minicpm_to_llama.py b/transformers/llm/export/llm_models/MiniCPM-1.2b/convert_minicpm_to_llama.py deleted file mode 100644 index 7e6a56d5f..000000000 --- a/transformers/llm/export/llm_models/MiniCPM-1.2b/convert_minicpm_to_llama.py +++ /dev/null @@ -1,38 +0,0 @@ -from transformers import AutoModelForCausalLM, AutoTokenizer -import torch -import math -#torch.manual_seed(0) - -path = "path-to-MiniCPM-1B-sft-bf16" -tokenizer = AutoTokenizer.from_pretrained(path) -model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16, trust_remote_code=True) - -responds, history = model.chat(tokenizer, "山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?", temperature=0.3, top_p=0.5) -print(responds) - - -state_dict = model.state_dict() -print(state_dict.keys()) - -scale_emb = 12 -dim_model_base = 256 -scale_depth = 1.4 -num_layers = 52 -hidden_size = 1536 - -new_emb = state_dict["model.embed_tokens.weight"] * scale_emb -state_dict["model.embed_tokens.weight"] = new_emb - -new_emb = state_dict["lm_head.weight"] / (hidden_size / dim_model_base) -state_dict["lm_head.weight"] = new_emb - -for i in range(num_layers): - attn_out_name = f"model.layers.{i}.self_attn.o_proj.weight" - new_weight = state_dict[attn_out_name] * (scale_depth / math.sqrt(num_layers)) - state_dict[attn_out_name] = new_weight - - ffn_down_proj_name = f"model.layers.{i}.mlp.down_proj.weight" - new_weight = state_dict[ffn_down_proj_name] * (scale_depth / math.sqrt(num_layers)) - state_dict[ffn_down_proj_name] = new_weight - -torch.save(state_dict, "pytorch_model_llama.bin") diff --git a/transformers/llm/export/llm_models/MiniCPM-1.2b/modeling_llama.py b/transformers/llm/export/llm_models/MiniCPM-1.2b/modeling_llama.py deleted file mode 100644 index 8c562c604..000000000 --- a/transformers/llm/export/llm_models/MiniCPM-1.2b/modeling_llama.py +++ /dev/null @@ -1,1010 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_llama import LlamaConfig - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. - # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)] - key_states = torch.cat(key_states, 
dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = 
False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": 
kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/MiniCPM-2.4b/config.json b/transformers/llm/export/llm_models/MiniCPM-2.4b/config.json deleted file mode 100644 index 541a5f8c4..000000000 --- a/transformers/llm/export/llm_models/MiniCPM-2.4b/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "auto_map": { - "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM" - }, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 2304, - "initializer_range": 0.1, - "intermediate_size": 5760, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 36, - "num_hidden_layers": 40, - "num_key_value_heads": 36, - "pad_token_id": 0, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.31.0.dev0", - "use_cache": true, - "vocab_size": 122756 -} \ No newline at end of file diff --git a/transformers/llm/export/llm_models/MiniCPM-2.4b/configuration_llama.py b/transformers/llm/export/llm_models/MiniCPM-2.4b/configuration_llama.py deleted file mode 100644 index 1b0e9c357..000000000 --- 
a/transformers/llm/export/llm_models/MiniCPM-2.4b/configuration_llama.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" LLaMA model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class LlamaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the LLaMA-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the - `input_ids` passed when calling [`LlamaModel`]. - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details check out [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - pretraining_tp (`int`, *optional*, defaults to `1`): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232).
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-6): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format - is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - - Example: - - ```python - >>> from transformers import LlamaModel, LlamaConfig - - >>> # Initializing a LLaMA llama-7b style configuration - >>> configuration = LlamaConfig() - - >>> # Initializing a model from the llama-7b style configuration - >>> model = LlamaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "llama" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_scaling=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration.
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") diff --git a/transformers/llm/export/llm_models/MiniCPM-2.4b/modeling_llama.py b/transformers/llm/export/llm_models/MiniCPM-2.4b/modeling_llama.py deleted file mode 100644 index 8c562c604..000000000 --- a/transformers/llm/export/llm_models/MiniCPM-2.4b/modeling_llama.py +++ /dev/null @@ -1,1010 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_llama import LlamaConfig - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. 
- """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
- # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = 
repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. 
- - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - 
position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - 
position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = 
kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen-1_8B-Chat/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-1_8B-Chat/modeling_qwen.py deleted file mode 100755 index 5138eea76..000000000 --- a/transformers/llm/export/llm_models/Qwen-1_8B-Chat/modeling_qwen.py +++ /dev/null @@ -1,1406 +0,0 @@ -# Copyright (c) Alibaba Cloud. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
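For reference, a minimal standalone sketch of the last-non-padding-token pooling used by the sequence-classification head above (toy shapes; pad_token_id assumed to be 0; not taken from the deleted file):

import torch

pad_token_id = 0
input_ids = torch.tensor([[11, 12, 13, 0, 0],      # two padded positions
                          [21, 22, 23, 24, 25]])   # no padding
batch_size, seq_len = input_ids.shape
num_labels = 3
logits = torch.randn(batch_size, seq_len, num_labels)  # per-token scores

# Index of the last token in each row that is not padding.
sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1   # tensor([2, 4])
pooled_logits = logits[torch.arange(batch_size), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3])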
- -import copy -import importlib -import math -import pathlib -from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -import warnings - -from torch.nn import CrossEntropyLoss -from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList -from transformers.generation.logits_process import LogitsProcessorList - -if TYPE_CHECKING: - from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import GenerateOutput -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging - -try: - from einops import rearrange -except ImportError: - rearrange = None -from torch import nn - -SUPPORT_CUDA = torch.cuda.is_available() -SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported() -SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7 -SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2 - - -from .configuration_qwen import QWenConfig -from .qwen_generation_utils import ( - HistoryType, - make_context, - decode_tokens, - get_stop_words_ids, - StopWordsLogitsProcessor, -) - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "qwen" -_CONFIG_FOR_DOC = "QWenConfig" - -QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"] - -_ERROR_BAD_CHAT_FORMAT = """\ -We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml". -If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat(). -我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。 -如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。 -""" - -_SENTINEL = object() -_ERROR_STREAM_IN_CHAT = """\ -Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True). -向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。 -""" - -_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED = """\ -We detect you have activated flash attention support, but running model computation on CPU. Please make sure that your input data has been placed on GPU. If you actually want to run CPU computation, please following the readme and set device_map="cpu" to disable flash attention when loading the model (calling AutoModelForCausalLM.from_pretrained). 
-检测到您的模型已激活了flash attention支持,但正在执行CPU运算任务。如使用flash attention,请您确认模型输入已经传到GPU上。如果您确认要执行CPU运算,请您在载入模型(调用AutoModelForCausalLM.from_pretrained)时,按照readme说法,指定device_map="cpu"以禁用flash attention。 -""" - -apply_rotary_emb_func = None -rms_norm = None -flash_attn_unpadded_func = None -flash_attn_func = None - -def _import_flash_attn(): - global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func, flash_attn_func - try: - from flash_attn.layers.rotary import apply_rotary_emb_func as __apply_rotary_emb_func - apply_rotary_emb_func = __apply_rotary_emb_func - except ImportError: - logger.warn( - "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary" - ) - - try: - from flash_attn.ops.rms_norm import rms_norm as __rms_norm - rms_norm = __rms_norm - except ImportError: - logger.warn( - "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm" - ) - - try: - import flash_attn - _flash_attn_func = None - if not hasattr(flash_attn, '__version__'): - from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func - else: - if int(flash_attn.__version__.split(".")[0]) >= 2: - if int(flash_attn.__version__.split(".")[1]) >= 1: - from flash_attn.flash_attn_interface import flash_attn_func as _flash_attn_func - from flash_attn.flash_attn_interface import flash_attn_varlen_func as __flash_attn_unpadded_func - else: - from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func - flash_attn_unpadded_func = __flash_attn_unpadded_func - flash_attn_func = _flash_attn_func - except ImportError: - logger.warn( - "Warning: import flash_attn fail, please install FlashAttention to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention" - ) - -def quantize_cache_v(fdata, bits, qmax, qmin): - # b, s, head, h-dim->b, head, s, h-dim - qtype = torch.uint8 - device = fdata.device - shape = fdata.shape - - fdata_cal = torch.flatten(fdata, 2) - fmax = torch.amax(fdata_cal, dim=-1, keepdim=True) - fmin = torch.amin(fdata_cal, dim=-1, keepdim=True) - # Compute params - if qmax.device != fmax.device: - qmax = qmax.to(device) - qmin = qmin.to(device) - scale = (fmax - fmin) / (qmax - qmin) - zero = qmin - fmin / scale - scale = scale.unsqueeze(-1).repeat(1,1,shape[2],1).contiguous() - zero = zero.unsqueeze(-1).repeat(1,1,shape[2],1).contiguous() - # Quantize - res_data = fdata / scale + zero - qdata = torch.clamp(res_data, qmin, qmax).to(qtype) - return qdata.contiguous(), scale, zero - -def dequantize_cache_torch(qdata, scale, zero): - data = scale * (qdata - zero) - return data - -class FlashSelfAttention(torch.nn.Module): - def __init__( - self, - causal=False, - softmax_scale=None, - attention_dropout=0.0, - ): - super().__init__() - assert flash_attn_unpadded_func is not None, ( - "Please install FlashAttention first, " "e.g., with pip install flash-attn" - ) - assert ( - rearrange is not None - ), "Please install einops first, e.g., with pip install einops" - self.causal = causal - self.softmax_scale = softmax_scale - self.dropout_p = attention_dropout - - def unpad_input(self, hidden_states, attention_mask): - valid_mask = attention_mask.squeeze(1).squeeze(1).eq(0) - seqlens_in_batch = valid_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(valid_mask.flatten(), 
as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - hidden_states = hidden_states[indices] - return hidden_states, indices, cu_seqlens, max_seqlen_in_batch - - def pad_input(self, hidden_states, indices, batch, seqlen): - output = torch.zeros(batch * seqlen, *hidden_states.shape[1:], device=hidden_states.device, - dtype=hidden_states.dtype) - output[indices] = hidden_states - return rearrange(output, '(b s) ... -> b s ...', b=batch) - - def forward(self, q, k, v, attention_mask=None): - assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v))) - assert all((i.is_cuda for i in (q, k, v))) - batch_size, seqlen_q = q.shape[0], q.shape[1] - seqlen_k = k.shape[1] - seqlen_out = seqlen_q - - if flash_attn_func is not None and batch_size == 1: - dropout_p = self.dropout_p if self.training else 0 - output = flash_attn_func(q, k, v, dropout_p, softmax_scale=self.softmax_scale, causal=self.causal) - return output - - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * seqlen_q, - step=seqlen_q, - dtype=torch.int32, - device=q.device, - ) - - if batch_size > 1 and attention_mask is not None: - k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask) - if q.size(0) == v.size(0): - q = q[indices_k] - cu_seqlens_q = cu_seqlens_k - seqlen_q = seqlen_k - v = v[indices_k] - else: - cu_seqlens_k = torch.arange( - 0, - (batch_size + 1) * seqlen_k, - step=seqlen_k, - dtype=torch.int32, - device=q.device, - ) - - if self.training: - assert seqlen_k == seqlen_q - is_causal = self.causal - dropout_p = self.dropout_p - else: - is_causal = seqlen_q == seqlen_k - dropout_p = 0 - - output = flash_attn_unpadded_func( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - seqlen_q, - seqlen_k, - dropout_p, - softmax_scale=self.softmax_scale, - causal=is_causal, - ) - if batch_size > 1 and attention_mask is not None and seqlen_q == seqlen_k: - output = self.pad_input(output, indices_k, batch_size, seqlen_out) - else: - new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:] - output = output.view(new_shape) - return output - - -class QWenAttention(nn.Module): - def __init__(self, config): - super().__init__() - - self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) - self.seq_length = config.seq_length - - self.hidden_size = config.hidden_size - self.split_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - - self.use_flash_attn = config.use_flash_attn - self.scale_attn_weights = True - - self.projection_size = config.kv_channels * config.num_attention_heads - - assert self.projection_size % config.num_attention_heads == 0 - self.hidden_size_per_attention_head = ( - self.projection_size // config.num_attention_heads - ) - - self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size) - - self.c_proj = nn.Linear( - config.hidden_size, self.projection_size, bias=not config.no_bias - ) - - self.is_fp32 = not (config.bf16 or config.fp16) - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - ): - self.core_attention_flash = FlashSelfAttention( - causal=True, attention_dropout=config.attn_dropout_prob - ) - self.bf16 = config.bf16 - - self.use_dynamic_ntk = config.use_dynamic_ntk - self.use_logn_attn = config.use_logn_attn - - logn_list = 
[ - math.log(i, self.seq_length) if i > self.seq_length else 1 - for i in range(1, 32768) - ] - logn_tensor = torch.tensor(logn_list)[None, :, None, None] - self.register_buffer("logn_tensor", logn_tensor, persistent=False) - - self.attn_dropout = nn.Dropout(config.attn_dropout_prob) - self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False - self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False - self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False - cache_dtype = torch.float - if self.bf16: - cache_dtype=torch.bfloat16 - elif config.fp16: - cache_dtype = torch.float16 - self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype) - self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype) - - if config.use_cache_quantization and config.use_cache_kernel: - # pre check if the support files existing - module_root = pathlib.Path(__file__).parent - src_files = ("cache_autogptq_cuda_256.cpp", "cache_autogptq_cuda_kernel_256.cu") - if any(not (module_root/src).is_file() for src in src_files): - warnings.warn("KV cache kernel source files (.cpp and .cu) not found.") - self.cache_kernels = None - else: - try: - from .cpp_kernels import cache_autogptq_cuda_256 - self.cache_kernels = cache_autogptq_cuda_256 - except ImportError: - warnings.warn("Failed to import KV cache kernels.") - self.cache_kernels = None - - def _attn(self, query, key, value, no_use_mask, attention_mask=None, head_mask=None): - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - attn_weights = attn_weights / torch.full( - [], - value.size(-1) ** 0.5, - dtype=attn_weights.dtype, - device=attn_weights.device, - ) - - query_length, key_length = query.size(-2), key.size(-2) - if attention_mask is None: - causal_mask = self.bias[ - :, :, key_length - query_length : key_length, :key_length - ] - else: - causal_mask = attention_mask - mask_value = torch.finfo(attn_weights.dtype).min - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to( - attn_weights.device - ) - attn_weights = torch.where( - causal_mask, attn_weights.to(attn_weights.dtype), mask_value - ) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - attn_output = attn_output.transpose(1, 2) - - return attn_output, attn_weights - - def __attn(self, query, key, value, causal_mask=None, attention_mask=None, head_mask=None): - device = query.device - if self.use_cache_quantization: - qk, qk_scale, qk_zero = key - if self.use_cache_kernel and self.cache_kernels is not None: - shape = query.shape[:-1] + (qk.shape[-2],) - attn_weights = torch.zeros(shape, dtype=torch.float16, device=device) - self.cache_kernels.vecquant8matmul_batched_faster_old( - query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(), - qk.transpose(-1, -2).contiguous(), - attn_weights, - qk_scale.contiguous() if qk_scale.dtype == torch.float16 else qk_scale.to(torch.float16).contiguous(), - qk_zero.contiguous()if qk_zero.dtype == torch.float16 else qk_zero.to(torch.float16).contiguous()) - # attn_weights = attn_weights.to(query.dtype).contiguous() - else: - key = dequantize_cache_torch(qk, qk_scale, 
qk_zero) - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - else: - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - if self.use_cache_quantization: - size_temp = value[0].size(-1) - else: - size_temp = value.size(-1) - attn_weights = attn_weights / (size_temp ** 0.5) - - mask_value = torch.finfo(attn_weights.dtype).min - if causal_mask is not None: - attn_weights = torch.where( - causal_mask, attn_weights.to(attn_weights.dtype), mask_value - ) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - if self.softmax_in_fp32: - attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1) - else: - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - attn_weights = attn_weights.type(query.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - if self.use_cache_quantization: - qv, qv_scale, qv_zero = value - if self.use_cache_kernel and self.cache_kernels is not None: - shape = attn_weights.shape[:-1] + (query.shape[-1],) - attn_output = torch.zeros(shape, dtype=torch.float16, device=device) - self.cache_kernels.vecquant8matmul_batched_column_compression_faster_old( - attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(), - qv.contiguous(), # dtype: int32 - attn_output, - qv_scale.contiguous() if qv_scale.dtype == torch.float16 else qv_scale.to(torch.float16).contiguous(), - qv_zero.contiguous() if qv_zero.dtype == torch.float16 else qv_zero.to(torch.float16).contiguous()) - if attn_output.dtype != query.dtype: - attn_output = attn_output.to(query.dtype) - attn_weights = attn_weights.to(query.dtype) - else: - value = dequantize_cache_torch(qv, qv_scale, qv_zero) - attn_output = torch.matmul(attn_weights, value) - else: - attn_output = torch.matmul(attn_weights, value) - - attn_output = attn_output.transpose(1, 2) - - return attn_output, attn_weights - - def _split_heads(self, tensor, num_heads, attn_head_size): - new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) - tensor = tensor.view(new_shape) - return tensor - - def _merge_heads(self, tensor, num_heads, attn_head_size): - tensor = tensor.contiguous() - new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) - return tensor.view(new_shape) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - rotary_pos_emb_list: Optional[List[torch.Tensor]] = None, - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ): - mixed_x_layer = self.c_attn(hidden_states) - - query, key, value = mixed_x_layer.split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - if rotary_pos_emb_list is not None: - cur_len = query.shape[1] - if True: - rotary_pos_emb = rotary_pos_emb_list - rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] - rotary_pos_emb = (rotary_pos_emb,) * 2 - q_pos_emb, k_pos_emb = rotary_pos_emb - # Slice the pos emb for current inference - query = apply_rotary_pos_emb(query, q_pos_emb) - key = 
apply_rotary_pos_emb(key, k_pos_emb) - else: - query_list = [] - key_list = [] - for i, rotary_pos_emb in enumerate(rotary_pos_emb_list): - rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] - rotary_pos_emb = (rotary_pos_emb,) * 2 - q_pos_emb, k_pos_emb = rotary_pos_emb - # Slice the pos emb for current inference - query_list += [apply_rotary_pos_emb(query[i:i+1, :, :], q_pos_emb)] - key_list += [apply_rotary_pos_emb(key[i:i+1, :, :], k_pos_emb)] - query = torch.cat(query_list, dim=0) - key = torch.cat(key_list, dim=0) - - if self.use_cache_quantization: - key = quantize_cache_v(key.permute(0, 2, 1, 3), - bits=8, - qmin=self.cache_qmin, - qmax=self.cache_qmax) - value = quantize_cache_v(value.permute(0, 2, 1, 3), - bits=8, - qmin=self.cache_qmin, - qmax=self.cache_qmax) - - - if layer_past is not None: - past_key, past_value = layer_past[0], layer_past[1] - if self.use_cache_quantization: - # use_cache_quantization: - # present=((q_key,key_scale,key_zero_point), - # (q_value,value_scale,value_zero_point)) - key = (torch.cat((past_key[0], key[0]), dim=2), - torch.cat((past_key[1], key[1]), dim=2), - torch.cat((past_key[2], key[2]), dim=2)) - value = (torch.cat((past_value[0], value[0]), dim=2), - torch.cat((past_value[1], value[1]), dim=2), - torch.cat((past_value[2], value[2]), dim=2)) - else: - # not use_cache_quantization: - # present=(key,value) - key = torch.cat((past_key, key), dim=1) - value = torch.cat((past_value, value), dim=1) - - if use_cache: - present = (key, value) - else: - present = None - - key_size = key[0].size(2) if self.use_cache_quantization else key.size(1) - if key_size > self.seq_length and self.use_logn_attn and not self.training: - if self.use_cache_quantization: - seq_start = key[0].size(2) - query.size(1) - seq_end = key[0].size(2) - else: - seq_start = key.size(1) - query.size(1) - seq_end = key.size(1) - logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query) - query = query * logn_tensor.expand_as(query) - - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - and query.is_cuda - ): - q, k, v = query, key, value - attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask) - else: - key_size = key[0].size(2) if self.use_cache_quantization else key.size(1) - if query.size(1) == key_size: - causal_mask = torch.tril( - torch.ones((key_size, key_size), dtype=torch.bool, device=query.device) - ).view(1, 1, key_size, key_size) - else: - causal_mask = None - query = query.permute(0, 2, 1, 3) - if not self.use_cache_quantization: - key = key.permute(0, 2, 1, 3) - value = value.permute(0, 2, 1, 3) - if ( - causal_mask is None - and self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - and not query.is_cuda - ): - raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED) - - if not self.use_cache_quantization and SUPPORT_TORCH2 and False: - if attention_mask is not None: - # attention_mask = attention_mask.expand( - # -1, -1, causal_mask.size(2), -1 - # ) - # if causal_mask is not None: - # attention_mask.masked_fill(~causal_mask, torch.finfo(query.dtype).min) - causal_mask = attention_mask - else: - attention_mask = causal_mask - attn_output = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask - ).transpose(1, 2) - attn_weight = None - else: - attn_output, attn_weight = self._attn( - # query, key, value, causal_mask, attention_mask, head_mask - query, key, value, attention_mask, attention_mask, head_mask - 
) - context_layer = self._merge_heads( - attn_output, self.num_heads, self.head_dim - ) - - attn_output = self.c_proj(context_layer) - - outputs = (attn_output, present) - if output_attentions: - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - ): - raise ValueError("Cannot output attentions while using flash-attn") - elif not self.use_cache_quantization and SUPPORT_TORCH2: - raise ValueError("Cannot output attentions while using scaled_dot_product_attention") - else: - outputs += (attn_weight,) - - return outputs - - -class QWenMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.w1 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - self.w2 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - ff_dim_in = config.intermediate_size // 2 - self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias) - - def forward(self, hidden_states): - a1 = self.w1(hidden_states) - a2 = self.w2(hidden_states) - intermediate_parallel = a1 * F.silu(a2) - output = self.c_proj(intermediate_parallel) - return output - - -class QWenBlock(nn.Module): - def __init__(self, config): - super().__init__() - hidden_size = config.hidden_size - self.bf16 = config.bf16 - - self.ln_1 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - self.attn = QWenAttention(config) - self.ln_2 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - - self.mlp = QWenMLP(config) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - rotary_pos_emb: Optional[List[List[torch.Tensor]]] = None, - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ): - layernorm_output = self.ln_1(hidden_states) - - attn_outputs = self.attn( - layernorm_output, - rotary_pos_emb, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - attn_output = attn_outputs[0] - - outputs = attn_outputs[1:] - - residual = hidden_states - layernorm_input = attn_output + residual - - layernorm_output = self.ln_2(layernorm_input) - - residual = layernorm_input - mlp_output = self.mlp(layernorm_output) - hidden_states = residual + mlp_output - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs - - -class QWenPreTrainedModel(PreTrainedModel): - config_class = QWenConfig - base_model_prefix = "transformer" - is_parallelizable = False - supports_gradient_checkpointing = True - _no_split_modules = ["QWenBlock"] - _skip_keys_device_placement = "past_key_values" - - def __init__(self, *inputs, **kwargs): - super().__init__(*inputs, **kwargs) - - def _init_weights(self, module): - """Initialize the weights.""" - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, RMSNorm): 
- module.weight.data.fill_(1.0) - - for name, p in module.named_parameters(): - if name == "c_proj.weight": - p.data.normal_( - mean=0.0, - std=( - self.config.initializer_range - / math.sqrt(2 * self.config.num_hidden_layers) - ), - ) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, QWenModel): - module.gradient_checkpointing = value - - -class QWenModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - - def __init__(self, config): - super().__init__(config) - self.vocab_size = config.vocab_size - self.num_hidden_layers = config.num_hidden_layers - self.embed_dim = config.hidden_size - self.use_cache_quantization = self.config.use_cache_quantization if hasattr(self.config, 'use_cache_quantization') else False - - self.gradient_checkpointing = False - self.use_dynamic_ntk = config.use_dynamic_ntk - self.seq_length = config.seq_length - - self.wte = nn.Embedding(self.vocab_size, self.embed_dim) - - self.drop = nn.Dropout(config.emb_dropout_prob) - - if config.rotary_pct == 1.0: - self.rotary_ndims = None - else: - assert config.rotary_pct < 1 - self.rotary_ndims = int( - config.kv_channels * config.rotary_pct - ) - dim = ( - self.rotary_ndims - if self.rotary_ndims is not None - else config.kv_channels - ) - self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base) - - self.use_flash_attn = config.use_flash_attn - self.is_fp32 = not (config.bf16 or config.fp16) - - self.h = nn.ModuleList( - [ - QWenBlock( - config - ) - for i in range(config.num_hidden_layers) - ] - ) - self.ln_f = RMSNorm( - self.embed_dim, - eps=config.layer_norm_epsilon, - ) - - self.post_init() - - def get_input_embeddings(self): - return self.wte - - def set_input_embeddings(self, new_embeddings): - self.wte = new_embeddings - - def get_ntk_alpha(self, true_seq_len): - context_value = math.log(true_seq_len / self.seq_length, 2) + 1 - ntk_alpha = 2 ** math.ceil(context_value) - 1 - ntk_alpha = max(ntk_alpha, 1) - return ntk_alpha - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - 
raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - if self.use_cache_quantization: - past_length = past_key_values[0][0][0].size(2) - else: - past_length = past_key_values[0][0].size(-2) - if position_ids is None: - position_ids = torch.arange( - past_length, - input_shape[-1] + past_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - attention_mask = attention_mask[:, None, None, :] - attention_mask = attention_mask.to(dtype=self.dtype) - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - encoder_attention_mask = None - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - hidden_states = inputs_embeds - - kv_seq_len = hidden_states.size()[1] - if past_key_values[0] is not None: - # past key values[0][0] shape: bs * seq_len * head_num * dim - if self.use_cache_quantization: - kv_seq_len += past_key_values[0][0][0].shape[2] - else: - kv_seq_len += past_key_values[0][0].shape[1] - - if self.training or not self.use_dynamic_ntk: - ntk_alpha_list = [1.0] - elif kv_seq_len != hidden_states.size()[1]: - ntk_alpha_list = self.rotary_emb._ntk_alpha_cached_list - else: - ntk_alpha_list = [] - if attention_mask is not None and kv_seq_len > self.seq_length: - true_seq_lens = attention_mask.squeeze(1).squeeze(1).eq(0).sum(dim=-1, dtype=torch.int32) - for i in range(hidden_states.size()[0]): - true_seq_len = true_seq_lens[i].item() - ntk_alpha = self.get_ntk_alpha(true_seq_len) - ntk_alpha_list.append(ntk_alpha) - else: - ntk_alpha = self.get_ntk_alpha(kv_seq_len) - ntk_alpha_list.append(ntk_alpha) - self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list - rotary_pos_emb_list = [ - self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha) for ntk_alpha in ntk_alpha_list - ] - - hidden_states = self.drop(hidden_states) - output_shape = input_shape + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - rotary_pos_emb_list, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - rotary_pos_emb=rotary_pos_emb_list, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - hidden_states = self.ln_f(hidden_states) - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v for v in [hidden_states, presents, all_hidden_states] if v is not None - ) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class QWenLMHeadModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"] - - def __init__(self, config): - super().__init__(config) - assert ( - config.bf16 + config.fp16 + config.fp32 <= 1 - ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true" - - autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0 - - if autoset_precision: - if SUPPORT_BF16: - logger.warn( - "The model is automatically converting to bf16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." - ) - config.bf16 = True - elif SUPPORT_FP16: - logger.warn( - "The model is automatically converting to fp16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." 
- ) - config.fp16 = True - else: - config.fp32 = True - - if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16: - logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".") - if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16: - logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster") - if config.fp32: - if SUPPORT_BF16: - logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".") - elif SUPPORT_FP16: - logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".") - - if config.use_flash_attn == "auto": - if config.bf16 or config.fp16: - logger.warn("Try importing flash-attention for faster inference...") - config.use_flash_attn = True - else: - config.use_flash_attn = False - if config.use_flash_attn and config.fp32: - logger.warn("Flash attention will be disabled because it does NOT support fp32.") - - if config.use_flash_attn: - _import_flash_attn() - - self.transformer = QWenModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - if config.bf16: - self.transformer.bfloat16() - self.lm_head.bfloat16() - if config.fp16: - self.transformer.half() - self.lm_head.half() - self.post_init() - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1].unsqueeze(-1) - - if input_ids.size(0) == 1: - attention_mask = None - else: - attention_mask = kwargs.get("attention_mask", None) - - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - 
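For reference, a minimal standalone sketch of the shifted next-token loss computed a few lines below (toy vocabulary size and labels, illustrative values only):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 8
lm_logits = torch.randn(1, 4, vocab_size)        # (batch, seq_len, vocab)
labels = torch.tensor([[1, 2, 3, 4]])            # (batch, seq_len)

# Predictions at position i are scored against the token at position i + 1.
shift_logits = lm_logits[..., :-1, :].contiguous()   # positions 0..2
shift_labels = labels[..., 1:].contiguous()          # tokens at positions 1..3
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())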
hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - labels = labels.to(lm_logits.device) - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) - ) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor - ) -> Tuple[Tuple[torch.Tensor]]: - - return tuple( - tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past - ) - for layer_past in past_key_values - ) - - def chat( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - stream: Optional[bool] = _SENTINEL, - stop_words_ids: Optional[List[List[int]]] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Tuple[str, HistoryType]: - generation_config = generation_config if generation_config is not None else self.generation_config - - assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - else: - # make a copy of the user's input such that is is left untouched - history = copy.deepcopy(history) - - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - input_ids = torch.tensor([context_tokens]).to(self.device) - outputs = self.generate( - input_ids, - stop_words_ids=stop_words_ids, - return_dict_in_generate=False, - generation_config=generation_config, - **kwargs, - ) - - response = decode_tokens( - outputs[0], - tokenizer, - raw_text_len=len(raw_text), - context_length=len(context_tokens), - chat_format=generation_config.chat_format, - verbose=False, - errors='replace' - ) - - # as history is a copy of the user inputs, - # we can always return the new turn to the user. 
- # separating input history and output history also enables the user - # to implement more complex history management - history.append((query, response)) - - return response, history - - def chat_stream( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - stop_words_ids: Optional[List[List[int]]] = None, - logits_processor: Optional[LogitsProcessorList] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Generator[str, Any, None]: - generation_config = generation_config if generation_config is not None else self.generation_config - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - input_ids = torch.tensor([context_tokens]).to(self.device) - - from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig - self.__class__.generate_stream = NewGenerationMixin.generate - self.__class__.sample_stream = NewGenerationMixin.sample_stream - stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True) - - def stream_generator(): - outputs = [] - for token in self.generate_stream( - input_ids, - return_dict_in_generate=False, - generation_config=stream_config, - logits_processor=logits_processor, - seed=-1, - **kwargs): - outputs.append(token.item()) - yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore') - - return stream_generator() - - def generate( - self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[ - Callable[[int, torch.Tensor], List[int]] - ] = None, - synced_gpus: Optional[bool] = None, - assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - generation_config = generation_config if generation_config is not None else self.generation_config - - # Process stop_words_ids. 
- stop_words_ids = kwargs.pop("stop_words_ids", None) - if stop_words_ids is None and generation_config is not None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - if stop_words_ids is None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - - return super().generate( - inputs, - generation_config=generation_config, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - synced_gpus=synced_gpus, - assistant_model=assistant_model, - streamer=streamer, - **kwargs, - ) - - -class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, base=10000): - super().__init__() - self.dim = dim - self.base = base - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - if importlib.util.find_spec("einops") is None: - raise RuntimeError("einops is required for Rotary Embedding") - - self._rotary_pos_emb_cache = None - self._seq_len_cached = 0 - self._ntk_alpha_cached = 1.0 - self._ntk_alpha_cached_list = [1.0] - - def update_rotary_pos_emb_cache(self, seqlen, ntk_alpha=1.0): - if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: - base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) - self.inv_freq = 1.0 / ( - base - ** ( - torch.arange(0, self.dim, 2, device=self.inv_freq.device).float() - / self.dim - ) - ) - self._seq_len_cached = max(2 * seqlen, 16) - self._ntk_alpha_cached = ntk_alpha - seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device) - freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq) - - emb = torch.cat((freqs, freqs), dim=-1) - from einops import rearrange - - emb = rearrange(emb, "n d -> 1 n 1 d") - - cos, sin = emb.cos(), emb.sin() - self._rotary_pos_emb_cache = [cos, sin] - - def forward(self, max_seq_len, ntk_alpha=1.0): - self.update_rotary_pos_emb_cache(max_seq_len, ntk_alpha) - cos, sin = self._rotary_pos_emb_cache - return [cos[:, :max_seq_len], sin[:, :max_seq_len]] - - -def _rotate_half(x): - from einops import rearrange - - x = rearrange(x, "... (j d) -> ... 
j d", j=2)
-    x1, x2 = x.unbind(dim=-2)
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(t, freqs):
-    """ Apply rotary embedding to the first rotary_dim of the input
-
-    Arguments:
-      t (tensor(batch_size, seq_len, n_head, head_dim)):
-        the input embedding/hidden states
-      freqs (list[tensor(1, seq_len, 1, rotary_dim), tensor(1, seq_len, 1, rotary_dim)]):
-        the cached cos/sin position embeddings
-    """
-    rot_dim = freqs[0].shape[-1]
-    cos, sin = freqs
-    t_float = t.float()
-    if apply_rotary_emb_func is not None and t.is_cuda:
-        # apply_rotary_emb in flash_attn requires cos/sin to be of
-        # shape (seqlen, rotary_dim / 2) and applies rotary embedding
-        # to the first rotary_dim of the input
-        cos = cos.squeeze(0).squeeze(1)[:, : rot_dim // 2]
-        sin = sin.squeeze(0).squeeze(1)[:, : rot_dim // 2]
-        return apply_rotary_emb_func(t_float, cos, sin).type_as(t)
-    else:
-        t_rot, t_pass = t_float[..., :rot_dim], t_float[..., rot_dim:]
-        t_rot = (t_rot * cos) + (_rotate_half(t_rot) * sin)
-        return torch.cat((t_rot, t_pass), dim=-1).type_as(t)
-
-
-class RMSNorm(torch.nn.Module):
-    def __init__(self, dim: int, eps: float = 1e-6):
-        super().__init__()
-        self.eps = eps
-        self.weight = nn.Parameter(torch.ones(dim))
-
-    def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
-    def forward(self, x):
-        if rms_norm is not None and x.is_cuda:
-            return rms_norm(x, self.weight, self.eps)
-        else:
-            output = self._norm(x.float()).type_as(x)
-            return output * self.weight
diff --git a/transformers/llm/export/llm_models/Qwen-1_8B/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-1_8B/modeling_qwen.py
deleted file mode 100755
index 5138eea76..000000000
--- a/transformers/llm/export/llm_models/Qwen-1_8B/modeling_qwen.py
+++ /dev/null
@@ -1,1406 +0,0 @@
-# Copyright (c) Alibaba Cloud.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
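# Illustrative sketch, not part of the deleted files: a minimal, self-contained
# re-statement of the rotary scheme implemented by _rotate_half and
# apply_rotary_pos_emb just above. The cos/sin caches have shape
# (1, seq_len, 1, rot_dim), channels are rotated in half-dimension pairs, and
# the rotation is norm-preserving. Nothing here is imported from the deleted
# modeling_qwen.py; shapes follow its docstring.
import torch

def rotate_half(x):
    # split the last dimension into two halves and rotate the pair: (a, b) -> (-b, a)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

seq_len, head_dim = 8, 16
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)   # (seq, dim/2)
emb = torch.cat((freqs, freqs), dim=-1)[None, :, None, :]      # (1, seq, 1, dim)
cos, sin = emb.cos(), emb.sin()

t = torch.randn(1, seq_len, 1, head_dim)                       # (batch, seq, head, dim)
t_rot = t * cos + rotate_half(t) * sin
# per-position rotation leaves the vector norm unchanged (up to float error)
assert torch.allclose(t.norm(dim=-1), t_rot.norm(dim=-1), atol=1e-5)
# End of sketch.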
- -import copy -import importlib -import math -import pathlib -from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -import warnings - -from torch.nn import CrossEntropyLoss -from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList -from transformers.generation.logits_process import LogitsProcessorList - -if TYPE_CHECKING: - from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import GenerateOutput -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging - -try: - from einops import rearrange -except ImportError: - rearrange = None -from torch import nn - -SUPPORT_CUDA = torch.cuda.is_available() -SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported() -SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7 -SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2 - - -from .configuration_qwen import QWenConfig -from .qwen_generation_utils import ( - HistoryType, - make_context, - decode_tokens, - get_stop_words_ids, - StopWordsLogitsProcessor, -) - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "qwen" -_CONFIG_FOR_DOC = "QWenConfig" - -QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"] - -_ERROR_BAD_CHAT_FORMAT = """\ -We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml". -If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat(). -我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。 -如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。 -""" - -_SENTINEL = object() -_ERROR_STREAM_IN_CHAT = """\ -Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True). -向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。 -""" - -_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED = """\ -We detect you have activated flash attention support, but running model computation on CPU. Please make sure that your input data has been placed on GPU. If you actually want to run CPU computation, please following the readme and set device_map="cpu" to disable flash attention when loading the model (calling AutoModelForCausalLM.from_pretrained). 
-检测到您的模型已激活了flash attention支持,但正在执行CPU运算任务。如使用flash attention,请您确认模型输入已经传到GPU上。如果您确认要执行CPU运算,请您在载入模型(调用AutoModelForCausalLM.from_pretrained)时,按照readme说法,指定device_map="cpu"以禁用flash attention。 -""" - -apply_rotary_emb_func = None -rms_norm = None -flash_attn_unpadded_func = None -flash_attn_func = None - -def _import_flash_attn(): - global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func, flash_attn_func - try: - from flash_attn.layers.rotary import apply_rotary_emb_func as __apply_rotary_emb_func - apply_rotary_emb_func = __apply_rotary_emb_func - except ImportError: - logger.warn( - "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary" - ) - - try: - from flash_attn.ops.rms_norm import rms_norm as __rms_norm - rms_norm = __rms_norm - except ImportError: - logger.warn( - "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm" - ) - - try: - import flash_attn - _flash_attn_func = None - if not hasattr(flash_attn, '__version__'): - from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func - else: - if int(flash_attn.__version__.split(".")[0]) >= 2: - if int(flash_attn.__version__.split(".")[1]) >= 1: - from flash_attn.flash_attn_interface import flash_attn_func as _flash_attn_func - from flash_attn.flash_attn_interface import flash_attn_varlen_func as __flash_attn_unpadded_func - else: - from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func - flash_attn_unpadded_func = __flash_attn_unpadded_func - flash_attn_func = _flash_attn_func - except ImportError: - logger.warn( - "Warning: import flash_attn fail, please install FlashAttention to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention" - ) - -def quantize_cache_v(fdata, bits, qmax, qmin): - # b, s, head, h-dim->b, head, s, h-dim - qtype = torch.uint8 - device = fdata.device - shape = fdata.shape - - fdata_cal = torch.flatten(fdata, 2) - fmax = torch.amax(fdata_cal, dim=-1, keepdim=True) - fmin = torch.amin(fdata_cal, dim=-1, keepdim=True) - # Compute params - if qmax.device != fmax.device: - qmax = qmax.to(device) - qmin = qmin.to(device) - scale = (fmax - fmin) / (qmax - qmin) - zero = qmin - fmin / scale - scale = scale.unsqueeze(-1).repeat(1,1,shape[2],1).contiguous() - zero = zero.unsqueeze(-1).repeat(1,1,shape[2],1).contiguous() - # Quantize - res_data = fdata / scale + zero - qdata = torch.clamp(res_data, qmin, qmax).to(qtype) - return qdata.contiguous(), scale, zero - -def dequantize_cache_torch(qdata, scale, zero): - data = scale * (qdata - zero) - return data - -class FlashSelfAttention(torch.nn.Module): - def __init__( - self, - causal=False, - softmax_scale=None, - attention_dropout=0.0, - ): - super().__init__() - assert flash_attn_unpadded_func is not None, ( - "Please install FlashAttention first, " "e.g., with pip install flash-attn" - ) - assert ( - rearrange is not None - ), "Please install einops first, e.g., with pip install einops" - self.causal = causal - self.softmax_scale = softmax_scale - self.dropout_p = attention_dropout - - def unpad_input(self, hidden_states, attention_mask): - valid_mask = attention_mask.squeeze(1).squeeze(1).eq(0) - seqlens_in_batch = valid_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(valid_mask.flatten(), 
as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - hidden_states = hidden_states[indices] - return hidden_states, indices, cu_seqlens, max_seqlen_in_batch - - def pad_input(self, hidden_states, indices, batch, seqlen): - output = torch.zeros(batch * seqlen, *hidden_states.shape[1:], device=hidden_states.device, - dtype=hidden_states.dtype) - output[indices] = hidden_states - return rearrange(output, '(b s) ... -> b s ...', b=batch) - - def forward(self, q, k, v, attention_mask=None): - assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v))) - assert all((i.is_cuda for i in (q, k, v))) - batch_size, seqlen_q = q.shape[0], q.shape[1] - seqlen_k = k.shape[1] - seqlen_out = seqlen_q - - if flash_attn_func is not None and batch_size == 1: - dropout_p = self.dropout_p if self.training else 0 - output = flash_attn_func(q, k, v, dropout_p, softmax_scale=self.softmax_scale, causal=self.causal) - return output - - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * seqlen_q, - step=seqlen_q, - dtype=torch.int32, - device=q.device, - ) - - if batch_size > 1 and attention_mask is not None: - k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask) - if q.size(0) == v.size(0): - q = q[indices_k] - cu_seqlens_q = cu_seqlens_k - seqlen_q = seqlen_k - v = v[indices_k] - else: - cu_seqlens_k = torch.arange( - 0, - (batch_size + 1) * seqlen_k, - step=seqlen_k, - dtype=torch.int32, - device=q.device, - ) - - if self.training: - assert seqlen_k == seqlen_q - is_causal = self.causal - dropout_p = self.dropout_p - else: - is_causal = seqlen_q == seqlen_k - dropout_p = 0 - - output = flash_attn_unpadded_func( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - seqlen_q, - seqlen_k, - dropout_p, - softmax_scale=self.softmax_scale, - causal=is_causal, - ) - if batch_size > 1 and attention_mask is not None and seqlen_q == seqlen_k: - output = self.pad_input(output, indices_k, batch_size, seqlen_out) - else: - new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:] - output = output.view(new_shape) - return output - - -class QWenAttention(nn.Module): - def __init__(self, config): - super().__init__() - - self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) - self.seq_length = config.seq_length - - self.hidden_size = config.hidden_size - self.split_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - - self.use_flash_attn = config.use_flash_attn - self.scale_attn_weights = True - - self.projection_size = config.kv_channels * config.num_attention_heads - - assert self.projection_size % config.num_attention_heads == 0 - self.hidden_size_per_attention_head = ( - self.projection_size // config.num_attention_heads - ) - - self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size) - - self.c_proj = nn.Linear( - config.hidden_size, self.projection_size, bias=not config.no_bias - ) - - self.is_fp32 = not (config.bf16 or config.fp16) - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - ): - self.core_attention_flash = FlashSelfAttention( - causal=True, attention_dropout=config.attn_dropout_prob - ) - self.bf16 = config.bf16 - - self.use_dynamic_ntk = config.use_dynamic_ntk - self.use_logn_attn = config.use_logn_attn - - logn_list = 
[ - math.log(i, self.seq_length) if i > self.seq_length else 1 - for i in range(1, 32768) - ] - logn_tensor = torch.tensor(logn_list)[None, :, None, None] - self.register_buffer("logn_tensor", logn_tensor, persistent=False) - - self.attn_dropout = nn.Dropout(config.attn_dropout_prob) - self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False - self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False - self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False - cache_dtype = torch.float - if self.bf16: - cache_dtype=torch.bfloat16 - elif config.fp16: - cache_dtype = torch.float16 - self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype) - self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype) - - if config.use_cache_quantization and config.use_cache_kernel: - # pre check if the support files existing - module_root = pathlib.Path(__file__).parent - src_files = ("cache_autogptq_cuda_256.cpp", "cache_autogptq_cuda_kernel_256.cu") - if any(not (module_root/src).is_file() for src in src_files): - warnings.warn("KV cache kernel source files (.cpp and .cu) not found.") - self.cache_kernels = None - else: - try: - from .cpp_kernels import cache_autogptq_cuda_256 - self.cache_kernels = cache_autogptq_cuda_256 - except ImportError: - warnings.warn("Failed to import KV cache kernels.") - self.cache_kernels = None - - def _attn(self, query, key, value, no_use_mask, attention_mask=None, head_mask=None): - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - attn_weights = attn_weights / torch.full( - [], - value.size(-1) ** 0.5, - dtype=attn_weights.dtype, - device=attn_weights.device, - ) - - query_length, key_length = query.size(-2), key.size(-2) - if attention_mask is None: - causal_mask = self.bias[ - :, :, key_length - query_length : key_length, :key_length - ] - else: - causal_mask = attention_mask - mask_value = torch.finfo(attn_weights.dtype).min - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to( - attn_weights.device - ) - attn_weights = torch.where( - causal_mask, attn_weights.to(attn_weights.dtype), mask_value - ) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - attn_output = attn_output.transpose(1, 2) - - return attn_output, attn_weights - - def __attn(self, query, key, value, causal_mask=None, attention_mask=None, head_mask=None): - device = query.device - if self.use_cache_quantization: - qk, qk_scale, qk_zero = key - if self.use_cache_kernel and self.cache_kernels is not None: - shape = query.shape[:-1] + (qk.shape[-2],) - attn_weights = torch.zeros(shape, dtype=torch.float16, device=device) - self.cache_kernels.vecquant8matmul_batched_faster_old( - query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(), - qk.transpose(-1, -2).contiguous(), - attn_weights, - qk_scale.contiguous() if qk_scale.dtype == torch.float16 else qk_scale.to(torch.float16).contiguous(), - qk_zero.contiguous()if qk_zero.dtype == torch.float16 else qk_zero.to(torch.float16).contiguous()) - # attn_weights = attn_weights.to(query.dtype).contiguous() - else: - key = dequantize_cache_torch(qk, qk_scale, 
qk_zero) - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - else: - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - if self.use_cache_quantization: - size_temp = value[0].size(-1) - else: - size_temp = value.size(-1) - attn_weights = attn_weights / (size_temp ** 0.5) - - mask_value = torch.finfo(attn_weights.dtype).min - if causal_mask is not None: - attn_weights = torch.where( - causal_mask, attn_weights.to(attn_weights.dtype), mask_value - ) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - if self.softmax_in_fp32: - attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1) - else: - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - attn_weights = attn_weights.type(query.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - if self.use_cache_quantization: - qv, qv_scale, qv_zero = value - if self.use_cache_kernel and self.cache_kernels is not None: - shape = attn_weights.shape[:-1] + (query.shape[-1],) - attn_output = torch.zeros(shape, dtype=torch.float16, device=device) - self.cache_kernels.vecquant8matmul_batched_column_compression_faster_old( - attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(), - qv.contiguous(), # dtype: int32 - attn_output, - qv_scale.contiguous() if qv_scale.dtype == torch.float16 else qv_scale.to(torch.float16).contiguous(), - qv_zero.contiguous() if qv_zero.dtype == torch.float16 else qv_zero.to(torch.float16).contiguous()) - if attn_output.dtype != query.dtype: - attn_output = attn_output.to(query.dtype) - attn_weights = attn_weights.to(query.dtype) - else: - value = dequantize_cache_torch(qv, qv_scale, qv_zero) - attn_output = torch.matmul(attn_weights, value) - else: - attn_output = torch.matmul(attn_weights, value) - - attn_output = attn_output.transpose(1, 2) - - return attn_output, attn_weights - - def _split_heads(self, tensor, num_heads, attn_head_size): - new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) - tensor = tensor.view(new_shape) - return tensor - - def _merge_heads(self, tensor, num_heads, attn_head_size): - tensor = tensor.contiguous() - new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) - return tensor.view(new_shape) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - rotary_pos_emb_list: Optional[List[torch.Tensor]] = None, - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ): - mixed_x_layer = self.c_attn(hidden_states) - - query, key, value = mixed_x_layer.split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - if rotary_pos_emb_list is not None: - cur_len = query.shape[1] - if True: - rotary_pos_emb = rotary_pos_emb_list - rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] - rotary_pos_emb = (rotary_pos_emb,) * 2 - q_pos_emb, k_pos_emb = rotary_pos_emb - # Slice the pos emb for current inference - query = apply_rotary_pos_emb(query, q_pos_emb) - key = 
apply_rotary_pos_emb(key, k_pos_emb) - else: - query_list = [] - key_list = [] - for i, rotary_pos_emb in enumerate(rotary_pos_emb_list): - rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] - rotary_pos_emb = (rotary_pos_emb,) * 2 - q_pos_emb, k_pos_emb = rotary_pos_emb - # Slice the pos emb for current inference - query_list += [apply_rotary_pos_emb(query[i:i+1, :, :], q_pos_emb)] - key_list += [apply_rotary_pos_emb(key[i:i+1, :, :], k_pos_emb)] - query = torch.cat(query_list, dim=0) - key = torch.cat(key_list, dim=0) - - if self.use_cache_quantization: - key = quantize_cache_v(key.permute(0, 2, 1, 3), - bits=8, - qmin=self.cache_qmin, - qmax=self.cache_qmax) - value = quantize_cache_v(value.permute(0, 2, 1, 3), - bits=8, - qmin=self.cache_qmin, - qmax=self.cache_qmax) - - - if layer_past is not None: - past_key, past_value = layer_past[0], layer_past[1] - if self.use_cache_quantization: - # use_cache_quantization: - # present=((q_key,key_scale,key_zero_point), - # (q_value,value_scale,value_zero_point)) - key = (torch.cat((past_key[0], key[0]), dim=2), - torch.cat((past_key[1], key[1]), dim=2), - torch.cat((past_key[2], key[2]), dim=2)) - value = (torch.cat((past_value[0], value[0]), dim=2), - torch.cat((past_value[1], value[1]), dim=2), - torch.cat((past_value[2], value[2]), dim=2)) - else: - # not use_cache_quantization: - # present=(key,value) - key = torch.cat((past_key, key), dim=1) - value = torch.cat((past_value, value), dim=1) - - if use_cache: - present = (key, value) - else: - present = None - - key_size = key[0].size(2) if self.use_cache_quantization else key.size(1) - if key_size > self.seq_length and self.use_logn_attn and not self.training: - if self.use_cache_quantization: - seq_start = key[0].size(2) - query.size(1) - seq_end = key[0].size(2) - else: - seq_start = key.size(1) - query.size(1) - seq_end = key.size(1) - logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query) - query = query * logn_tensor.expand_as(query) - - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - and query.is_cuda - ): - q, k, v = query, key, value - attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask) - else: - key_size = key[0].size(2) if self.use_cache_quantization else key.size(1) - if query.size(1) == key_size: - causal_mask = torch.tril( - torch.ones((key_size, key_size), dtype=torch.bool, device=query.device) - ).view(1, 1, key_size, key_size) - else: - causal_mask = None - query = query.permute(0, 2, 1, 3) - if not self.use_cache_quantization: - key = key.permute(0, 2, 1, 3) - value = value.permute(0, 2, 1, 3) - if ( - causal_mask is None - and self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - and not query.is_cuda - ): - raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED) - - if not self.use_cache_quantization and SUPPORT_TORCH2 and False: - if attention_mask is not None: - # attention_mask = attention_mask.expand( - # -1, -1, causal_mask.size(2), -1 - # ) - # if causal_mask is not None: - # attention_mask.masked_fill(~causal_mask, torch.finfo(query.dtype).min) - causal_mask = attention_mask - else: - attention_mask = causal_mask - attn_output = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask - ).transpose(1, 2) - attn_weight = None - else: - attn_output, attn_weight = self._attn( - # query, key, value, causal_mask, attention_mask, head_mask - query, key, value, attention_mask, attention_mask, head_mask - 
) - context_layer = self._merge_heads( - attn_output, self.num_heads, self.head_dim - ) - - attn_output = self.c_proj(context_layer) - - outputs = (attn_output, present) - if output_attentions: - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - ): - raise ValueError("Cannot output attentions while using flash-attn") - elif not self.use_cache_quantization and SUPPORT_TORCH2: - raise ValueError("Cannot output attentions while using scaled_dot_product_attention") - else: - outputs += (attn_weight,) - - return outputs - - -class QWenMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.w1 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - self.w2 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - ff_dim_in = config.intermediate_size // 2 - self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias) - - def forward(self, hidden_states): - a1 = self.w1(hidden_states) - a2 = self.w2(hidden_states) - intermediate_parallel = a1 * F.silu(a2) - output = self.c_proj(intermediate_parallel) - return output - - -class QWenBlock(nn.Module): - def __init__(self, config): - super().__init__() - hidden_size = config.hidden_size - self.bf16 = config.bf16 - - self.ln_1 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - self.attn = QWenAttention(config) - self.ln_2 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - - self.mlp = QWenMLP(config) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - rotary_pos_emb: Optional[List[List[torch.Tensor]]] = None, - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ): - layernorm_output = self.ln_1(hidden_states) - - attn_outputs = self.attn( - layernorm_output, - rotary_pos_emb, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - attn_output = attn_outputs[0] - - outputs = attn_outputs[1:] - - residual = hidden_states - layernorm_input = attn_output + residual - - layernorm_output = self.ln_2(layernorm_input) - - residual = layernorm_input - mlp_output = self.mlp(layernorm_output) - hidden_states = residual + mlp_output - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs - - -class QWenPreTrainedModel(PreTrainedModel): - config_class = QWenConfig - base_model_prefix = "transformer" - is_parallelizable = False - supports_gradient_checkpointing = True - _no_split_modules = ["QWenBlock"] - _skip_keys_device_placement = "past_key_values" - - def __init__(self, *inputs, **kwargs): - super().__init__(*inputs, **kwargs) - - def _init_weights(self, module): - """Initialize the weights.""" - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, RMSNorm): 
- module.weight.data.fill_(1.0) - - for name, p in module.named_parameters(): - if name == "c_proj.weight": - p.data.normal_( - mean=0.0, - std=( - self.config.initializer_range - / math.sqrt(2 * self.config.num_hidden_layers) - ), - ) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, QWenModel): - module.gradient_checkpointing = value - - -class QWenModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - - def __init__(self, config): - super().__init__(config) - self.vocab_size = config.vocab_size - self.num_hidden_layers = config.num_hidden_layers - self.embed_dim = config.hidden_size - self.use_cache_quantization = self.config.use_cache_quantization if hasattr(self.config, 'use_cache_quantization') else False - - self.gradient_checkpointing = False - self.use_dynamic_ntk = config.use_dynamic_ntk - self.seq_length = config.seq_length - - self.wte = nn.Embedding(self.vocab_size, self.embed_dim) - - self.drop = nn.Dropout(config.emb_dropout_prob) - - if config.rotary_pct == 1.0: - self.rotary_ndims = None - else: - assert config.rotary_pct < 1 - self.rotary_ndims = int( - config.kv_channels * config.rotary_pct - ) - dim = ( - self.rotary_ndims - if self.rotary_ndims is not None - else config.kv_channels - ) - self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base) - - self.use_flash_attn = config.use_flash_attn - self.is_fp32 = not (config.bf16 or config.fp16) - - self.h = nn.ModuleList( - [ - QWenBlock( - config - ) - for i in range(config.num_hidden_layers) - ] - ) - self.ln_f = RMSNorm( - self.embed_dim, - eps=config.layer_norm_epsilon, - ) - - self.post_init() - - def get_input_embeddings(self): - return self.wte - - def set_input_embeddings(self, new_embeddings): - self.wte = new_embeddings - - def get_ntk_alpha(self, true_seq_len): - context_value = math.log(true_seq_len / self.seq_length, 2) + 1 - ntk_alpha = 2 ** math.ceil(context_value) - 1 - ntk_alpha = max(ntk_alpha, 1) - return ntk_alpha - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - 
raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - if self.use_cache_quantization: - past_length = past_key_values[0][0][0].size(2) - else: - past_length = past_key_values[0][0].size(-2) - if position_ids is None: - position_ids = torch.arange( - past_length, - input_shape[-1] + past_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - attention_mask = attention_mask[:, None, None, :] - attention_mask = attention_mask.to(dtype=self.dtype) - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - encoder_attention_mask = None - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - hidden_states = inputs_embeds - - kv_seq_len = hidden_states.size()[1] - if past_key_values[0] is not None: - # past key values[0][0] shape: bs * seq_len * head_num * dim - if self.use_cache_quantization: - kv_seq_len += past_key_values[0][0][0].shape[2] - else: - kv_seq_len += past_key_values[0][0].shape[1] - - if self.training or not self.use_dynamic_ntk: - ntk_alpha_list = [1.0] - elif kv_seq_len != hidden_states.size()[1]: - ntk_alpha_list = self.rotary_emb._ntk_alpha_cached_list - else: - ntk_alpha_list = [] - if attention_mask is not None and kv_seq_len > self.seq_length: - true_seq_lens = attention_mask.squeeze(1).squeeze(1).eq(0).sum(dim=-1, dtype=torch.int32) - for i in range(hidden_states.size()[0]): - true_seq_len = true_seq_lens[i].item() - ntk_alpha = self.get_ntk_alpha(true_seq_len) - ntk_alpha_list.append(ntk_alpha) - else: - ntk_alpha = self.get_ntk_alpha(kv_seq_len) - ntk_alpha_list.append(ntk_alpha) - self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list - rotary_pos_emb_list = [ - self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha) for ntk_alpha in ntk_alpha_list - ] - - hidden_states = self.drop(hidden_states) - output_shape = input_shape + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - rotary_pos_emb_list, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - rotary_pos_emb=rotary_pos_emb_list, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - hidden_states = self.ln_f(hidden_states) - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v for v in [hidden_states, presents, all_hidden_states] if v is not None - ) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class QWenLMHeadModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"] - - def __init__(self, config): - super().__init__(config) - assert ( - config.bf16 + config.fp16 + config.fp32 <= 1 - ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true" - - autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0 - - if autoset_precision: - if SUPPORT_BF16: - logger.warn( - "The model is automatically converting to bf16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." - ) - config.bf16 = True - elif SUPPORT_FP16: - logger.warn( - "The model is automatically converting to fp16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." 
- ) - config.fp16 = True - else: - config.fp32 = True - - if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16: - logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".") - if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16: - logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster") - if config.fp32: - if SUPPORT_BF16: - logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".") - elif SUPPORT_FP16: - logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".") - - if config.use_flash_attn == "auto": - if config.bf16 or config.fp16: - logger.warn("Try importing flash-attention for faster inference...") - config.use_flash_attn = True - else: - config.use_flash_attn = False - if config.use_flash_attn and config.fp32: - logger.warn("Flash attention will be disabled because it does NOT support fp32.") - - if config.use_flash_attn: - _import_flash_attn() - - self.transformer = QWenModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - if config.bf16: - self.transformer.bfloat16() - self.lm_head.bfloat16() - if config.fp16: - self.transformer.half() - self.lm_head.half() - self.post_init() - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1].unsqueeze(-1) - - if input_ids.size(0) == 1: - attention_mask = None - else: - attention_mask = kwargs.get("attention_mask", None) - - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - 
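# Illustrative sketch, not part of the deleted modeling_qwen.py: why
# prepare_inputs_for_generation() above keeps only the last token once
# past_key_values exist. With a KV cache, each decode step only needs the
# newest token; earlier keys/values are reused. A minimal greedy loop against
# the standard Hugging Face causal-LM interface (`model` and `tokenizer` are
# placeholders for an already-loaded model/tokenizer pair):
import torch

@torch.no_grad()
def greedy_decode(model, tokenizer, prompt, max_new_tokens=16):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    generated = input_ids
    past = None
    for _ in range(max_new_tokens):
        out = model(input_ids=input_ids, past_key_values=past, use_cache=True)
        past = out.past_key_values                            # reuse the cache
        next_id = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        generated = torch.cat([generated, next_id], dim=-1)
        input_ids = next_id                                   # feed only the new token
    return tokenizer.decode(generated[0], skip_special_tokens=True)
# End of sketch.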
hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - labels = labels.to(lm_logits.device) - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) - ) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor - ) -> Tuple[Tuple[torch.Tensor]]: - - return tuple( - tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past - ) - for layer_past in past_key_values - ) - - def chat( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - stream: Optional[bool] = _SENTINEL, - stop_words_ids: Optional[List[List[int]]] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Tuple[str, HistoryType]: - generation_config = generation_config if generation_config is not None else self.generation_config - - assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - else: - # make a copy of the user's input such that is is left untouched - history = copy.deepcopy(history) - - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - input_ids = torch.tensor([context_tokens]).to(self.device) - outputs = self.generate( - input_ids, - stop_words_ids=stop_words_ids, - return_dict_in_generate=False, - generation_config=generation_config, - **kwargs, - ) - - response = decode_tokens( - outputs[0], - tokenizer, - raw_text_len=len(raw_text), - context_length=len(context_tokens), - chat_format=generation_config.chat_format, - verbose=False, - errors='replace' - ) - - # as history is a copy of the user inputs, - # we can always return the new turn to the user. 
- # separating input history and output history also enables the user - # to implement more complex history management - history.append((query, response)) - - return response, history - - def chat_stream( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - stop_words_ids: Optional[List[List[int]]] = None, - logits_processor: Optional[LogitsProcessorList] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Generator[str, Any, None]: - generation_config = generation_config if generation_config is not None else self.generation_config - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - input_ids = torch.tensor([context_tokens]).to(self.device) - - from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig - self.__class__.generate_stream = NewGenerationMixin.generate - self.__class__.sample_stream = NewGenerationMixin.sample_stream - stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True) - - def stream_generator(): - outputs = [] - for token in self.generate_stream( - input_ids, - return_dict_in_generate=False, - generation_config=stream_config, - logits_processor=logits_processor, - seed=-1, - **kwargs): - outputs.append(token.item()) - yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore') - - return stream_generator() - - def generate( - self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[ - Callable[[int, torch.Tensor], List[int]] - ] = None, - synced_gpus: Optional[bool] = None, - assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - generation_config = generation_config if generation_config is not None else self.generation_config - - # Process stop_words_ids. 
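# Illustrative sketch, not part of the deleted modeling_qwen.py: the code that
# resumes right below this comment appends a StopWordsLogitsProcessor (defined
# in qwen_generation_utils, not reproduced here) to the LogitsProcessorList.
# As a deliberately simplified stand-in for the same idea only, a processor can
# force EOS as soon as the generated ids end with a stop sequence:
import torch
from transformers import LogitsProcessor

class ForceEosOnStopWords(LogitsProcessor):
    def __init__(self, stop_words_ids, eos_token_id):
        self.stop_words_ids = [torch.tensor(w) for w in stop_words_ids]
        self.eos_token_id = eos_token_id

    def __call__(self, input_ids, scores):
        for i in range(input_ids.shape[0]):
            for stop in self.stop_words_ids:
                n = stop.numel()
                if input_ids.shape[1] >= n and torch.equal(input_ids[i, -n:].cpu(), stop):
                    # keep only EOS with a finite score for this sequence
                    scores[i, :] = torch.finfo(scores.dtype).min
                    scores[i, self.eos_token_id] = 0.0
        return scores
# End of sketch.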
- stop_words_ids = kwargs.pop("stop_words_ids", None) - if stop_words_ids is None and generation_config is not None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - if stop_words_ids is None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - - return super().generate( - inputs, - generation_config=generation_config, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - synced_gpus=synced_gpus, - assistant_model=assistant_model, - streamer=streamer, - **kwargs, - ) - - -class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, base=10000): - super().__init__() - self.dim = dim - self.base = base - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - if importlib.util.find_spec("einops") is None: - raise RuntimeError("einops is required for Rotary Embedding") - - self._rotary_pos_emb_cache = None - self._seq_len_cached = 0 - self._ntk_alpha_cached = 1.0 - self._ntk_alpha_cached_list = [1.0] - - def update_rotary_pos_emb_cache(self, seqlen, ntk_alpha=1.0): - if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: - base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) - self.inv_freq = 1.0 / ( - base - ** ( - torch.arange(0, self.dim, 2, device=self.inv_freq.device).float() - / self.dim - ) - ) - self._seq_len_cached = max(2 * seqlen, 16) - self._ntk_alpha_cached = ntk_alpha - seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device) - freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq) - - emb = torch.cat((freqs, freqs), dim=-1) - from einops import rearrange - - emb = rearrange(emb, "n d -> 1 n 1 d") - - cos, sin = emb.cos(), emb.sin() - self._rotary_pos_emb_cache = [cos, sin] - - def forward(self, max_seq_len, ntk_alpha=1.0): - self.update_rotary_pos_emb_cache(max_seq_len, ntk_alpha) - cos, sin = self._rotary_pos_emb_cache - return [cos[:, :max_seq_len], sin[:, :max_seq_len]] - - -def _rotate_half(x): - from einops import rearrange - - x = rearrange(x, "... (j d) -> ... 
j d", j=2) - x1, x2 = x.unbind(dim=-2) - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(t, freqs): - """ Apply rotary embedding to the first rotary_dim of the iput - - Arguments: - t (tensor(batch_size, seq_len, n_head, head_dim)): - the input embedding/hidden states - freqs (list[tensor(1, seq_len, 1, rotary_dim), tensor(1, seq_len, 1, rotary_dim)]): - the cached cos/sin position embeddings - """ - rot_dim = freqs[0].shape[-1] - cos, sin = freqs - t_float = t.float() - if apply_rotary_emb_func is not None and t.is_cuda: - # apply_rotary_emb in flash_attn requires cos/sin to be of - # shape (seqlen, rotary_dim / 2) and apply rotary embedding - # to the first rotary_dim of the input - cos = cos.squeeze(0).squeeze(1)[:, : rot_dim // 2] - sin = sin.squeeze(0).squeeze(1)[:, : rot_dim // 2] - return apply_rotary_emb_func(t_float, cos, sin).type_as(t) - else: - t_rot, t_pass = t_float[..., :rot_dim], t_float[..., rot_dim:] - t_rot = (t_rot * cos) + (_rotate_half(t_rot) * sin) - return torch.cat((t_rot, t_pass), dim=-1).type_as(t) - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - if rms_norm is not None and x.is_cuda: - return rms_norm(x, self.weight, self.eps) - else: - output = self._norm(x.float()).type_as(x) - return output * self.weight diff --git a/transformers/llm/export/llm_models/Qwen-7B-Chat/config.json b/transformers/llm/export/llm_models/Qwen-7B-Chat/config.json deleted file mode 100644 index 2a794d958..000000000 --- a/transformers/llm/export/llm_models/Qwen-7B-Chat/config.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "architectures": [ - "QWenLMHeadModel" - ], - "auto_map": { - "AutoConfig": "configuration_qwen.QWenConfig", - "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel" - }, - "attn_dropout_prob": 0.0, - "bf16": false, - "fp16": false, - "fp32": false, - "emb_dropout_prob": 0.0, - "intermediate_size": 22016, - "initializer_range": 0.02, - "kv_channels": 128, - "layer_norm_epsilon": 1e-06, - "model_type": "qwen", - "hidden_size": 4096, - "num_attention_heads": 32, - "num_hidden_layers": 32, - "max_position_embeddings": 8192, - "no_bias": true, - "onnx_safe": null, - "rotary_emb_base": 10000, - "rotary_pct": 1.0, - "scale_attn_weights": true, - "seq_length": 2048, - "tie_word_embeddings": false, - "tokenizer_type": "QWenTokenizer", - "transformers_version": "4.31.0", - "use_cache": true, - "use_flash_attn": "auto", - "vocab_size": 151936, - "use_dynamic_ntk": true, - "use_logn_attn": false -} diff --git a/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py deleted file mode 100644 index 698486f6f..000000000 --- a/transformers/llm/export/llm_models/Qwen-7B-Chat/modeling_qwen.py +++ /dev/null @@ -1,1199 +0,0 @@ -# Copyright (c) Alibaba Cloud. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
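# Illustrative sketch, not part of the deleted files: how the config.json above
# ties into this modeling file. Its "auto_map" entry points AutoConfig and
# AutoModelForCausalLM at configuration_qwen.QWenConfig and
# modeling_qwen.QWenLMHeadModel, so loading the checkpoint directory with
# trust_remote_code=True instantiates the classes defined below. The local
# path is a placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./Qwen-7B-Chat"  # placeholder checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True).eval()
response, history = model.chat(tokenizer, "Hello!", history=None)
# End of sketch.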
- -import importlib -import math -from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch.cuda.amp import autocast - -from torch.nn import CrossEntropyLoss -from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList -from transformers.generation.logits_process import LogitsProcessorList - -if TYPE_CHECKING: - from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import GenerateOutput -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging - -try: - from einops import rearrange -except ImportError: - rearrange = None -from torch import nn - -SUPPORT_CUDA = torch.cuda.is_available() -SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported() -SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7 - -from .configuration_qwen import QWenConfig -from .qwen_generation_utils import ( - HistoryType, - make_context, - decode_tokens, - get_stop_words_ids, - StopWordsLogitsProcessor, -) - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "qwen" -_CONFIG_FOR_DOC = "QWenConfig" - -QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"] - -_ERROR_BAD_CHAT_FORMAT = """\ -We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml". -If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat(). -我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。 -如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。 -""" - -_SENTINEL = object() -_ERROR_STREAM_IN_CHAT = """\ -Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True). 
-向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。 -""" - -apply_rotary_emb_func = None -rms_norm = None -flash_attn_unpadded_func = None - - -def _import_flash_attn(): - global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func - try: - from flash_attn.layers.rotary import apply_rotary_emb_func as __apply_rotary_emb_func - apply_rotary_emb_func = __apply_rotary_emb_func - except ImportError: - logger.warn( - "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary" - ) - - try: - from flash_attn.ops.rms_norm import rms_norm as __rms_norm - rms_norm = __rms_norm - except ImportError: - logger.warn( - "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm" - ) - - try: - import flash_attn - if not hasattr(flash_attn, '__version__'): - from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func - else: - if int(flash_attn.__version__.split(".")[0]) >= 2: - from flash_attn.flash_attn_interface import flash_attn_varlen_func as __flash_attn_unpadded_func - else: - from flash_attn.flash_attn_interface import flash_attn_unpadded_func as __flash_attn_unpadded_func - flash_attn_unpadded_func = __flash_attn_unpadded_func - except ImportError: - logger.warn( - "Warning: import flash_attn fail, please install FlashAttention to get higher efficiency " - "https://github.com/Dao-AILab/flash-attention" - ) - - -class FlashSelfAttention(torch.nn.Module): - def __init__( - self, - causal=False, - softmax_scale=None, - attention_dropout=0.0, - ): - super().__init__() - assert flash_attn_unpadded_func is not None, ( - "Please install FlashAttention first, " "e.g., with pip install flash-attn" - ) - assert ( - rearrange is not None - ), "Please install einops first, e.g., with pip install einops" - self.causal = causal - self.softmax_scale = softmax_scale - self.dropout_p = attention_dropout - - def forward(self, q, k, v): - assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v))) - assert all((i.is_cuda for i in (q, k, v))) - batch_size, seqlen_q = q.shape[0], q.shape[1] - seqlen_k = k.shape[1] - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] - cu_seqlens_q = torch.arange( - 0, - (batch_size + 1) * seqlen_q, - step=seqlen_q, - dtype=torch.int32, - device=q.device, - ) - - if self.training: - assert seqlen_k == seqlen_q - - is_causal = self.causal - cu_seqlens_k = cu_seqlens_q - else: - is_causal = seqlen_q == seqlen_k - cu_seqlens_k = torch.arange( - 0, - (batch_size + 1) * seqlen_k, - step=seqlen_k, - dtype=torch.int32, - device=q.device, - ) - self.dropout_p = 0 - output = flash_attn_unpadded_func( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - seqlen_q, - seqlen_k, - self.dropout_p, - softmax_scale=self.softmax_scale, - causal=is_causal, - ) - - output = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) - return output - - -class QWenAttention(nn.Module): - def __init__(self, config): - super().__init__() - - max_positions = config.max_position_embeddings - self.register_buffer( - "bias", - torch.tril( - torch.ones((max_positions, max_positions), dtype=torch.bool) - ).view(1, 1, max_positions, max_positions), - persistent=False, - ) - self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) - self.seq_length = config.seq_length - - self.hidden_size = config.hidden_size - self.split_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - - self.use_flash_attn = config.use_flash_attn - self.scale_attn_weights = True - - self.projection_size = config.kv_channels * config.num_attention_heads - - assert self.projection_size % config.num_attention_heads == 0 - self.hidden_size_per_attention_head = ( - self.projection_size // config.num_attention_heads - ) - - self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size) - - self.c_proj = nn.Linear( - config.hidden_size, self.projection_size, bias=not config.no_bias - ) - - self.is_fp32 = not (config.bf16 or config.fp16) - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - ): - self.core_attention_flash = FlashSelfAttention( - causal=True, attention_dropout=config.attn_dropout_prob - ) - - self.bf16 = config.bf16 - - if config.rotary_pct == 1.0: - self.rotary_ndims = None - else: - assert config.rotary_pct < 1 - self.rotary_ndims = int( - self.hidden_size_per_attention_head * config.rotary_pct - ) - dim = ( - self.rotary_ndims - if self.rotary_ndims is not None - else self.hidden_size_per_attention_head - ) - self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base) - - self.use_dynamic_ntk = config.use_dynamic_ntk - self.use_logn_attn = config.use_logn_attn - - logn_list = [ - math.log(i, self.seq_length) if i > self.seq_length else 1 - for i in range(1, 32768) - ] - self.logn_tensor = torch.tensor(logn_list)[None, :, None, None] - self._ntk_cached = 1.0 - - self.attn_dropout = nn.Dropout(config.attn_dropout_prob) - - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - attn_weights = attn_weights / math.sqrt(self.head_dim) - - query_length, key_length = query.size(-2), key.size(-2) - if attention_mask is None: - causal_mask = self.bias[ - :, :, key_length - query_length : key_length, :key_length - ] - else: - causal_mask = attention_mask - mask_value = torch.finfo(attn_weights.dtype).min - mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to( - attn_weights.device - ) - attn_weights = torch.where( - causal_mask, attn_weights.to(attn_weights.dtype), mask_value - ) - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - attn_output = attn_output.transpose(1, 2) - - return attn_output, attn_weights - - def _upcast_and_reordered_attn( - self, query, key, value, attention_mask=None, head_mask=None - ): - bsz, num_heads, q_seq_len, dk = query.size() - _, _, k_seq_len, _ = key.size() - - attn_weights = torch.empty( - bsz * num_heads, - q_seq_len, - k_seq_len, - dtype=torch.float32, - device=query.device, - ) - - 
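# Illustrative sketch, not part of the deleted modeling_qwen.py: why _attn()
# above slices the registered lower-triangular "bias" buffer as
# bias[:, :, key_length - query_length : key_length, :key_length]. During
# incremental decoding the query covers only the newest tokens, so the mask
# rows must be aligned to the end of the key axis. Tiny standalone check:
import torch

max_positions = 8
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool))
bias = bias.view(1, 1, max_positions, max_positions)

query_length, key_length = 1, 5          # one new token attending over 5 cached keys
causal_mask = bias[:, :, key_length - query_length : key_length, :key_length]
assert causal_mask.shape == (1, 1, 1, 5)
assert causal_mask.all()                 # the new token may attend to every past position

query_length, key_length = 5, 5          # prefill: full lower-triangular mask
causal_mask = bias[:, :, key_length - query_length : key_length, :key_length]
assert torch.equal(causal_mask[0, 0], torch.tril(torch.ones(5, 5, dtype=torch.bool)))
# End of sketch.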
scale_factor = 1.0 - if self.scale_attn_weights: - scale_factor /= float(value.size(-1)) ** 0.5 - - with autocast(enabled=False): - q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape( - -1, dk, k_seq_len - ) - attn_weights = torch.baddbmm( - attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor - ) - attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) - - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = self.bias[ - :, :, key_length - query_length : key_length, :key_length - ] - mask_value = torch.finfo(attn_weights.dtype).min - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to( - attn_weights.device - ) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if attn_weights.dtype != torch.float32: - raise RuntimeError( - "Error with upcasting, attn_weights does not have dtype torch.float32" - ) - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def _split_heads(self, tensor, num_heads, attn_head_size): - new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) - tensor = tensor.view(new_shape) - return tensor - - def _merge_heads(self, tensor, num_heads, attn_head_size): - tensor = tensor.contiguous() - new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) - return tensor.view(new_shape) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ): - mixed_x_layer = self.c_attn(hidden_states) - query, key, value = mixed_x_layer.split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - kv_seq_len = hidden_states.size()[1] - if layer_past is not None: - # layer past[0] shape: bs * seq_len * head_num * dim - kv_seq_len += layer_past[0].shape[1] - if ( - self.use_dynamic_ntk - and kv_seq_len == hidden_states.size()[1] - and not self.training - ): - context_value = math.log(kv_seq_len / self.seq_length, 2) + 1 - ntk_alpha = 2 ** math.ceil(context_value) - 1 - ntk_alpha = max(ntk_alpha, 1) - self._ntk_cached = ntk_alpha - else: - ntk_alpha = self._ntk_cached - if rotary_pos_emb is None: - rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to( - hidden_states.device - ) - cur_len = query.shape[1] - rotary_pos_emb = rotary_pos_emb[:, -cur_len:, :, :] - - if rotary_pos_emb is not None and False: - if isinstance(rotary_pos_emb, tuple): - rotary_pos_emb = rotary_pos_emb - else: - rotary_pos_emb = (rotary_pos_emb,) * 2 - - if rotary_pos_emb is not None: - ''' - q_pos_emb, k_pos_emb = rotary_pos_emb - # Slice the pos emb for current inference - cur_len = query.shape[1] - q_pos_emb = q_pos_emb[:, -cur_len:, :, :] - k_pos_emb = 
k_pos_emb[:, -cur_len:, :, :] - ''' - query = apply_rotary_pos_emb(query, rotary_pos_emb) - key = apply_rotary_pos_emb(key, rotary_pos_emb) - - if layer_past is not None: - past_key, past_value = layer_past[0], layer_past[1] - key = torch.cat((past_key, key), dim=1) - value = torch.cat((past_value, value), dim=1) - - if use_cache: - present = torch.stack((key, value)) - else: - present = None - - if self.use_logn_attn and not self.training: - if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype: - self.logn_tensor = self.logn_tensor.to(query.device).type_as(query) - seq_start = key.size(1) - query.size(1) - seq_end = key.size(1) - logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :] - query = query * logn_tensor.expand_as(query) - - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - and query.is_cuda - ): - q, k, v = query, key, value - context_layer = self.core_attention_flash(q, k, v) - - context_layer = rearrange( - context_layer, "b s h d -> b s (h d)" - ).contiguous() - else: - query = query.permute(0, 2, 1, 3) - key = key.permute(0, 2, 1, 3) - value = value.permute(0, 2, 1, 3) - attn_output, attn_weight = self._attn( - query, key, value, attention_mask, head_mask - ) - context_layer = self._merge_heads( - attn_output, self.num_heads, self.head_dim - ) - - attn_output = self.c_proj(context_layer) - outputs = (attn_output, present) - if output_attentions: - if ( - self.use_flash_attn - and flash_attn_unpadded_func is not None - and not self.is_fp32 - ): - raise ValueError("Cannot output attentions while using flash-attn") - else: - outputs += (attn_weight,) - - return outputs - - -class QWenMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.w1 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - self.w2 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - ff_dim_in = config.intermediate_size // 2 - self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias) - - def forward(self, hidden_states): - a1 = self.w1(hidden_states) - a2 = self.w2(hidden_states) - intermediate_parallel = a1 * F.silu(a2) - output = self.c_proj(intermediate_parallel) - return output - - -class QWenBlock(nn.Module): - def __init__(self, config): - super().__init__() - hidden_size = config.hidden_size - self.bf16 = config.bf16 - - self.ln_1 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - self.attn = QWenAttention(config) - self.ln_2 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - - self.mlp = QWenMLP(config) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ): - layernorm_output = self.ln_1(hidden_states) - - attn_outputs = self.attn( - layernorm_output, - layer_past=layer_past, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - attn_output = attn_outputs[0] - - outputs = attn_outputs[1:] - - residual = hidden_states - layernorm_input = 
attn_output + residual - - layernorm_output = self.ln_2(layernorm_input) - - residual = layernorm_input - mlp_output = self.mlp(layernorm_output) - hidden_states = residual + mlp_output - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs - - -class QWenPreTrainedModel(PreTrainedModel): - config_class = QWenConfig - base_model_prefix = "transformer" - is_parallelizable = False - supports_gradient_checkpointing = True - _no_split_modules = ["QWenBlock"] - - def __init__(self, *inputs, **kwargs): - super().__init__(*inputs, **kwargs) - - def _init_weights(self, module): - """Initialize the weights.""" - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, RMSNorm): - module.weight.data.fill_(1.0) - - for name, p in module.named_parameters(): - if name == "c_proj.weight": - p.data.normal_( - mean=0.0, - std=( - self.config.initializer_range - / math.sqrt(2 * self.config.num_hidden_layers) - ), - ) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, QWenModel): - module.gradient_checkpointing = value - - -class QWenModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - - def __init__(self, config): - super().__init__(config) - self.vocab_size = config.vocab_size - self.num_hidden_layers = config.num_hidden_layers - self.embed_dim = config.hidden_size - - self.gradient_checkpointing = False - - self.wte = nn.Embedding(self.vocab_size, self.embed_dim) - - self.drop = nn.Dropout(config.emb_dropout_prob) - self.h = nn.ModuleList( - [ - QWenBlock( - config, - ) - for i in range(config.num_hidden_layers) - ] - ) - self.ln_f = RMSNorm( - self.embed_dim, - eps=config.layer_norm_epsilon, - ) - - self.post_init() - - def get_input_embeddings(self): - return self.wte - - def set_input_embeddings(self, new_embeddings): - self.wte = new_embeddings - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = 
input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - position_ids = torch.arange( - past_length, - input_shape[-1] + past_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - if attention_mask is not None: - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = attention_mask.view(batch_size, -1) - attention_mask = attention_mask[:, None, None, :] - attention_mask = attention_mask.to(dtype=self.dtype) - attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min - - encoder_attention_mask = None - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - hidden_states = inputs_embeds - - hidden_states = self.drop(hidden_states) - output_shape = input_shape + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[1],) - - hidden_states = self.ln_f(hidden_states) - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v for v in [hidden_states, presents, all_hidden_states] if v is not None - ) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class QWenLMHeadModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"] - - def __init__(self, config): - super().__init__(config) - assert ( - config.bf16 + config.fp16 + config.fp32 <= 1 - ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true" - - autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0 - - if autoset_precision: - if SUPPORT_BF16: - logger.warn( - "The model is automatically converting to bf16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." - ) - config.bf16 = True - elif SUPPORT_FP16: - logger.warn( - "The model is automatically converting to fp16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." 
- ) - config.fp16 = True - else: - config.fp32 = True - - if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16: - logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".") - if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16: - logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster") - if config.fp32: - if SUPPORT_BF16: - logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".") - elif SUPPORT_FP16: - logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".") - - if config.use_flash_attn == "auto": - if config.bf16 or config.fp16: - logger.warn("Try importing flash-attention for faster inference...") - config.use_flash_attn = True - else: - config.use_flash_attn = False - if config.use_flash_attn and config.fp32: - logger.warn("Flash attention will be disabled because it does NOT support fp32.") - - if config.use_flash_attn: - _import_flash_attn() - - self.transformer = QWenModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - if config.bf16: - self.transformer.bfloat16() - self.lm_head.bfloat16() - if config.fp16: - self.transformer.half() - self.lm_head.half() - self.post_init() - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs - ): - token_type_ids = kwargs.get("token_type_ids", None) - if past_key_values: - input_ids = input_ids[:, -1].unsqueeze(-1) - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -1].unsqueeze(-1) - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - else: - position_ids = None - - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - 
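# --- Editor's illustrative aside (not part of the deleted file) ------------------------
# The loss a few lines below uses the standard causal-LM shift: logits at position t are
# scored against the label at position t + 1. A self-contained sketch with toy shapes:
import torch
from torch.nn import CrossEntropyLoss

vocab = 11
lm_logits = torch.randn(1, 5, vocab)          # (batch, seq_len, vocab)
labels = torch.randint(0, vocab, (1, 5))      # (batch, seq_len)
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
# ---------------------------------------------------------------------------------------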
transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - labels = labels.to(lm_logits.device) - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) - ) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor - ) -> Tuple[Tuple[torch.Tensor]]: - - return tuple( - tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past - ) - for layer_past in past_key_values - ) - - def chat( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - append_history: bool = True, - stream: Optional[bool] = _SENTINEL, - stop_words_ids: Optional[List[List[int]]] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Tuple[str, HistoryType]: - generation_config = generation_config if generation_config is not None else self.generation_config - - assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - input_ids = torch.tensor([context_tokens]).to(self.device) - outputs = self.generate( - input_ids, - stop_words_ids=stop_words_ids, - return_dict_in_generate=False, - generation_config=generation_config, - **kwargs, - ) - - response = decode_tokens( - outputs[0], - tokenizer, - raw_text_len=len(raw_text), - context_length=len(context_tokens), - chat_format=generation_config.chat_format, - verbose=False, - errors='replace' - ) - - if append_history: - history.append((query, response)) - - return response, history - - def chat_stream( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - stop_words_ids: Optional[List[List[int]]] = None, - logits_processor: Optional[LogitsProcessorList] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Generator[str, Any, None]: - generation_config = 
generation_config if generation_config is not None else self.generation_config - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - input_ids = torch.tensor([context_tokens]).to(self.device) - - from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig - self.__class__.generate_stream = NewGenerationMixin.generate - self.__class__.sample_stream = NewGenerationMixin.sample_stream - stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True) - - def stream_generator(): - outputs = [] - for token in self.generate_stream( - input_ids, - return_dict_in_generate=False, - generation_config=stream_config, - logits_processor=logits_processor, - seed=-1, - **kwargs): - outputs.append(token.item()) - yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore') - - return stream_generator() - - def generate( - self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[ - Callable[[int, torch.Tensor], List[int]] - ] = None, - synced_gpus: Optional[bool] = None, - assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - generation_config = generation_config if generation_config is not None else self.generation_config - - # Process stop_words_ids. 
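# --- Editor's illustrative aside (hedged usage sketch, not part of the deleted file) ---
# chat_stream() above yields the full decoded text accumulated so far after every new
# token (it re-decodes the growing `outputs` list each step), so a caller typically
# prints only the newly appended suffix. A self-contained imitation of that contract:
def fake_chat_stream():
    text = ""
    for piece in ["Hel", "lo", ",", " world", "!"]:
        text += piece
        yield text                    # full prefix each step, like stream_generator()

printed = ""
for partial in fake_chat_stream():
    print(partial[len(printed):], end="", flush=True)
    printed = partial
print()
# ---------------------------------------------------------------------------------------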
- stop_words_ids = kwargs.pop("stop_words_ids", None) - if stop_words_ids is None and generation_config is not None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - if stop_words_ids is None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - - return super().generate( - inputs, - generation_config=generation_config, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - synced_gpus=synced_gpus, - assistant_model=assistant_model, - streamer=streamer, - **kwargs, - ) - - -class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, base=10000): - super().__init__() - self.dim = dim - self.base = base - self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - if importlib.util.find_spec("einops") is None: - raise RuntimeError("einops is required for Rotary Embedding") - - self._rotary_pos_emb_cache = None - self._seq_len_cached = 0 - self._ntk_alpha_cached = 1.0 - - def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): - seqlen = max_seq_len + offset - if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: - base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) - self.inv_freq = 1.0 / ( - base - ** ( - torch.arange(0, self.dim, 2, device=self.inv_freq.device).float() - / self.dim - ) - ) - self._seq_len_cached = max(2 * seqlen, 16) - self._ntk_alpha_cached = ntk_alpha - seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device) - freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq) - emb = torch.cat((freqs, freqs), dim=-1) - from einops import rearrange - - self._rotary_pos_emb_cache = rearrange(emb, "n d -> 1 n 1 d") - - def forward(self, max_seq_len, offset=0, ntk_alpha=1.0): - self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha) - return self._rotary_pos_emb_cache[:, offset : offset + max_seq_len] - - -def _rotate_half(x): - from einops import rearrange - - x = rearrange(x, "... (j d) -> ... 
j d", j=2) - x1, x2 = x.unbind(dim=-2) - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(t, freqs): - if apply_rotary_emb_func is not None and t.is_cuda: - t_ = t.float() - freqs = freqs.squeeze(0).squeeze(1) - cos = freqs[:, : freqs.shape[-1] // 2].cos() - sin = freqs[:, : freqs.shape[-1] // 2].sin() - output = apply_rotary_emb_func(t_, cos, sin).type_as(t) - return output - else: - rot_dim = freqs.shape[-1] - t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] - t_ = t_.float() - t_pass_ = t_pass_.float() - t_ = (t_ * freqs.cos()) + (_rotate_half(t_) * freqs.sin()) - return torch.cat((t_, t_pass_), dim=-1).type_as(t) - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - if rms_norm is not None and x.is_cuda: - return rms_norm(x, self.weight, self.eps) - else: - output = self._norm(x.float()).type_as(x) - return output * self.weight diff --git a/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py b/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py deleted file mode 100755 index d7b3c4798..000000000 --- a/transformers/llm/export/llm_models/Qwen-VL-Chat/modeling_qwen.py +++ /dev/null @@ -1,1162 +0,0 @@ -# Copyright (c) Alibaba Cloud. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -import math -from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch.cuda.amp import autocast - -from torch.nn import CrossEntropyLoss -from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList -from transformers.generation.logits_process import LogitsProcessorList - -if TYPE_CHECKING: - from transformers.generation.streamers import BaseStreamer -from transformers.generation.utils import GenerateOutput -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging - -try: - from einops import rearrange -except ImportError: - rearrange = None -from torch import nn - -SUPPORT_CUDA = torch.cuda.is_available() -SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported() -SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7 - -from .configuration_qwen import QWenConfig -from .qwen_generation_utils import ( - HistoryType, - make_context, - decode_tokens, - get_stop_words_ids, - StopWordsLogitsProcessor, -) -from .visual import VisionTransformer - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "qwen" -_CONFIG_FOR_DOC = "QWenConfig" - -QWen_PRETRAINED_MODEL_ARCHIVE_LIST = ["qwen-7b"] - -_ERROR_BAD_CHAT_FORMAT = """\ -We detect you are probably using the pretrained model (rather than chat model) for chatting, since the chat_format in generation_config is not "chatml". -If you are directly using the model downloaded from Huggingface, please make sure you are using our "Qwen/Qwen-7B-Chat" Huggingface model (rather than "Qwen/Qwen-7B") when you call model.chat(). 
-我们检测到您可能在使用预训练模型(而非chat模型)进行多轮chat,因为您当前在generation_config指定的chat_format,并未设置为我们在对话中所支持的"chatml"格式。 -如果您在直接使用我们从Huggingface提供的模型,请确保您在调用model.chat()时,使用的是"Qwen/Qwen-7B-Chat"模型(而非"Qwen/Qwen-7B"预训练模型)。 -""" - -_SENTINEL = object() -_ERROR_STREAM_IN_CHAT = """\ -Pass argument `stream` to model.chat() is buggy, deprecated, and marked for removal. Please use model.chat_stream(...) instead of model.chat(..., stream=True). -向model.chat()传入参数stream的用法可能存在Bug,该用法已被废弃,将在未来被移除。请使用model.chat_stream(...)代替model.chat(..., stream=True)。 -""" - -apply_rotary_emb_func = None -rms_norm = None - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class QWenAttention(nn.Module): - def __init__(self, config): - super().__init__() - - self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) - self.seq_length = config.seq_length - - self.hidden_size = config.hidden_size - self.split_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - - self.scale_attn_weights = True - - self.projection_size = config.kv_channels * config.num_attention_heads - - assert self.projection_size % config.num_attention_heads == 0 - self.hidden_size_per_attention_head = ( - self.projection_size // config.num_attention_heads - ) - - self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size) - - self.c_proj = nn.Linear( - config.hidden_size, self.projection_size, bias=not config.no_bias - ) - - self.is_fp32 = not (config.bf16 or config.fp16) - self.bf16 = config.bf16 - - self.use_dynamic_ntk = config.use_dynamic_ntk - self.use_logn_attn = config.use_logn_attn - - logn_list = [ - math.log(i, self.seq_length) if i > self.seq_length else 1 - for i in range(1, 32768) - ] - self.logn_tensor = torch.tensor(logn_list)[None, :, None, None] - - self.attn_dropout = nn.Dropout(config.attn_dropout_prob) - - def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None): - attn_weights = torch.matmul(query, key.transpose(-1, -2)) - - if self.scale_attn_weights: - attn_weights = attn_weights / math.sqrt(self.head_dim) - - # causal_mask = self.bias[ - # :, :, key_length - query_length : key_length, :key_length - # ] - # mask_value = 
torch.finfo(attn_weights.dtype).min - # mask_value = torch.full([], mask_value, dtype=attn_weights.dtype).to( - # attn_weights.device - # ) - # attn_weights = torch.where( - # causal_mask, attn_weights.to(attn_weights.dtype), mask_value - # ) - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - attn_output = attn_output.transpose(1, 2) - - return attn_output, attn_weights - - def _upcast_and_reordered_attn( - self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None - ): - bsz, num_heads, q_seq_len, dk = query.size() - _, _, k_seq_len, _ = key.size() - - attn_weights = torch.empty( - bsz * num_heads, - q_seq_len, - k_seq_len, - dtype=torch.float32, - device=query.device, - ) - - scale_factor = 1.0 - if self.scale_attn_weights: - scale_factor /= float(value.size(-1)) ** 0.5 - - with autocast(enabled=False): - q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape( - -1, dk, k_seq_len - ) - attn_weights = torch.baddbmm( - attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor - ) - attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len) - - query_length, key_length = query.size(-2), key.size(-2) - causal_mask = registered_causal_mask[ - :, :, key_length - query_length : key_length, :key_length - ] - mask_value = torch.finfo(attn_weights.dtype).min - mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to( - attn_weights.device - ) - attn_weights = torch.where(causal_mask, attn_weights, mask_value) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - - if attn_weights.dtype != torch.float32: - raise RuntimeError( - "Error with upcasting, attn_weights does not have dtype torch.float32" - ) - attn_weights = attn_weights.type(value.dtype) - attn_weights = self.attn_dropout(attn_weights) - - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_output = torch.matmul(attn_weights, value) - - return attn_output, attn_weights - - def _split_heads(self, tensor, num_heads, attn_head_size): - new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) - tensor = tensor.view(new_shape) - return tensor - - def _merge_heads(self, tensor, num_heads, attn_head_size): - tensor = tensor.contiguous() - new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) - return tensor.view(new_shape) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - rotary_pos_emb: Optional[List[torch.Tensor]] = None, - registered_causal_mask: Optional[torch.Tensor] = None, - layer_past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ): - - mixed_x_layer = self.c_attn(hidden_states) - - query, key, value = mixed_x_layer.split(self.split_size, dim=2) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - if 
rotary_pos_emb is not None: - ''' - cur_len = query.shape[1] - rotary_pos_emb = [i[:, -cur_len:, :, :] for i in rotary_pos_emb] - rotary_pos_emb = (rotary_pos_emb,) * 2 - q_pos_emb, k_pos_emb = rotary_pos_emb - # Slice the pos emb for current inference - print('len(q_pos_emb) = ', len(q_pos_emb)) # 2 - print('q_pos_emb[0].shape = ', q_pos_emb[0].shape) # 1, 20, 1, 128 - query = apply_rotary_pos_emb(query, q_pos_emb) - key = apply_rotary_pos_emb(key, k_pos_emb) - ''' - query = apply_rotary_pos_emb(query, rotary_pos_emb) - key = apply_rotary_pos_emb(key, rotary_pos_emb) - - if layer_past is not None: - past_key, past_value = layer_past[0], layer_past[1] - key = torch.cat((past_key, key), dim=1) - value = torch.cat((past_value, value), dim=1) - - if use_cache: - present = torch.stack([key, value]) - else: - present = None - - if self.use_logn_attn and not self.training and False: - if self.logn_tensor.device != query.device or self.logn_tensor.dtype != query.dtype: - self.logn_tensor = self.logn_tensor.to(query.device).type_as(query) - seq_start = key.size(1) - query.size(1) - seq_end = key.size(1) - logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :] - query = query * logn_tensor.expand_as(query) - - query = query.permute(0, 2, 1, 3) - key = key.permute(0, 2, 1, 3) - value = value.permute(0, 2, 1, 3) - attn_output, attn_weight = self._attn( - query, key, value, registered_causal_mask, attention_mask, head_mask - ) - context_layer = self._merge_heads( - attn_output, self.num_heads, self.head_dim - ) - - attn_output = self.c_proj(context_layer) - - outputs = (attn_output, present) - if output_attentions: - outputs += (attn_weight,) - - return outputs - - -class QWenMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.w1 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - self.w2 = nn.Linear( - config.hidden_size, config.intermediate_size // 2, bias=not config.no_bias - ) - ff_dim_in = config.intermediate_size // 2 - self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias=not config.no_bias) - - def forward(self, hidden_states): - a1 = self.w1(hidden_states) - a2 = self.w2(hidden_states) - intermediate_parallel = a1 * F.silu(a2) - output = self.c_proj(intermediate_parallel) - return output - -class QWenBlock(nn.Module): - def __init__(self, config): - super().__init__() - hidden_size = config.hidden_size - self.bf16 = config.bf16 - - self.ln_1 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - self.attn = QWenAttention(config) - self.ln_2 = RMSNorm( - hidden_size, - eps=config.layer_norm_epsilon, - ) - - self.mlp = QWenMLP(config) - - def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - rotary_pos_emb: Optional[List[torch.Tensor]] = None, - registered_causal_mask: Optional[torch.Tensor] = None, - layer_past: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ): - layernorm_output = self.ln_1(hidden_states) - - attn_outputs = self.attn( - layernorm_output, - rotary_pos_emb, - registered_causal_mask=registered_causal_mask, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - attn_output = attn_outputs[0] - - 
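# --- Editor's illustrative aside (not part of the deleted file) ------------------------
# QWenMLP above is a gated ("SwiGLU-style") feed-forward: two half-width projections are
# combined as w1(x) * silu(w2(x)) before the output projection. Standalone sketch:
import torch
import torch.nn.functional as F

hidden, intermediate = 8, 32
x = torch.randn(2, hidden)
w1 = torch.nn.Linear(hidden, intermediate // 2, bias=False)
w2 = torch.nn.Linear(hidden, intermediate // 2, bias=False)
c_proj = torch.nn.Linear(intermediate // 2, hidden, bias=False)
y = c_proj(w1(x) * F.silu(w2(x)))     # gated activation, back to (2, hidden)
assert y.shape == x.shape
# ---------------------------------------------------------------------------------------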
outputs = attn_outputs[1:] - - residual = hidden_states - layernorm_input = attn_output + residual - - layernorm_output = self.ln_2(layernorm_input) - - residual = layernorm_input - mlp_output = self.mlp(layernorm_output) - hidden_states = residual + mlp_output - - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] - - return outputs - - -class QWenPreTrainedModel(PreTrainedModel): - config_class = QWenConfig - base_model_prefix = "transformer" - is_parallelizable = False - supports_gradient_checkpointing = True - _no_split_modules = ["QWenBlock"] - - def __init__(self, *inputs, **kwargs): - super().__init__(*inputs, **kwargs) - - def _init_weights(self, module): - """Initialize the weights.""" - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, RMSNorm): - module.weight.data.fill_(1.0) - - for name, p in module.named_parameters(): - if name == "c_proj.weight": - p.data.normal_( - mean=0.0, - std=( - self.config.initializer_range - / math.sqrt(2 * self.config.num_hidden_layers) - ), - ) - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, QWenModel): - module.gradient_checkpointing = value - - -class QWenModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - - def __init__(self, config): - super().__init__(config) - self.vocab_size = config.vocab_size - self.num_hidden_layers = config.num_hidden_layers - self.embed_dim = config.hidden_size - - self.gradient_checkpointing = False - self.use_dynamic_ntk = config.use_dynamic_ntk - self.seq_length = config.seq_length - - self.wte = nn.Embedding(self.vocab_size, self.embed_dim) - - self.drop = nn.Dropout(config.emb_dropout_prob) - - if config.rotary_pct == 1.0: - self.rotary_ndims = None - else: - assert config.rotary_pct < 1 - self.rotary_ndims = int( - config.kv_channels * config.rotary_pct - ) - dim = ( - self.rotary_ndims - if self.rotary_ndims is not None - else config.kv_channels - ) - self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base) - - self.use_flash_attn = config.use_flash_attn - self.is_fp32 = not (config.bf16 or config.fp16) - self.registered_causal_mask = None - # if ( - # self.use_flash_attn - # and flash_attn_unpadded_func is not None - # and not self.is_fp32 - # ): - # self.registered_causal_mask = None - # else: - # max_positions = config.max_position_embeddings - # self.register_buffer( - # "registered_causal_mask", - # torch.tril( - # torch.ones((max_positions, max_positions), dtype=torch.bool) - # ).view(1, 1, max_positions, max_positions), - # persistent=False, - # ) - - self.h = nn.ModuleList( - [ - QWenBlock( - config - ) - for i in range(config.num_hidden_layers) - ] - ) - self.ln_f = RMSNorm( - self.embed_dim, - eps=config.layer_norm_epsilon, - ) - - self.visual = VisionTransformer(**config.visual) - - self.post_init() - - def get_input_embeddings(self): - return self.wte - - def set_input_embeddings(self, new_embeddings): - self.wte = new_embeddings - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, 
past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - if past_key_values is None and torch.any(input_ids == self.config.visual['image_start_id']): - bos_pos = torch.where(input_ids == self.config.visual['image_start_id']) - eos_pos = torch.where(input_ids == self.config.visual['image_start_id'] + 1) - assert (bos_pos[0] == eos_pos[0]).all() - img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1) - images = [] - for i, a, b in img_pos: - image = input_ids[i][a + 1 : b - 1].tolist() - image = image[ : image.index(self.config.visual['image_start_id'] + 2)] - images.append(bytes(image).decode('utf-8')) - - images = self.visual.encode(images) - assert images.shape[0] == len(images) - fake_images = None - elif self.training: - fake_images=torch.zeros(1,3,224,224).to( - dtype=self.visual.conv1.weight.dtype, device=self.visual.conv1.weight.device) - images = self.visual(fake_images) - else: - fake_images = None - images = None - - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) - elif input_ids is not None: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - batch_size = input_ids.shape[0] - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size = inputs_embeds.shape[0] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if token_type_ids is not None: - token_type_ids = token_type_ids.view(-1, input_shape[-1]) - if position_ids is not None: - position_ids = position_ids.view(-1, input_shape[-1]) - - if past_key_values is None: - past_length = 0 - 
past_key_values = tuple([None] * len(self.h)) - else: - past_length = past_key_values[0][0].size(-2) - - if position_ids is None: - position_ids = torch.arange( - past_length, - input_shape[-1] + past_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) - - encoder_attention_mask = None - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) - - if batch_size <= 0: - raise ValueError("batch_size has to be defined and > 0") - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, inputs_embeds, past_length - ) - - hidden_states = inputs_embeds - - kv_seq_len = hidden_states.size()[1] - if past_key_values[0] is not None: - # past key values[0][0] shape: bs * seq_len * head_num * dim - kv_seq_len += past_key_values[0][0].shape[1] - if ( - self.use_dynamic_ntk - and kv_seq_len == hidden_states.size()[1] - and not self.training - ): - context_value = math.log(kv_seq_len / self.seq_length, 2) + 1 - ntk_alpha = 2 ** math.ceil(context_value) - 1 - ntk_alpha = max(ntk_alpha, 1) - else: - ntk_alpha = self.rotary_emb._ntk_alpha_cached - - rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha) - for idx in range(len(rotary_pos_emb)): - rotary_pos_emb[idx] = rotary_pos_emb[idx].to(hidden_states.device) - - hidden_states = self.drop(hidden_states).clone() - if fake_images is not None: - hidden_states = hidden_states + images.mean()*0 - elif images is not None: - for idx, (i, a, b) in enumerate(img_pos): - hidden_states[i][a + 1 : b] = images[idx] - output_shape = input_shape + (hidden_states.size(-1),) - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, use_cache, output_attentions) - - return custom_forward - - outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - rotary_pos_emb, - self.registered_causal_mask, - None, - attention_mask, - head_mask[i], - encoder_hidden_states, - encoder_attention_mask, - ) - else: - outputs = block( - hidden_states, - layer_past=layer_past, - rotary_pos_emb=rotary_pos_emb, - registered_causal_mask=self.registered_causal_mask, - attention_mask=attention_mask, - head_mask=head_mask[i], - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - ) - - hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) - - hidden_states = self.ln_f(hidden_states) - hidden_states = hidden_states.view(output_shape) - # Add last hidden state - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v for v in [hidden_states, presents, all_hidden_states] if v is not None - ) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class QWenLMHeadModel(QWenPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias"] - - def __init__(self, config): - super().__init__(config) - assert ( - config.bf16 + config.fp16 + config.fp32 <= 1 - ), "Only one of \"bf16\", \"fp16\", \"fp32\" can be true" - - autoset_precision = config.bf16 + config.fp16 + config.fp32 == 0 - - if autoset_precision: - if SUPPORT_BF16: - logger.warn( - "The model is automatically converting to bf16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." - ) - config.bf16 = True - elif SUPPORT_FP16: - logger.warn( - "The model is automatically converting to fp16 for faster inference. " - "If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to \"AutoModelForCausalLM.from_pretrained\"." 
- ) - config.fp16 = True - else: - config.fp32 = True - - if config.bf16 and SUPPORT_CUDA and not SUPPORT_BF16: - logger.warn("Your device does NOT seem to support bf16, you can switch to fp16 or fp32 by by passing fp16/fp32=True in \"AutoModelForCausalLM.from_pretrained\".") - if config.fp16 and SUPPORT_CUDA and not SUPPORT_FP16: - logger.warn("Your device does NOT support faster inference with fp16, please switch to fp32 which is likely to be faster") - if config.fp32: - if SUPPORT_BF16: - logger.warn("Your device support faster inference by passing bf16=True in \"AutoModelForCausalLM.from_pretrained\".") - elif SUPPORT_FP16: - logger.warn("Your device support faster inference by passing fp16=True in \"AutoModelForCausalLM.from_pretrained\".") - - self.transformer = QWenModel(config) - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - if config.bf16: - self.transformer.bfloat16() - self.lm_head.bfloat16() - if config.fp16: - self.transformer.half() - self.lm_head.half() - self.post_init() - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs - ): - token_type_ids = kwargs.get("token_type_ids", None) - if past_key_values: - input_ids = input_ids[:, -1].unsqueeze(-1) - if token_type_ids is not None: - token_type_ids = token_type_ids[:, -1].unsqueeze(-1) - - attention_mask = kwargs.get("attention_mask", None) - position_ids = kwargs.get("position_ids", None) - - if attention_mask is not None and position_ids is None: - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - else: - position_ids = None - - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "position_ids": position_ids, - "attention_mask": attention_mask, - "token_type_ids": token_type_ids, - } - ) - return model_inputs - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - transformer_outputs = self.transformer( - input_ids, - past_key_values=past_key_values, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=use_cache, - output_attentions=output_attentions, - 
output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - labels = labels.to(lm_logits.device) - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - loss_fct = CrossEntropyLoss() - loss = loss_fct( - shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) - ) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor - ) -> Tuple[Tuple[torch.Tensor]]: - - return tuple( - tuple( - past_state.index_select(0, beam_idx.to(past_state.device)) - for past_state in layer_past - ) - for layer_past in past_key_values - ) - - def chat( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - append_history: bool = True, - stream: Optional[bool] = _SENTINEL, - stop_words_ids: Optional[List[List[int]]] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Tuple[str, HistoryType]: - generation_config = generation_config if generation_config is not None else self.generation_config - - assert stream is _SENTINEL, _ERROR_STREAM_IN_CHAT - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - input_ids = torch.tensor([context_tokens]).to(self.device) - outputs = self.generate( - input_ids, - stop_words_ids=stop_words_ids, - return_dict_in_generate=False, - generation_config=generation_config, - **kwargs, - ) - - response = decode_tokens( - outputs[0], - tokenizer, - raw_text_len=len(raw_text), - context_length=len(context_tokens), - chat_format=generation_config.chat_format, - verbose=False, - errors='replace' - ) - - if append_history: - history.append((query, response)) - - return response, history - - def chat_stream( - self, - tokenizer: PreTrainedTokenizer, - query: str, - history: Optional[HistoryType], - system: str = "You are a helpful assistant.", - stop_words_ids: Optional[List[List[int]]] = None, - logits_processor: Optional[LogitsProcessorList] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ) -> Generator[str, Any, None]: - generation_config = generation_config if generation_config is not None else self.generation_config - assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT - if history is None: - history = [] - if stop_words_ids is None: - stop_words_ids = [] - - max_window_size = kwargs.get('max_window_size', None) - if max_window_size is None: - max_window_size = generation_config.max_window_size - raw_text, 
context_tokens = make_context( - tokenizer, - query, - history=history, - system=system, - max_window_size=max_window_size, - chat_format=generation_config.chat_format, - ) - - stop_words_ids.extend(get_stop_words_ids( - generation_config.chat_format, tokenizer - )) - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - input_ids = torch.tensor([context_tokens]).to(self.device) - - from transformers_stream_generator.main import NewGenerationMixin, StreamGenerationConfig - self.__class__.generate_stream = NewGenerationMixin.generate - self.__class__.sample_stream = NewGenerationMixin.sample_stream - stream_config = StreamGenerationConfig(**generation_config.to_dict(), do_stream=True) - - def stream_generator(): - outputs = [] - for token in self.generate_stream( - input_ids, - return_dict_in_generate=False, - generation_config=stream_config, - logits_processor=logits_processor, - seed=-1, - **kwargs): - outputs.append(token.item()) - yield tokenizer.decode(outputs, skip_special_tokens=True, errors='ignore', keep_image_special=True) - - return stream_generator() - - def generate( - self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[ - Callable[[int, torch.Tensor], List[int]] - ] = None, - synced_gpus: Optional[bool] = None, - assistant_model: Optional["PreTrainedModel"] = None, - streamer: Optional["BaseStreamer"] = None, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - generation_config = generation_config if generation_config is not None else self.generation_config - - # Process stop_words_ids. 
- stop_words_ids = kwargs.pop("stop_words_ids", None) - if stop_words_ids is None and generation_config is not None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - if stop_words_ids is None: - stop_words_ids = getattr(generation_config, "stop_words_ids", None) - - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - - return super().generate( - inputs, - generation_config=generation_config, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - synced_gpus=synced_gpus, - assistant_model=assistant_model, - streamer=streamer, - **kwargs, - ) - - -class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, base=10000): - super().__init__() - self.dim = dim - self.base = base - self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - if importlib.util.find_spec("einops") is None: - raise RuntimeError("einops is required for Rotary Embedding") - - self._rotary_pos_emb_cache = None - self._seq_len_cached = 0 - self._ntk_alpha_cached = 1.0 - - def update_rotary_pos_emb_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): - seqlen = max_seq_len + offset - if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: - base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) - self.inv_freq = 1.0 / ( - base - ** ( - torch.arange(0, self.dim, 2, device=self.inv_freq.device).float() - / self.dim - ) - ) - self._seq_len_cached = max(2 * seqlen, 16) - self._ntk_alpha_cached = ntk_alpha - seq = torch.arange(self._seq_len_cached, device=self.inv_freq.device) - freqs = torch.outer(seq.type_as(self.inv_freq), self.inv_freq) - - emb = torch.cat((freqs, freqs), dim=-1) - from einops import rearrange - - emb = rearrange(emb, "n d -> 1 n 1 d") - - cos, sin = emb.cos(), emb.sin() - self._rotary_pos_emb_cache = [cos, sin] - - def forward(self, max_seq_len, offset=0, ntk_alpha=1.0): - self.update_rotary_pos_emb_cache(max_seq_len, offset, ntk_alpha) - cos, sin = self._rotary_pos_emb_cache - return [cos[:, offset : offset + max_seq_len], sin[:, offset : offset + max_seq_len]] - - -def _rotate_half(x): - from einops import rearrange - - x = rearrange(x, "... (j d) -> ... 
j d", j=2) - x1, x2 = x.unbind(dim=-2) - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(t, freqs): - cos, sin = freqs - if apply_rotary_emb_func is not None and t.is_cuda: - t_ = t.float() - cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2] - sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2] - output = apply_rotary_emb_func(t_, cos, sin).type_as(t) - return output - else: - rot_dim = freqs[0].shape[-1] - cos, sin = freqs - t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:] - t_ = t_.float() - t_pass_ = t_pass_.float() - t_ = (t_ * cos) + (_rotate_half(t_) * sin) - return torch.cat((t_, t_pass_), dim=-1).type_as(t) - - -class RMSNorm(torch.nn.Module): - def __init__(self, dim: int, eps: float = 1e-6): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) - - def forward(self, x): - if rms_norm is not None and x.is_cuda: - return rms_norm(x, self.weight, self.eps) - else: - output = self._norm(x.float()).type_as(x) - return output * self.weight diff --git a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/config.json b/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/config.json deleted file mode 100755 index ea93bc66a..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/config.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "hidden_act": "silu", - "hidden_size": 1024, - "initializer_range": 0.02, - "intermediate_size": 2816, - "max_position_embeddings": 32768, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 16, - "num_hidden_layers": 24, - "num_key_value_heads": 16, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 32768, - "tie_word_embeddings": true, - "torch_dtype": "bfloat16", - "transformers_version": "4.37.0", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} - diff --git a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. 
- max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py deleted file mode 100644 index 595a3e91c..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-0_5B-Chat/modeling_qwen2.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
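For orientation, the deleted Qwen1_5-0_5B-Chat/config.json above maps one-to-one onto the constructor arguments of the Qwen2Config class that was just removed. A sketch using the upstream transformers class (assumes a transformers release that ships Qwen2, i.e. >= 4.37; illustrative only, not part of the patch):

    from transformers import Qwen2Config

    # Values copied from the deleted Qwen1_5-0_5B-Chat/config.json.
    config = Qwen2Config(
        vocab_size=151936,
        hidden_size=1024,
        intermediate_size=2816,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=16,
        max_position_embeddings=32768,
        rms_norm_eps=1e-6,
        rope_theta=1000000.0,
        tie_word_embeddings=True,
        use_sliding_window=False,
        sliding_window=32768,
        max_window_layers=21,
        attention_dropout=0.0,
    )
    print(config.model_type)  # -> "qwen2"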
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[0].shape[2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - else: - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=2) - value_states = torch.cat((past_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - past_key_value = torch.stack((key_states, value_states)) - # repeat k/v heads if n_kv_heads < n_heads - # key_states = repeat_kv(key_states, self.num_key_value_groups) - # value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - 
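The rewritten Qwen2Attention.forward above replaces the transformers Cache object with a plain stacked tensor and keeps activations in [batch, seq, heads, head_dim] layout until the matmul, which is what makes the graph straightforward to export. A minimal sketch of that cache-update-and-attend pattern (names and shapes are illustrative; grouped-query repetition is omitted because the exported 0.5B config uses num_key_value_heads == num_attention_heads):

    import math
    import torch

    def cached_attention(q, k, v, past_kv=None, mask=None):
        # q, k, v: [batch, new_seq, heads, head_dim]
        # past_kv: [2, batch, past_seq, heads, head_dim] or None
        if past_kv is not None:
            k = torch.cat((past_kv[0], k), dim=1)  # append along the sequence axis
            v = torch.cat((past_kv[1], v), dim=1)
        present_kv = torch.stack((k, v))           # returned to the caller as the new cache

        # [batch, heads, new_seq, kv_seq] attention scores
        scores = torch.einsum("bqhd,bkhd->bhqk", q, k) / math.sqrt(q.shape[-1])
        if mask is not None:
            scores = scores + mask
        probs = torch.softmax(scores.float(), dim=-1).to(q.dtype)
        out = torch.einsum("bhqk,bkhd->bqhd", probs, v)
        return out.reshape(*out.shape[:2], -1), present_kv

Returning present_kv as one stacked tensor mirrors the torch.stack((key_states, value_states)) call above, so the exporter sees a fixed tensor output instead of a Cache object.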
- -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
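    For illustration, the last-token selection described above reduces to the index computation below (a sketch
    mirroring the pooling code in `forward`; `pad_token_id=0` and the token values are made up):

    ```python
    >>> import torch
    >>> input_ids = torch.tensor([[11, 12, 13, 0, 0],     # right-padded row
    ...                           [21, 22, 23, 24, 25]])  # row with no padding
    >>> sequence_lengths = torch.eq(input_ids, 0).int().argmax(-1) - 1
    >>> sequence_lengths % input_ids.shape[-1]             # modulo keeps ONNX-friendly indexing
    tensor([2, 4])
    ```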
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/config.json b/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/config.json deleted file mode 100755 index 26ce493f6..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "hidden_act": "silu", - "hidden_size": 2048, - "initializer_range": 0.02, - "intermediate_size": 5504, - "max_position_embeddings": 32768, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 16, - "num_hidden_layers": 24, - "num_key_value_heads": 16, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 32768, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.37.0", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
- - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
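    For reference, the `config.json` removed alongside this file (Qwen1_5-1_8B-Chat) corresponds roughly to the
    sketch below; the keyword values are copied from that JSON, everything else keeps the defaults documented above:

    ```python
    config = Qwen2Config(
        vocab_size=151936,
        hidden_size=2048,
        intermediate_size=5504,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=16,
        max_position_embeddings=32768,
        rms_norm_eps=1e-6,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=32768,
        max_window_layers=21,
        tie_word_embeddings=False,
        attention_dropout=0.0,
    )
    ```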
- - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py deleted file mode 100644 index 595a3e91c..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-1_8B-Chat/modeling_qwen2.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
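    Example (a minimal shape sketch using the `Qwen2RotaryEmbedding` defined in this file; the sizes are illustrative):

    ```python
    >>> import torch
    >>> rope = Qwen2RotaryEmbedding(dim=64)
    >>> q = torch.randn(1, 4, 8, 64)          # (batch, heads, seq_len, head_dim)
    >>> k = torch.randn(1, 4, 8, 64)
    >>> cos, sin = rope(q, seq_len=8)         # each of shape (seq_len, head_dim)
    >>> position_ids = torch.arange(8).unsqueeze(0)
    >>> q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
    >>> q_rot.shape
    torch.Size([1, 4, 8, 64])
    ```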
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[0].shape[2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - else: - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=2) - value_states = torch.cat((past_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - past_key_value = torch.stack((key_states, value_states)) - # repeat k/v heads if n_kv_heads < n_heads - # key_states = repeat_kv(key_states, self.num_key_value_groups) - # value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - 
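Compared with the upstream implementation (kept above in the commented-out block), this exported copy expects the rotary embedding to be supplied from outside as a `(cos, sin)` pair and carries the KV cache as one stacked tensor rather than a `Cache` object. The sketch below shows one way to drive it; the tiny config, the broadcast shape chosen for `cos`/`sin`, the zero-length initial cache, and the all-zeros additive mask are illustrative assumptions, not something this file prescribes.

```python
import torch

# Toy sizes for illustration only; the real values live in the accompanying config.json.
cfg = Qwen2Config(hidden_size=128, num_attention_heads=4, num_key_value_heads=4,
                  intermediate_size=256, num_hidden_layers=1, vocab_size=1024)
attn = Qwen2Attention(cfg, layer_idx=0)

bsz, q_len = 1, 8
head_dim = cfg.hidden_size // cfg.num_attention_heads
hidden = torch.randn(bsz, q_len, cfg.hidden_size)

# rotary_pos_emb is passed in as (cos, sin), broadcastable to (bsz, q_len, heads, head_dim).
cos, sin = attn.rotary_emb(hidden, seq_len=q_len)        # each (q_len, head_dim)
cos = cos.view(1, q_len, 1, head_dim)
sin = sin.view(1, q_len, 1, head_dim)

# The cache is a stacked tensor (2, bsz, past_len, kv_heads, head_dim); start it empty.
past = torch.zeros(2, bsz, 0, cfg.num_key_value_heads, head_dim)

# Additive mask of shape (bsz, 1, q_len, kv_len); zeros = every position visible.
mask = torch.zeros(bsz, 1, q_len, q_len)

out, _, present = attn(hidden, attention_mask=mask,
                       rotary_pos_emb=(cos, sin), past_key_value=past)
print(out.shape, present.shape)   # torch.Size([1, 8, 128]) torch.Size([2, 1, 8, 4, 32])
```

On the next decoding step `present` would be fed back as `past_key_value`, `q_len` drops to 1, and the additive mask widens to the new key length (`past_len + 1`).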
- -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
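During incremental decoding, the forward pass above offsets the freshly created position ids by the number of tokens already held in the cache. A toy recomputation of that `torch.arange` construction (the helper name and sizes here are illustrative only):

```python
import torch

def make_position_ids(seq_length, past_key_values_length, device="cpu"):
    """Position ids for the tokens fed in this step, offset by the cache length,
    mirroring the torch.arange construction in the forward pass above."""
    position_ids = torch.arange(past_key_values_length,
                                seq_length + past_key_values_length,
                                dtype=torch.long, device=device)
    return position_ids.unsqueeze(0).view(-1, seq_length)

print(make_position_ids(seq_length=1, past_key_values_length=7))  # tensor([[7]])
print(make_position_ids(seq_length=4, past_key_values_length=0))  # tensor([[0, 1, 2, 3]])
```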
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
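`prepare_inputs_for_generation` above rebuilds `position_ids` from the attention mask, so left-padded rows still start counting positions at 0. A short recomputation with a made-up two-row batch:

```python
import torch

# Illustrative only: recreate position_ids from a (possibly left-padded) attention mask,
# exactly as done in prepare_inputs_for_generation above.
attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```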
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/config.json b/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/config.json deleted file mode 100755 index 9f2be4f60..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "hidden_act": "silu", - "hidden_size": 2560, - "initializer_range": 0.02, - "intermediate_size": 6912, - "max_position_embeddings": 32768, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 20, - "num_hidden_layers": 40, - "num_key_value_heads": 20, - "rms_norm_eps": 1e-06, - "rope_theta": 5000000.0, - "sliding_window": 32768, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.37.0", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
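The classification head above pools the logits of the last non-padding token in each row: `argmax` over the pad-token mask finds the first pad position, subtracting 1 gives the last real token, and the modulo keeps the index valid for rows without any padding. A toy recomputation with invented ids and logits:

```python
import torch

# Toy re-run of the "last non-padding token" pooling used by the classification head above.
pad_token_id = 0
input_ids = torch.tensor([[11, 12, 13, pad_token_id, pad_token_id],
                          [21, 22, 23, 24, 25]])
logits = torch.randn(2, 5, 3)  # (batch, seq_len, num_labels)

sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]  # handles rows with no padding
pooled_logits = logits[torch.arange(2), sequence_lengths]
print(sequence_lengths)     # tensor([2, 4])
print(pooled_logits.shape)  # torch.Size([2, 3])
```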
- - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py deleted file mode 100644 index 595a3e91c..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-4B-Chat/modeling_qwen2.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
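`apply_rotary_pos_emb` combines the cached cos/sin tables with `rotate_half` to rotate query and key channel pairs by a position-dependent angle. A compact sketch that rebuilds the tables the same way as `_set_cos_sin_cache` above (toy sizes) and checks that the rotation leaves vector norms unchanged:

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

# Build the cos/sin tables the same way as _set_cos_sin_cache above (toy sizes).
dim, seq_len, base = 8, 5, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)  # (seq_len, dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(1, 1, seq_len, dim)      # (batch, heads, seq, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)
q_rot = (q * cos[position_ids].unsqueeze(1)) + (rotate_half(q) * sin[position_ids].unsqueeze(1))

# The rotation is norm-preserving, so each token vector keeps its length.
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True
```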
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[0].shape[2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - else: - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=2) - value_states = torch.cat((past_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - past_key_value = torch.stack((key_states, value_states)) - # repeat k/v heads if n_kv_heads < n_heads - # key_states = repeat_kv(key_states, self.num_key_value_groups) - # value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - 
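The eager attention path above computes softmax(QK^T / sqrt(head_dim) + mask) V; the inlined variant only changes the tensor layout (it permutes K to (batch, heads, head_dim, seq) up front so the exported graph avoids a transpose). A small check with toy shapes, under the assumption of a plain causal mask, that this manual formulation agrees with torch's fused `scaled_dot_product_attention`:

```python
import math
import torch
import torch.nn.functional as F

bsz, heads, q_len, head_dim = 1, 2, 4, 8
q = torch.randn(bsz, heads, q_len, head_dim)
k = torch.randn(bsz, heads, q_len, head_dim)
v = torch.randn(bsz, heads, q_len, head_dim)

# Additive causal mask: 0 on/below the diagonal, -inf strictly above it.
mask = torch.full((q_len, q_len), float("-inf")).triu(1)

attn_weights = (q @ k.transpose(-1, -2)) / math.sqrt(head_dim) + mask
attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
manual = attn_weights @ v

fused = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(manual, fused, atol=1e-5))  # True
```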
- -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
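The varlen flash-attention path packs only the real (non-padding) tokens, driven by the per-sequence lengths, flattened token indices, and cumulative sequence boundaries produced by `_get_unpad_data` near the top of this file. A toy recomputation of those three quantities for an invented two-row mask:

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]], dtype=torch.int32)

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

print(seqlens_in_batch)  # tensor([3, 5], dtype=torch.int32)
print(cu_seqlens)        # tensor([0, 3, 8], dtype=torch.int32)
print(indices)           # tensor([0, 1, 2, 5, 6, 7, 8, 9])
```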
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
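For orientation, the residual wiring that `Qwen2DecoderLayer` above implements reduces to a pre-norm pattern; the sketch below uses stand-in sub-modules (`nn.LayerNorm` and `nn.Linear` instead of the real RMSNorm, attention, and SwiGLU MLP), so it is illustrative only, not the actual layer.

```python
import torch
from torch import nn

# Pre-norm residual block: x + Attn(Norm(x)), then x + MLP(Norm(x)).
# All sub-modules here are simplified stand-ins.
class TinyPreNormBlock(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        self.input_layernorm = nn.LayerNorm(hidden_size)        # stand-in for Qwen2RMSNorm
        self.post_attention_layernorm = nn.LayerNorm(hidden_size)
        self.self_attn = nn.Linear(hidden_size, hidden_size)    # stand-in for self-attention
        self.mlp = nn.Linear(hidden_size, hidden_size)          # stand-in for Qwen2MLP

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = hidden_states + self.self_attn(self.input_layernorm(hidden_states))
        hidden_states = hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))
        return hidden_states

x = torch.randn(1, 5, 16)
print(TinyPreNormBlock(16)(x).shape)  # torch.Size([1, 5, 16])
```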
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
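The position-id construction in the forward pass above is an `arange` offset by the number of cached tokens; a small check with assumed lengths:

```python
import torch

past_key_values_length, seq_length = 3, 4        # assumed: 3 cached tokens, 4 new ones
position_ids = torch.arange(
    past_key_values_length, seq_length + past_key_values_length, dtype=torch.long
).unsqueeze(0).view(-1, seq_length)
print(position_ids)  # tensor([[3, 4, 5, 6]])
```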
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
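A sketch of the last-token pooling just described, with an assumed `pad_token_id` of 0 and made-up inputs; the modulo keeps the index valid for rows that contain no padding.

```python
import torch

pad_token_id = 0
input_ids = torch.tensor([[5, 7, 9, 0, 0],        # right-padded row
                          [3, 4, 6, 8, 2]])       # full row, no padding
logits = torch.randn(2, 5, 3)                     # (batch, seq_len, num_labels)

sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
pooled_logits = logits[torch.arange(2), sequence_lengths]   # (batch, num_labels)
print(sequence_lengths)  # tensor([2, 4])
```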
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/config.json b/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/config.json deleted file mode 100755 index 6b0cfb9b4..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 32768, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 32, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 32768, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.37.0", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
- - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
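A small sketch of what `num_key_value_heads` controls, with assumed toy shapes: queries keep `num_attention_heads` heads while keys and values keep `num_key_value_heads` heads and are repeated per group before the score computation (`num_key_value_heads == num_attention_heads` gives MHA, `== 1` gives MQA, anything in between gives GQA).

```python
import torch

num_attention_heads, num_key_value_heads, head_dim, seq_len = 8, 2, 4, 3
q = torch.randn(1, num_attention_heads, seq_len, head_dim)
k = torch.randn(1, num_key_value_heads, seq_len, head_dim)

n_rep = num_attention_heads // num_key_value_heads       # 4 query heads share one kv head
k_rep = k.repeat_interleave(n_rep, dim=1)                 # (1, 8, seq_len, head_dim)
scores = q @ k_rep.transpose(-1, -2)                      # (1, 8, seq_len, seq_len)
print(k_rep.shape, scores.shape)
```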
- - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py deleted file mode 100644 index 595a3e91c..000000000 --- a/transformers/llm/export/llm_models/Qwen1_5-7B-Chat/modeling_qwen2.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
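Assuming a `transformers` release that ships Qwen2 natively (4.37 or later), the configuration deleted above could equivalently be built in code; the values below are taken from the removed `config.json`.

```python
from transformers import Qwen2Config  # assumes transformers >= 4.37

config = Qwen2Config(
    vocab_size=151936,
    hidden_size=4096,
    intermediate_size=11008,
    num_hidden_layers=32,
    num_attention_heads=32,
    num_key_value_heads=32,
    max_position_embeddings=32768,
    rms_norm_eps=1e-6,
    rope_theta=1000000.0,
    use_sliding_window=False,
    tie_word_embeddings=False,
)
print(config.model_type)  # "qwen2"
```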
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
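A self-contained sketch of the rotary embedding application documented above, using the same `rotate_half` convention; the shapes and the base of 10000 are assumed toy values.

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

dim, seq_len = 8, 4
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)              # (seq_len, dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(1, 2, seq_len, dim)                  # (batch, heads, seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)    # (batch, seq_len)
cos_p = cos[position_ids].unsqueeze(1)               # unsqueeze_dim=1 -> broadcast over heads
sin_p = sin[position_ids].unsqueeze(1)
q_embed = (q * cos_p) + (rotate_half(q) * sin_p)
print(q_embed.shape)  # torch.Size([1, 2, 4, 8])
```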
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[0].shape[2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - else: - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=2) - value_states = torch.cat((past_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - past_key_value = torch.stack((key_states, value_states)) - # repeat k/v heads if n_kv_heads < n_heads - # key_states = repeat_kv(key_states, self.num_key_value_groups) - # value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - 
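The eager attention path above boils down to a scaled dot product with an additive mask and a float32 softmax; a minimal sketch with assumed shapes (no dropout, no projections).

```python
import math
import torch

bsz, num_heads, q_len, kv_seq_len, head_dim = 1, 2, 3, 5, 4
query = torch.randn(bsz, num_heads, q_len, head_dim)
key = torch.randn(bsz, num_heads, kv_seq_len, head_dim)
value = torch.randn(bsz, num_heads, kv_seq_len, head_dim)
attention_mask = torch.zeros(bsz, 1, q_len, kv_seq_len)     # additive mask, 0 = visible

attn_weights = query @ key.transpose(-1, -2) / math.sqrt(head_dim) + attention_mask
attn_weights = torch.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_output = attn_weights @ value                          # (bsz, num_heads, q_len, head_dim)
print(attn_output.shape)
```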
- -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json deleted file mode 100755 index 8f9ea8a58..000000000 --- a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "hidden_act": "silu", - "hidden_size": 896, - "initializer_range": 0.02, - "intermediate_size": 4864, - "max_position_embeddings": 32768, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 14, - "num_hidden_layers": 24, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 32768, - "tie_word_embeddings": true, - "torch_dtype": "bfloat16", - "transformers_version": "4.40.1", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
- - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py deleted file mode 100644 index 595a3e91c..000000000 --- a/transformers/llm/export/llm_models/Qwen2-0_5B-Instruct/modeling_qwen2.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
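For orientation, a minimal sketch (not definitive, just an illustration) of how the standalone `Qwen2Config` above maps onto the values from the deleted `Qwen2-0_5B-Instruct/config.json`; it assumes `configuration_qwen2.py` is importable from the working directory and that `transformers` is installed. The derived `head_dim` and GQA group count are the sizes the attention code below is built around.

```python
# Sketch only: values copied from the deleted Qwen2-0_5B-Instruct/config.json.
# Assumes the configuration_qwen2.py shown above sits next to this script.
from configuration_qwen2 import Qwen2Config

config = Qwen2Config(
    vocab_size=151936,
    hidden_size=896,
    intermediate_size=4864,
    num_hidden_layers=24,
    num_attention_heads=14,
    num_key_value_heads=2,
    max_position_embeddings=32768,
    rms_norm_eps=1e-6,
    rope_theta=1000000.0,
    use_sliding_window=False,
    sliding_window=32768,
    max_window_layers=21,
    tie_word_embeddings=True,
)

# Sizes the attention code below relies on.
head_dim = config.hidden_size // config.num_attention_heads           # 896 // 14 = 64
kv_groups = config.num_attention_heads // config.num_key_value_heads  # 14 // 2 = 7 (GQA)
print(head_dim, kv_groups)
```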
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[0].shape[2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - else: - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=2) - value_states = torch.cat((past_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - past_key_value = torch.stack((key_states, value_states)) - # repeat k/v heads if n_kv_heads < n_heads - # key_states = repeat_kv(key_states, self.num_key_value_groups) - # value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - 
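The functional change relative to upstream is the `#---------------` block in `Qwen2Attention.forward` above: the commented-out upstream path is replaced by one where `cos`/`sin` arrive precomputed through `rotary_pos_emb`, the KV cache is a plain stacked tensor concatenated along the sequence axis, and the key tensor is permuted to `[batch, heads, head_dim, kv_len]` so the score matmul needs no further transpose, presumably to keep the graph easy to trace for MNN's LLM export. Below is a self-contained shape sketch of that path under illustrative 0.5B-sized dimensions (RoPE omitted for brevity; `torch.repeat_interleave` stands in for the file's `repeat_kv` helper, which its own docstring describes as equivalent).

```python
# Shape-flow sketch of the modified attention path (illustrative sizes only).
import torch

bsz, q_len, past_len = 1, 4, 6
num_heads, num_kv_heads, head_dim = 14, 2, 64          # Qwen2-0.5B-like sizes
groups = num_heads // num_kv_heads                      # 7

q = torch.randn(bsz, q_len, num_heads, head_dim)
k = torch.randn(bsz, q_len, num_kv_heads, head_dim)
v = torch.randn(bsz, q_len, num_kv_heads, head_dim)
past_k = torch.randn(bsz, past_len, num_kv_heads, head_dim)
past_v = torch.randn(bsz, past_len, num_kv_heads, head_dim)

# KV cache: concatenate along the sequence axis (dim=1), re-stack for the next step.
k = torch.cat((past_k, k), dim=1)                       # [bsz, kv_len, kv_heads, head_dim]
v = torch.cat((past_v, v), dim=1)
present_key_value = torch.stack((k, v))                 # [2, bsz, kv_len, kv_heads, head_dim]

q = q.transpose(1, 2)                                   # [bsz, heads, q_len, head_dim]
k = k.permute(0, 2, 3, 1)                               # [bsz, kv_heads, head_dim, kv_len]
v = v.transpose(1, 2)                                   # [bsz, kv_heads, kv_len, head_dim]

# GQA: replicate the 2 KV heads to cover the 14 query heads.
k = torch.repeat_interleave(k, groups, dim=1)           # [bsz, heads, head_dim, kv_len]
v = torch.repeat_interleave(v, groups, dim=1)           # [bsz, heads, kv_len, head_dim]

scores = (q @ k) / head_dim ** 0.5                      # [bsz, heads, q_len, kv_len]
attn = torch.softmax(scores, dim=-1) @ v                # [bsz, heads, q_len, head_dim]
out = attn.transpose(1, 2).reshape(bsz, q_len, num_heads * head_dim)
print(out.shape)                                        # torch.Size([1, 4, 896])
```

The sketch leaves out the causal `attention_mask` addition and the fp32 softmax upcast that the real forward performs before the final `o_proj`.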
- -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json deleted file mode 100755 index bdc572b07..000000000 --- a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 32768, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 32768, - "tie_word_embeddings": true, - "torch_dtype": "bfloat16", - "transformers_version": "4.40.1", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
- - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py deleted file mode 100644 index 595a3e91c..000000000 --- a/transformers/llm/export/llm_models/Qwen2-1_5B-Instruct/modeling_qwen2.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
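For reference, the `Qwen2Config` defined in the configuration file removed above can be reconstructed with the hyperparameters from the deleted `config.json` for Qwen2-1.5B-Instruct. The sketch below does exactly that; the flat `configuration_qwen2` import (rather than the package-relative one) is an assumption made only so it can run as a standalone script.

```python
# Illustrative only: rebuilding the deleted Qwen2-1.5B-Instruct configuration
# in code, with values copied from the removed config.json.
from configuration_qwen2 import Qwen2Config  # assumed flat import of the deleted module

config = Qwen2Config(
    vocab_size=151936,
    hidden_size=1536,
    intermediate_size=8960,
    num_hidden_layers=28,
    num_attention_heads=12,
    num_key_value_heads=2,        # GQA: 12 query heads share 2 key/value heads
    max_position_embeddings=32768,
    rms_norm_eps=1e-6,
    rope_theta=1000000.0,
    use_sliding_window=False,
    sliding_window=32768,
    max_window_layers=21,
    tie_word_embeddings=True,
    bos_token_id=151643,
    eos_token_id=151645,
)
print(config.hidden_size // config.num_attention_heads)  # head_dim = 128
```

With these values the per-head dimension works out to 1536 / 12 = 128, and grouped-query attention repeats each of the 2 key/value heads 6 times to serve the 12 query heads.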
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[0].shape[2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - else: - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=2) - value_states = torch.cat((past_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - past_key_value = torch.stack((key_states, value_states)) - # repeat k/v heads if n_kv_heads < n_heads - # key_states = repeat_kv(key_states, self.num_key_value_groups) - # value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - 
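The eager `Qwen2Attention.forward` deleted above had already been rewritten for export: the stock transformers path (kept in the `'''` block) is replaced by a variant that keeps Q/K/V in `(batch, seq, heads, head_dim)` layout, concatenates the KV cache along the sequence axis (`dim=1`), applies the precomputed `rotary_pos_emb` cos/sin directly, and permutes the key to `(batch, heads, head_dim, seq)` so the score matmul needs no further transpose. The sketch below is not part of the patch; it only illustrates, with made-up shapes, that this export-friendly layout yields the same attention scores as the standard layout.

```python
# Illustrative sketch (not part of this patch): verify that the key permute
# used in the rewritten forward() above matches the standard HF transpose chain.
import math
import torch

bsz, q_len, past_len, num_heads, head_dim = 1, 4, 6, 2, 8
kv_len = past_len + q_len

q = torch.randn(bsz, q_len, num_heads, head_dim)   # (B, S_q, H, D)
k = torch.randn(bsz, kv_len, num_heads, head_dim)  # cache already concatenated on dim=1

# Layout used in the rewritten path: permute K to (B, H, D, S_kv) so that
# torch.matmul(query, key) directly produces (B, H, S_q, S_kv) scores.
scores_export = torch.matmul(
    q.transpose(1, 2),        # (B, H, S_q, D)
    k.permute(0, 2, 3, 1),    # (B, H, D, S_kv)
) / math.sqrt(head_dim)

# Standard transformers layout: both tensors as (B, H, S, D), with the key
# transposed inside the score computation.
scores_ref = torch.matmul(
    q.transpose(1, 2),
    k.transpose(1, 2).transpose(2, 3),
) / math.sqrt(head_dim)

assert torch.allclose(scores_export, scores_ref)
```

Keeping the sequence dimension at axis 1 also means the cache concatenation and `repeat_kv` expansion need no extra reshapes before export, which is presumably why the transposed path was commented out rather than reused.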
- -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/config.json b/transformers/llm/export/llm_models/Qwen2-1_5B/config.json deleted file mode 100755 index 08a0ac476..000000000 --- a/transformers/llm/export/llm_models/Qwen2-1_5B/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151643, - "hidden_act": "silu", - "hidden_size": 1536, - "initializer_range": 0.02, - "intermediate_size": 8960, - "max_position_embeddings": 131072, - "max_window_layers": 21, - "model_type": "qwen2", - "num_attention_heads": 12, - "num_hidden_layers": 28, - "num_key_value_heads": 2, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 131072, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.38.2", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936 -} diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen2-1_5B/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
- - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py deleted file mode 100644 index f8d5b5345..000000000 --- a/transformers/llm/export/llm_models/Qwen2-1_5B/modeling_qwen2.py +++ /dev/null @@ -1,1434 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
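Besides the modeling file, the patch removes the Qwen2-1_5B `config.json` and the local `configuration_qwen2.py` shown above. As a rough illustration only (not part of the patch), the deleted JSON maps onto the `Qwen2Config` constructor as follows; the import assumes a transformers release that ships Qwen2, otherwise the local `configuration_qwen2.Qwen2Config` removed above plays the same role.

```python
# Illustrative sketch only: the deleted Qwen2-1_5B/config.json corresponds to
# overriding the Qwen2Config defaults shown above with these values.
from transformers import Qwen2Config  # or the local configuration_qwen2.Qwen2Config deleted above

config = Qwen2Config(
    vocab_size=151936,
    hidden_size=1536,
    intermediate_size=8960,
    num_hidden_layers=28,
    num_attention_heads=12,
    num_key_value_heads=2,        # GQA: 12 query heads share 2 KV heads
    hidden_act="silu",
    max_position_embeddings=131072,
    rms_norm_eps=1e-6,
    rope_theta=1000000.0,
    use_sliding_window=False,
    sliding_window=131072,
    max_window_layers=21,
    attention_dropout=0.0,
    tie_word_embeddings=False,
    bos_token_id=151643,
    eos_token_id=151643,
)

# 12 attention heads over 2 KV heads gives 6-way key/value sharing.
assert config.num_attention_heads % config.num_key_value_heads == 0
```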
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[0].shape[2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - else: - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=2) - value_states = torch.cat((past_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - past_key_value = torch.stack((key_states, value_states)) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. 
This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json deleted file mode 100755 index eac7cd285..000000000 --- a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "architectures": [ - "Qwen2ForCausalLM" - ], - "auto_map": { - "AutoConfig": "configuration_qwen2.Qwen2Config", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM" - }, - "attention_dropout": 0.0, - "bos_token_id": 151643, - "eos_token_id": 151645, - "hidden_act": "silu", - "hidden_size": 3584, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 32768, - "max_window_layers": 28, - "model_type": "qwen2", - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "rms_norm_eps": 1e-06, - "rope_theta": 1000000.0, - "sliding_window": 32768, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.41.2", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 152064 -} diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py deleted file mode 100644 index b6ca1ed43..000000000 --- a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/configuration_qwen2.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -QWEN2_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "Qwen/Qwen2-7B-beta": "https://huggingface.co/Qwen/Qwen2-7B-beta/resolve/main/config.json", -} - - -class Qwen2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a - Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of - Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. 
- - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 22016): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 32): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - max_window_layers (`int`, *optional*, defaults to 28): - The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. 
- - ```python - >>> from transformers import Qwen2Model, Qwen2Config - - >>> # Initializing a Qwen2 style configuration - >>> configuration = Qwen2Config() - - >>> # Initializing a model from the Qwen2-7B style configuration - >>> model = Qwen2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen2" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.use_sliding_window = use_sliding_window - self.sliding_window = sliding_window - self.max_window_layers = max_window_layers - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.attention_dropout = attention_dropout - - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) diff --git a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py b/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py deleted file mode 100644 index 595a3e91c..000000000 --- a/transformers/llm/export/llm_models/Qwen2-7B-Instruct/modeling_qwen2.py +++ /dev/null @@ -1,1436 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from .configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): - #from flash_attn import flash_attn_func, flash_attn_varlen_func - #from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa - - #_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
-                )
-            # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-            kv_seq_len += past_key_value[0].shape[2]
-        if rotary_pos_emb is None:
-            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-        else:
-            cos, sin = rotary_pos_emb
-            query_states = (query_states * cos) + (rotate_half(query_states) * sin)
-            key_states = (key_states * cos) + (rotate_half(key_states) * sin)
-
-        if past_key_value is not None:
-            past_key, past_value = past_key_value[0], past_key_value[1]
-            key_states = torch.cat((past_key, key_states), dim=2)
-            value_states = torch.cat((past_value, value_states), dim=2)
-            # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-            past_key_value = torch.stack((key_states, value_states))
-        # repeat k/v heads if n_kv_heads < n_heads
-        # key_states = repeat_kv(key_states, self.num_key_value_groups)
-        # value_states = repeat_kv(value_states, self.num_key_value_groups)
-        '''
-        #---------------
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim)
-        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
-        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
-        kv_seq_len = key_states.shape[1]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[1]
-        # rope
-        cos, sin = rotary_pos_emb
-        query_states = (query_states * cos) + (rotate_half(query_states) * sin)
-        key_states = (key_states * cos) + (rotate_half(key_states) * sin)
-        # kv cache
-        if past_key_value is not None:
-            past_key, past_value = past_key_value[0], past_key_value[1]
-            key_states = torch.cat((past_key, key_states), dim=1)
-            value_states = torch.cat((past_value, value_states), dim=1)
-        past_key_value = torch.stack((key_states, value_states))
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.permute([0, 2, 3, 1])
-        value_states = value_states.transpose(1, 2)
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-        #---------------
-        attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim)
-
-        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-
-            attn_weights = attn_weights + attention_mask
-
-        # upcast attention to fp32
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
-        attn_output = torch.matmul(attn_weights, value_states)
-
-        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
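The hunk above carries the one substantive difference this exporter copy had from upstream eager attention: the original transpose-then-RoPE path is parked inside a string literal, the rotary `(cos, sin)` pair comes from the caller as `rotary_pos_emb`, the KV cache is a single `torch.stack`-ed tensor grown along the sequence axis (`dim=1`), and the key is pre-permuted to `[batch, heads, head_dim, kv_len]` so the score matmul needs no further transpose. The sketch below is illustrative only and not part of the patch: the names `export_style_attention` and `repeat_kv_heads` are invented here, and the attention-mask addition and dropout of the real forward are omitted; it reproduces just that tensor layout so the intent of the deleted lines is easier to follow.

```python
import math
import torch


def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def repeat_kv_heads(x, n_rep):
    # Expand the KV-head axis (dim 1) n_rep times; works for both the permuted key
    # and the transposed value because only dim 1 is touched.
    if n_rep == 1:
        return x
    b, kv_heads, a, c = x.shape
    return x[:, :, None, :, :].expand(b, kv_heads, n_rep, a, c).reshape(b, kv_heads * n_rep, a, c)


def export_style_attention(q, k, v, cos, sin, past_kv=None, n_rep=1):
    # q: [b, q_len, n_heads, d]; k, v: [b, q_len, n_kv_heads, d]
    # cos, sin: broadcastable to q and k (e.g. [b, q_len, 1, d])
    # past_kv: stacked cache [2, b, past_len, n_kv_heads, d] or None
    bsz, q_len, n_heads, head_dim = q.shape
    # RoPE is applied while the sequence axis is still dim 1, as in the modified forward
    q = (q * cos) + (rotate_half(q) * sin)
    k = (k * cos) + (rotate_half(k) * sin)
    # KV cache: concatenate along the sequence axis and re-stack into a single tensor
    if past_kv is not None:
        k = torch.cat((past_kv[0], k), dim=1)
        v = torch.cat((past_kv[1], v), dim=1)
    present_kv = torch.stack((k, v))
    # Rearrange for the score matmul: q -> [b, h, q, d], k -> [b, h, d, kv], v -> [b, h, kv, d]
    q = q.transpose(1, 2)
    k = repeat_kv_heads(k.permute(0, 2, 3, 1), n_rep)
    v = repeat_kv_heads(v.transpose(1, 2), n_rep)
    attn = torch.softmax((q @ k) / math.sqrt(head_dim), dim=-1)  # causal mask omitted in this sketch
    out = (attn @ v).transpose(1, 2).reshape(bsz, q_len, n_heads * head_dim)
    return out, present_kv


# Toy check: 4 query heads, 2 KV heads (n_rep=2), identity RoPE
q = torch.randn(1, 5, 4, 8)
k = torch.randn(1, 5, 2, 8)
v = torch.randn(1, 5, 2, 8)
cos, sin = torch.ones(1, 5, 1, 8), torch.zeros(1, 5, 1, 8)
out, cache = export_style_attention(q, k, v, cos, sin, n_rep=2)
print(out.shape, cache.shape)  # torch.Size([1, 5, 32]) torch.Size([2, 1, 5, 2, 8])
```

Keeping the cache as one stacked tensor and taking `(cos, sin)` as explicit inputs is what lets the graph export without a `Cache` object or an internal rotary table lookup, which appears to be the point of this exporter-side rewrite.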
- -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." 
- ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2Attention(config, layer_idx) - - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - rotary_pos_emb: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, 
seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - - hidden_states = inputs_embeds - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = None - - for decoder_layer in self.layers: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache = layer_outputs[2 if output_attentions else 1] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling 
loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json deleted file mode 100755 index 117c9e5d6..000000000 --- a/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/config.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "_name_or_path": "/mnt/petrelfs/libo1.p/alignment-handbook/data/tinyllama-2T-sft-full", - "architectures": [ - "LlamaForCausalLM" - ], - "auto_map": { - "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM" - }, - "attention_bias": false, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 2048, - "initializer_range": 0.02, - "intermediate_size": 5632, - "max_position_embeddings": 2048, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 22, - "num_key_value_heads": 4, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 10000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.35.0", - "use_cache": false, - "vocab_size": 32000 -} diff --git a/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py deleted file mode 100644 index 1b0e9c357..000000000 --- a/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/configuration_llama.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" LLaMA model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class LlamaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the LLaMA-7B. 
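For orientation, the deleted `config.json` above maps one-to-one onto `LlamaConfig` keyword arguments. A minimal sketch, assuming a recent `transformers` install whose stock `LlamaConfig` mirrors the exported `configuration_llama.py`:

```python
# Minimal sketch: the TinyLlama config.json shown above expressed as
# LlamaConfig keyword arguments (stock transformers class assumed).
from transformers import LlamaConfig

tinyllama_cfg = LlamaConfig(
    vocab_size=32000,
    hidden_size=2048,
    intermediate_size=5632,
    num_hidden_layers=22,
    num_attention_heads=32,
    num_key_value_heads=4,   # grouped-query attention: 32 query heads share 4 KV heads
    max_position_embeddings=2048,
    rms_norm_eps=1e-5,
    rope_theta=10000.0,
)
# 8 query heads are served by each KV head in this configuration.
print(tinyllama_cfg.num_attention_heads // tinyllama_cfg.num_key_value_heads)  # 8
```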
- - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`LlamaModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - pretraining_tp (`int`, *optional*, defaults to `1`): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format - is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. 
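Two of the arguments documented above benefit from a concrete illustration: the mean-pooling step recommended when converting a multi-head checkpoint to GQA, and the two-field `rope_scaling` dictionary the validator expects. A small sketch with illustrative shapes and values only:

```python
# Sketch of the GQA conversion mentioned in the num_key_value_heads docs;
# shapes are arbitrary, not taken from any real checkpoint.
import torch

num_heads, num_kv_heads, seq, head_dim = 32, 4, 8, 64
group = num_heads // num_kv_heads  # 8 original heads per KV group

# Pretend these are the original per-head key states of one layer.
k = torch.randn(1, num_heads, seq, head_dim)

# Mean-pool each group of 8 contiguous heads down to a single key head.
k_gqa = k.view(1, num_kv_heads, group, seq, head_dim).mean(dim=2)
print(k_gqa.shape)  # torch.Size([1, 4, 8, 64])

# The rope_scaling field, when used, is a two-entry dict such as:
rope_scaling = {"type": "linear", "factor": 2.0}  # or {"type": "dynamic", "factor": 2.0}
```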
- - Example: - - ```python - >>> from transformers import LlamaModel, LlamaConfig - - >>> # Initializing a LLaMA llama-7b style configuration - >>> configuration = LlamaConfig() - - >>> # Initializing a model from the llama-7b style configuration - >>> model = LlamaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "llama" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_scaling=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") diff --git a/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py b/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py deleted file mode 100644 index 493b040b7..000000000 --- a/transformers/llm/export/llm_models/TinyLlama-1_1B-Chat/modeling_llama.py +++ /dev/null @@ -1,1040 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_llama import LlamaConfig - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. - # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
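The `pretraining_tp > 1` branch in `LlamaMLP.forward` above splits each projection along its output dimension and concatenates the per-slice results; numerically this reproduces the unsliced projection. A quick check with arbitrary sizes:

```python
# Sketch of the sliced matmul used when pretraining_tp > 1; sizes are arbitrary.
import torch
import torch.nn.functional as F

tp, hidden, inter = 2, 8, 12
x = torch.randn(1, 4, hidden)
w = torch.randn(inter, hidden)          # a gate/up projection weight

# Split the output dimension across "tp" slices and concatenate the results.
slices = w.split(inter // tp, dim=0)
y_sliced = torch.cat([F.linear(x, s) for s in slices], dim=-1)

# Numerically this matches the single full-width projection.
y_full = F.linear(x, w)
print(torch.allclose(y_sliced, y_full, atol=1e-6))  # True
```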
The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." - ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in 
range(self.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - else: - cos, sin = rotary_pos_emb - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, 
q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - rotary_pos_emb=rotary_pos_emb, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
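The commented-out block in `LlamaAttention.forward` above is the stock transformers path; the active code instead keeps Q/K/V in a batch-first `[B, L, H, D]` layout, concatenates the cache along the sequence dimension, stacks it into a single tensor, and permutes K so the score matmul needs no extra transpose. A shape walk-through with arbitrary sizes (RoPE and the GQA `repeat_kv` step omitted for brevity):

```python
# Shape walk-through of the reorganized attention path; projections omitted,
# num_kv_heads == num_heads assumed so repeat_kv is a no-op.
import math
import torch

B, L_past, L_new, H, D = 1, 6, 1, 32, 64

q = torch.randn(B, L_new, H, D)        # query_states after view()
k = torch.randn(B, L_new, H, D)        # key_states for the new tokens
v = torch.randn(B, L_new, H, D)

past_k = torch.randn(B, L_past, H, D)  # cache kept along the sequence dim (dim=1)
past_v = torch.randn(B, L_past, H, D)

k = torch.cat((past_k, k), dim=1)      # [B, L_past + L_new, H, D]
v = torch.cat((past_v, v), dim=1)
past_key_value = torch.stack((k, v))   # single cache tensor: [2, B, L, H, D]

scores = torch.matmul(q.transpose(1, 2),        # [B, H, L_new, D]
                      k.permute(0, 2, 3, 1))    # [B, H, D, L_total]
scores = scores / math.sqrt(D)
attn = torch.softmax(scores, dim=-1)
out = torch.matmul(attn, v.transpose(1, 2))     # [B, H, L_new, D]
print(out.shape)                                # torch.Size([1, 32, 1, 64])
```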
- - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. 
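The `past_key_values` behaviour documented in this docstring follows the usual transformers incremental-decoding pattern; a generic sketch (the model path and prompt are placeholders, not taken from this repository):

```python
# Generic incremental-decoding sketch for a transformers causal LM.
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/model")           # placeholder path
model = AutoModelForCausalLM.from_pretrained("path/to/model")  # placeholder path

inputs = tok("An example prompt", return_tensors="pt")
out = model(**inputs, use_cache=True)
past = out.past_key_values             # one (key, value) pair per layer

# Later steps feed only the newly sampled token plus the cache.
next_token = out.logits[:, -1:].argmax(-1)   # greedy pick, shape [batch, 1]
out = model(input_ids=next_token, past_key_values=past, use_cache=True)
```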
- - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: 
Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": 
kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/Yi-6B-Chat/config.json b/transformers/llm/export/llm_models/Yi-6B-Chat/config.json deleted file mode 100755 index aad6b1d39..000000000 --- a/transformers/llm/export/llm_models/Yi-6B-Chat/config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "auto_map": { - "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM" - }, - "attention_bias": false, - "bos_token_id": 1, - "eos_token_id": 2, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 32, - "num_key_value_heads": 4, - "pretraining_tp": 1, - "rms_norm_eps": 1e-05, - "rope_scaling": null, - "rope_theta": 5000000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.35.0", - "use_cache": true, - "vocab_size": 64000 -} diff --git a/transformers/llm/export/llm_models/Yi-6B-Chat/configuration_llama.py b/transformers/llm/export/llm_models/Yi-6B-Chat/configuration_llama.py deleted file mode 100644 index 1b0e9c357..000000000 --- 
a/transformers/llm/export/llm_models/Yi-6B-Chat/configuration_llama.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" LLaMA model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class LlamaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the LLaMA-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`LlamaModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - pretraining_tp (`int`, *optional*, defaults to `1`): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). 
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format - is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - - Example: - - ```python - >>> from transformers import LlamaModel, LlamaConfig - - >>> # Initializing a LLaMA llama-7b style configuration - >>> configuration = LlamaConfig() - - >>> # Initializing a model from the llama-7b style configuration - >>> model = LlamaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "llama" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_scaling=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") diff --git a/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py b/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py deleted file mode 100644 index 493b040b7..000000000 --- a/transformers/llm/export/llm_models/Yi-6B-Chat/modeling_llama.py +++ /dev/null @@ -1,1040 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_llama import LlamaConfig - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. 
- """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
- # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - else: - cos, sin = rotary_pos_emb - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = 
(key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: 
Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - rotary_pos_emb=rotary_pos_emb, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": 
kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/chatglm-6b/modeling_chatglm.py b/transformers/llm/export/llm_models/chatglm-6b/modeling_chatglm.py deleted file mode 100644 index 82effe877..000000000 --- a/transformers/llm/export/llm_models/chatglm-6b/modeling_chatglm.py +++ /dev/null @@ -1,1441 +0,0 @@ -""" PyTorch ChatGLM model. 
""" - -import math -import copy -import os -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any - -from transformers.utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, -) -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - BaseModelOutputWithPastAndCrossAttentions, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B" -_CONFIG_FOR_DOC = "ChatGLM6BConfig" - -CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "THUDM/chatglm-6b", - # See all ChatGLM-6B models at https://huggingface.co/models?filter=chatglm -] - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." 
- ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info(f"Loading TF weight {name} with shape {shape}") - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info(f"Skipping {'/'.join(name)}") - continue - pointer = model - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] == "kernel" or scope_names[0] == "gamma": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "output_weights": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info(f"Skipping {'/'.join(name)}") - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if m_name[-11:] == "_embeddings": - pointer = getattr(pointer, "weight") - elif m_name == "kernel": - array = np.transpose(array) - try: - assert ( - pointer.shape == array.shape - ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info(f"Initialize PyTorch weight {name}") - pointer.data = torch.from_numpy(array) - return model - - -class PrefixEncoder(torch.nn.Module): - """ - The torch.nn model to encode the prefix - Input shape: (batch-size, prefix-length) - Output shape: (batch-size, prefix-length, 2*layers*hidden) - """ - - def __init__(self, config): - super().__init__() - self.prefix_projection = config.prefix_projection - if self.prefix_projection: - # Use a two-layer MLP to encode the prefix - self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size) - self.trans = torch.nn.Sequential( - torch.nn.Linear(config.hidden_size, config.hidden_size), - torch.nn.Tanh(), - torch.nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2) - ) - else: - self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2) - - def forward(self, prefix: torch.Tensor): - if self.prefix_projection: - prefix_tokens = self.embedding(prefix) - past_key_values = self.trans(prefix_tokens) - else: - past_key_values = self.embedding(prefix) - return past_key_values - - -@torch.jit.script -def gelu_impl(x): - """OpenAI's gelu implementation.""" - return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * - (1.0 + 0.044715 * x * x))) - - -def gelu(x): - return gelu_impl(x) - - -class RotaryEmbedding(torch.nn.Module): - def __init__(self, dim, base=10000, precision=torch.half, learnable=False): - super().__init__() - inv_freq = 1. 
/ (base ** (torch.arange(0, dim, 2).float() / dim)) - inv_freq = inv_freq.half() - self.learnable = learnable - if learnable: - self.inv_freq = torch.nn.Parameter(inv_freq) - self.max_seq_len_cached = None - else: - self.register_buffer('inv_freq', inv_freq) - self.max_seq_len_cached = None - self.cos_cached = None - self.sin_cached = None - self.precision = precision - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, - error_msgs): - pass - - def forward(self, x, seq_dim=1, seq_len=None): - if seq_len is None: - seq_len = x.shape[seq_dim] - if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached): - self.max_seq_len_cached = None if self.learnable else seq_len - t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype) - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - if self.precision == torch.bfloat16: - emb = emb.float() - - # [sx, 1 (b * np), hn] - cos_cached = emb.cos()[:, None, :] - sin_cached = emb.sin()[:, None, :] - if self.precision == torch.bfloat16: - cos_cached = cos_cached.bfloat16() - sin_cached = sin_cached.bfloat16() - if self.learnable: - return cos_cached, sin_cached - self.cos_cached, self.sin_cached = cos_cached, sin_cached - return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...] - - def _apply(self, fn): - if self.cos_cached is not None: - self.cos_cached = fn(self.cos_cached) - if self.sin_cached is not None: - self.sin_cached = fn(self.sin_cached) - return super()._apply(fn) - - -def rotate_half(x): - x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions - - -@torch.jit.script -def apply_rotary_pos_emb_index(q, k, cos, sin, position_id): - # position_id: [sq, b], q, k: [sq, b, np, hn], cos: [sq, 1, hn] -> [sq, b, 1, hn] - cos, sin = F.embedding(position_id, torch.squeeze(cos)).unsqueeze(2), \ - F.embedding(position_id, torch.squeeze(sin)).unsqueeze(2) - q, k = (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) - return q, k - - -def attention_fn( - self, - query_layer, - key_layer, - value_layer, - attention_mask, - hidden_size_per_partition, - layer_id, - layer_past=None, - scaling_attention_score=True, - use_cache=False, -): - if layer_past is not None: - past_key, past_value = layer_past[0], layer_past[1] - key_layer = torch.cat((past_key, key_layer), dim=0) - value_layer = torch.cat((past_value, value_layer), dim=0) - - # seqlen, batch, num_attention_heads, hidden_size_per_attention_head - seq_len, b, nh, hidden_size = key_layer.shape - - if use_cache: - present = (key_layer, value_layer) - else: - present = None - - query_key_layer_scaling_coeff = float(layer_id + 1) - if scaling_attention_score: - query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff) - - # =================================== - # Raw attention scores. 
[b, np, s, s] - # =================================== - - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - #query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - #key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - query_layer = query_layer.squeeze(1) - key_layer = key_layer.squeeze(1) - - matmul_result = torch.zeros( - 1, 1, 1, - dtype=query_layer.dtype, - device=query_layer.device, - ) - - matmul_result = torch.baddbmm( - matmul_result, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=1.0, - ) - - # change view to [b, np, sq, sk] - # attention_scores = matmul_result.view(*output_size) - attention_scores = matmul_result.unsqueeze(0) - - if self.scale_mask_softmax: - self.scale_mask_softmax.scale = query_key_layer_scaling_coeff - attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous()) - else: - if not (attention_mask == 0).all(): - # if auto-regressive, skip - attention_scores.masked_fill_(attention_mask, -10000.0) - dtype = attention_scores.dtype - attention_scores = attention_scores.float() - attention_scores = attention_scores * query_key_layer_scaling_coeff - - attention_probs = F.softmax(attention_scores, dim=-1) - - attention_probs = attention_probs.type(dtype) - - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. - # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - - # change view [sk, b * np, hn] - # value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - value_layer = value_layer.squeeze(1) - - # change view [b * np, sq, sk] - # attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - attention_probs = attention_probs.squeeze(0) - - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - - # change view [b, np, sq, hn] - # context_layer = context_layer.view(*output_size) - context_layer = context_layer.unsqueeze(0) - - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,) - # context_layer = context_layer.view(*new_context_layer_shape) - context_layer = context_layer.view([-1, 1, hidden_size_per_partition]) - outputs = (context_layer, present, attention_probs) - - return outputs - - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class SelfAttention(torch.nn.Module): - def __init__(self, hidden_size, num_attention_heads, - layer_id, hidden_size_per_attention_head=None, bias=True, - params_dtype=torch.float, position_encoding_2d=True, empty_init=True): - if empty_init: - init_method = skip_init - else: - init_method = default_init - super(SelfAttention, self).__init__() - - self.layer_id = layer_id - self.hidden_size = hidden_size - self.hidden_size_per_partition = hidden_size - self.num_attention_heads = num_attention_heads - self.num_attention_heads_per_partition = num_attention_heads - self.position_encoding_2d = position_encoding_2d - 
self.rotary_emb = RotaryEmbedding( - self.hidden_size // (self.num_attention_heads * 2) - if position_encoding_2d - else self.hidden_size // self.num_attention_heads, - base=10000, - precision=torch.half, - learnable=False, - ) - - self.scale_mask_softmax = None - - if hidden_size_per_attention_head is None: - self.hidden_size_per_attention_head = hidden_size // num_attention_heads - else: - self.hidden_size_per_attention_head = hidden_size_per_attention_head - - self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head - - # Strided linear layer. - self.query_key_value = init_method( - torch.nn.Linear, - hidden_size, - 3 * self.inner_hidden_size, - bias=bias, - dtype=params_dtype, - ) - - self.dense = init_method( - torch.nn.Linear, - self.inner_hidden_size, - hidden_size, - bias=bias, - dtype=params_dtype, - ) - - @staticmethod - def attention_mask_func(attention_scores, attention_mask): - attention_scores.masked_fill_(attention_mask, -10000.0) - return attention_scores - - def split_tensor_along_last_dim(self, tensor, num_partitions, - contiguous_split_chunks=False): - """Split a tensor along its last dimension. - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. - if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - def forward( - self, - hidden_states: torch.Tensor, - position_ids, - attention_mask: torch.Tensor, - layer_id, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - use_cache: bool = False, - output_attentions: bool = False, - ): - """ - hidden_states: [seq_len, batch, hidden_size] - attention_mask: [(1, 1), seq_len, seq_len] - """ - - # [seq_len, batch, 3 * hidden_size] - mixed_raw_layer = self.query_key_value(hidden_states) - - # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 * hidden_size_per_attention_head] - new_tensor_shape = mixed_raw_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape) - - # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head] - (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim(mixed_raw_layer, 3) - - if self.position_encoding_2d: - q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1)) - k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1)) - cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1) - position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \ - position_ids[:, 1, :].transpose(0, 1).contiguous() - q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids) - q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids) - query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1)) - key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1)) - else: - position_ids = position_ids.transpose(0, 1) - cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1) - # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head] - query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, 
cos, sin, position_ids) - - # [seq_len, batch, hidden_size] - context_layer, present, attention_probs = attention_fn( - self=self, - query_layer=query_layer, - key_layer=key_layer, - value_layer=value_layer, - attention_mask=attention_mask, - hidden_size_per_partition=self.hidden_size_per_partition, - layer_id=layer_id, - layer_past=layer_past, - use_cache=use_cache - ) - - output = self.dense(context_layer) - - outputs = (output, present) - - if output_attentions: - outputs += (attention_probs,) - - return outputs # output, present, attention_probs - - -class GEGLU(torch.nn.Module): - def __init__(self): - super().__init__() - self.activation_fn = F.gelu - - def forward(self, x): - # dim=-1 breaks in jit for pt<1.10 - x1, x2 = x.chunk(2, dim=(x.ndim - 1)) - return x1 * self.activation_fn(x2) - - -class GLU(torch.nn.Module): - def __init__(self, hidden_size, inner_hidden_size=None, - layer_id=None, bias=True, activation_func=gelu, params_dtype=torch.float, empty_init=True): - super(GLU, self).__init__() - if empty_init: - init_method = skip_init - else: - init_method = default_init - self.layer_id = layer_id - self.activation_func = activation_func - - # Project to 4h. - self.hidden_size = hidden_size - if inner_hidden_size is None: - inner_hidden_size = 4 * hidden_size - self.inner_hidden_size = inner_hidden_size - self.dense_h_to_4h = init_method( - torch.nn.Linear, - self.hidden_size, - self.inner_hidden_size, - bias=bias, - dtype=params_dtype, - ) - # Project back to h. - self.dense_4h_to_h = init_method( - torch.nn.Linear, - self.inner_hidden_size, - self.hidden_size, - bias=bias, - dtype=params_dtype, - ) - - def forward(self, hidden_states): - """ - hidden_states: [seq_len, batch, hidden_size] - """ - - # [seq_len, batch, inner_hidden_size] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - - intermediate_parallel = self.activation_func(intermediate_parallel) - - output = self.dense_4h_to_h(intermediate_parallel) - - return output - - -class GLMBlock(torch.nn.Module): - def __init__( - self, - hidden_size, - num_attention_heads, - layernorm_epsilon, - layer_id, - inner_hidden_size=None, - hidden_size_per_attention_head=None, - layernorm=LayerNorm, - use_bias=True, - params_dtype=torch.float, - num_layers=28, - position_encoding_2d=True, - empty_init=True - ): - super(GLMBlock, self).__init__() - # Set output layer initialization if not provided. - - self.layer_id = layer_id - - # Layernorm on the input data. - self.input_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) - - self.position_encoding_2d = position_encoding_2d - - # Self attention. - self.attention = SelfAttention( - hidden_size, - num_attention_heads, - layer_id, - hidden_size_per_attention_head=hidden_size_per_attention_head, - bias=use_bias, - params_dtype=params_dtype, - position_encoding_2d=self.position_encoding_2d, - empty_init=empty_init - ) - - # Layernorm on the input data. 
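
# --- Editor's sketch (not part of the deleted file) -------------------------------
# A minimal, runnable illustration of the QKV shape flow in SelfAttention.forward
# above: the fused query_key_value projection yields [seq_len, batch, 3 * inner_hidden],
# which is reshaped per head and split into query/key/value along the last dimension.
# All sizes below are made up for the example.
import torch

seq_len, batch, num_heads, head_dim = 4, 2, 8, 16
inner_hidden = num_heads * head_dim
mixed = torch.randn(seq_len, batch, 3 * inner_hidden)          # query_key_value output
mixed = mixed.view(seq_len, batch, num_heads, 3 * head_dim)
q, k, v = torch.split(mixed, head_dim, dim=-1)                  # same as split_tensor_along_last_dim(mixed, 3)
assert q.shape == (seq_len, batch, num_heads, head_dim)
# -----------------------------------------------------------------------------------
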
- self.post_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) - - self.num_layers = num_layers - - # GLU - self.mlp = GLU( - hidden_size, - inner_hidden_size=inner_hidden_size, - bias=use_bias, - layer_id=layer_id, - params_dtype=params_dtype, - empty_init=empty_init - ) - - def forward( - self, - hidden_states: torch.Tensor, - position_ids, - attention_mask: torch.Tensor, - layer_id, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - use_cache: bool = False, - output_attentions: bool = False, - ): - """ - hidden_states: [seq_len, batch, hidden_size] - attention_mask: [(1, 1), seq_len, seq_len] - """ - - # Layer norm at the begining of the transformer layer. - # [seq_len, batch, hidden_size] - attention_input = self.input_layernorm(hidden_states) - - # Self attention. - attention_outputs = self.attention( - attention_input, - position_ids, - attention_mask=attention_mask, - layer_id=layer_id, - layer_past=layer_past, - use_cache=use_cache, - output_attentions=output_attentions - ) - - attention_output = attention_outputs[0] - - outputs = attention_outputs[1:] - - # Residual connection. - alpha = (2 * self.num_layers) ** 0.5 - hidden_states = attention_input * alpha + attention_output - - mlp_input = self.post_attention_layernorm(hidden_states) - - # MLP. - mlp_output = self.mlp(mlp_input) - - # Second residual connection. - output = mlp_input * alpha + mlp_output - - if use_cache: - outputs = (output,) + outputs - else: - outputs = (output,) + outputs[1:] - - return outputs # hidden_states, present, attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. - """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def __init__(self, *inputs, **kwargs): - super().__init__(*inputs, **kwargs) - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, device): - batch_size, seq_length = input_ids.shape - context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids] - attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device) - attention_mask.tril_() - for i, context_length in enumerate(context_lengths): - attention_mask[i, :, :context_length] = 1 - attention_mask.unsqueeze_(1) - attention_mask = (attention_mask < 0.5).bool() - - return attention_mask - - def get_position_ids(self, input_ids, mask_positions, device, use_gmasks=None): - batch_size, seq_length = input_ids.shape - if use_gmasks is None: - use_gmasks = [False] * batch_size - context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids] - if self.position_encoding_2d: - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - for i, context_length in enumerate(context_lengths): - position_ids[i, context_length:] = mask_positions[i] - block_position_ids = [torch.cat(( - torch.zeros(context_length, dtype=torch.long, device=device), - torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1 - )) for context_length in context_lengths] - block_position_ids = torch.stack(block_position_ids, dim=0) - position_ids = torch.stack((position_ids, block_position_ids), dim=1) - else: - position_ids = torch.arange(seq_length, dtype=torch.long, 
device=device).unsqueeze(0).repeat(batch_size, 1) - for i, context_length in enumerate(context_lengths): - if not use_gmasks[i]: - position_ids[i, context_length:] = mask_positions[i] - - return position_ids - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, ChatGLMModel): - module.gradient_checkpointing = value - - -CHATGLM_6B_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general - usage and behavior. - - Parameters: - config ([`~ChatGLM6BConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -CHATGLM_6B_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using [`ChatGLM6BTokenizer`]. - See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: - - - 0 corresponds to a *sentence A* token, - - 1 corresponds to a *sentence B* token. - - [What are token type IDs?](../glossary#token-type-ids) - position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range `[0, config.max_position_embeddings - 1]`. - - [What are position IDs?](../glossary#position-ids) - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert *input_ids* indices into associated vectors - than the model's internal embedding lookup matrix. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
-""" - - -@add_start_docstrings( - "The bare ChatGLM-6B Model transformer outputting raw hidden-states without any specific head on top.", - CHATGLM_6B_START_DOCSTRING, -) -class ChatGLMModel(ChatGLMPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in [Attention is - all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - `is_decoder` argument of the configuration set to `True`. - To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` - argument and `add_cross_attention` set to `True`; an - `encoder_hidden_states` is then expected as an input to the forward pass. - """ - - def __init__(self, config: ChatGLMConfig, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - # recording parameters - self.max_sequence_length = config.max_sequence_length - self.hidden_size = config.hidden_size - self.params_dtype = torch.half - self.num_attention_heads = config.num_attention_heads - self.vocab_size = config.vocab_size - self.num_layers = config.num_layers - self.layernorm_epsilon = config.layernorm_epsilon - self.inner_hidden_size = config.inner_hidden_size - self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads - self.position_encoding_2d = config.position_encoding_2d - self.pre_seq_len = config.pre_seq_len - self.prefix_projection = config.prefix_projection - - self.word_embeddings = init_method( - torch.nn.Embedding, - num_embeddings=self.vocab_size, embedding_dim=self.hidden_size, - dtype=self.params_dtype - ) - self.gradient_checkpointing = False - - def get_layer(layer_id): - return GLMBlock( - self.hidden_size, - self.num_attention_heads, - self.layernorm_epsilon, - layer_id, - inner_hidden_size=self.inner_hidden_size, - hidden_size_per_attention_head=self.hidden_size_per_attention_head, - layernorm=LayerNorm, - use_bias=True, - params_dtype=self.params_dtype, - position_encoding_2d=self.position_encoding_2d, - empty_init=empty_init - ) - - self.layers = torch.nn.ModuleList( - [get_layer(layer_id) for layer_id in range(self.num_layers)] - ) - - # Final layer norm before output. 
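
# --- Editor's sketch (not part of the deleted file) -------------------------------
# Toy illustration of the 2D position ids built by get_position_ids above when
# position_encoding_2d is enabled: positions after the context all repeat the mask
# position, while a second row counts 1, 2, ... inside the generated block.
# seq_length, context_length and mask_position are made-up values.
import torch

seq_length, context_length, mask_position = 8, 5, 3
position_ids = torch.arange(seq_length)
position_ids[context_length:] = mask_position
block_position_ids = torch.cat((
    torch.zeros(context_length, dtype=torch.long),
    torch.arange(seq_length - context_length) + 1,
))
two_d = torch.stack((position_ids, block_position_ids))   # the model stacks this per batch item
print(two_d)
# tensor([[0, 1, 2, 3, 4, 3, 3, 3],
#         [0, 0, 0, 0, 0, 1, 2, 3]])
# -----------------------------------------------------------------------------------
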
- self.final_layernorm = LayerNorm(self.hidden_size, eps=self.layernorm_epsilon) - - if self.pre_seq_len is not None: - for param in self.parameters(): - param.requires_grad = False - self.prefix_tokens = torch.arange(self.pre_seq_len).long() - self.prefix_encoder = PrefixEncoder(config) - self.dropout = torch.nn.Dropout(0.1) - - # total_params = sum(p.numel() for p in self.parameters()) - # trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad) - # print("Using p-tuning v2: # trainable_params = {} / {}".format(trainable_params, total_params)) - - def get_input_embeddings(self): - return self.word_embeddings - - def set_input_embeddings(self, new_embeddings: torch.Tensor): - self.word_embeddings = new_embeddings - - def get_prompt(self, batch_size, device, dtype=torch.half): - prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) - past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) - past_key_values = past_key_values.view( - batch_size, - self.pre_seq_len, - self.num_layers * 2, - self.num_attention_heads, - self.hidden_size // self.num_attention_heads - ) - # seq_len, b, nh, hidden_size - past_key_values = self.dropout(past_key_values) - past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) - # past_key_values = [(v[0], v[1]) for v in past_key_values] - return past_key_values - - @add_start_docstrings_to_model_forward(CHATGLM_6B_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPastAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPast]: - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - - if past_key_values is None: - if self.pre_seq_len is not None: - past_key_values = self.get_prompt(batch_size=input_ids.shape[0], device=input_ids.device, - dtype=inputs_embeds.dtype) - else: - past_key_values = tuple([None] * len(self.layers)) - - if attention_mask is None: - attention_mask = self.get_masks( - input_ids, - device=input_ids.device - ) - - - if position_ids is None: - MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id - seqs = input_ids.tolist() - - mask_positions, use_gmasks = [], [] - for seq in seqs: - mask_token = gMASK if gMASK in seq else MASK - use_gmask = mask_token == gMASK - mask_positions.append(seq.index(mask_token)) - use_gmasks.append(use_gmask) - - position_ids = self.get_position_ids( - input_ids, - mask_positions=mask_positions, - device=input_ids.device, - use_gmasks=use_gmasks - ) - - if self.pre_seq_len is not None and attention_mask is not None: - prefix_attention_mask = torch.ones(batch_size, 1, input_ids.size(-1), self.pre_seq_len).to( - attention_mask.device) - prefix_attention_mask = (prefix_attention_mask < 0.5).bool() - attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=3) - - # [seq_len, batch, hidden_size] - hidden_states = inputs_embeds.transpose(0, 1) - - presents = () if use_cache else None - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - if attention_mask is None: - attention_mask = torch.zeros(1, 1, device=input_ids.device).bool() - else: - attention_mask = attention_mask.to(hidden_states.device) - - for i, layer in enumerate(self.layers): - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - layer_past = past_key_values[i] - - if self.gradient_checkpointing and self.training: - layer_ret = torch.utils.checkpoint.checkpoint( - layer, - hidden_states, - position_ids, - attention_mask, - torch.tensor(i), - layer_past, - use_cache, - output_attentions - ) - else: - layer_ret = layer( - hidden_states, - position_ids=position_ids, - attention_mask=attention_mask, - layer_id=torch.tensor(i), - layer_past=layer_past, - use_cache=use_cache, - output_attentions=output_attentions - ) - - hidden_states = layer_ret[0] - - if use_cache: - presents = presents + (layer_ret[1],) - - if output_attentions: - all_self_attentions = all_self_attentions + (layer_ret[2 if use_cache else 1],) - - # Final layer norm. 
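
# --- Editor's sketch (not part of the deleted file) -------------------------------
# Shape walk-through of get_prompt above (p-tuning v2): the prefix encoder output is
# reshaped into num_layers key/value pairs laid out as
# [2, pre_seq_len, batch, n_head, head_dim]. All sizes are made up.
import torch

batch, pre_seq_len, num_layers, n_head, head_dim = 2, 4, 3, 8, 16
flat = torch.randn(batch, pre_seq_len, num_layers * 2 * n_head * head_dim)
pkv = flat.view(batch, pre_seq_len, num_layers * 2, n_head, head_dim)
pkv = pkv.permute([2, 1, 0, 3, 4]).split(2)     # split along the layer axis into (key, value) pairs
assert len(pkv) == num_layers
assert pkv[0].shape == (2, pre_seq_len, batch, n_head, head_dim)
# -----------------------------------------------------------------------------------
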
- hidden_states = self.final_layernorm(hidden_states) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - - # self.hidden_size = config.hidden_size - # self.params_dtype = torch.half - # self.vocab_size = config.vocab_size - self.max_sequence_length = config.max_sequence_length - - self.position_encoding_2d = config.position_encoding_2d - - self.transformer = ChatGLMModel(config, empty_init=empty_init) - - self.lm_head = init_method( - nn.Linear, - config.hidden_size, - config.vocab_size, - bias=False, - dtype=torch.half - ) - - self.config = config - - self.quantized = False - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - if attention_mask is not None and attention_mask.dtype == torch.bool: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((*attention_mask.shape[:3], 1))], dim=3) - new_attention_mask = attention_mask[:, :, -1:].clone() - new_attention_mask[..., -1] = False - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, new_attention_mask], dim=2 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id[:, 1, :] += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past: Optional[torch.Tensor] = None, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - **kwargs - ) -> dict: - batch_size, seq_length = input_ids.shape - MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id - seqs = input_ids.tolist() - mask_positions, use_gmasks = [], [] - for seq in seqs: - mask_token = gMASK if gMASK in seq else MASK - use_gmask = mask_token == gMASK - mask_positions.append(seq.index(mask_token)) - use_gmasks.append(use_gmask) - - # only last token for input_ids if past is not None - if past is not None or past_key_values is not None: - last_token = input_ids[:, -1].unsqueeze(-1) - if attention_mask is not None and attention_mask.dtype == torch.bool: - attention_mask = attention_mask[:, :, -1:] - else: - attention_mask = None - if position_ids is 
not None: - position_ids = position_ids[..., -1:] - else: - context_lengths = [seq.index(self.config.bos_token_id) for seq in seqs] - if self.position_encoding_2d: - position_ids = torch.tensor( - [[mask_position, seq_length - context_length] for mask_position, context_length in - zip(mask_positions, context_lengths)], dtype=torch.long, device=input_ids.device).unsqueeze(-1) - else: - position_ids = torch.tensor([mask_position for mask_position in mask_positions], dtype=torch.long, - device=input_ids.device).unsqueeze(-1) - - if past is None: - past = past_key_values - return { - "input_ids": last_token, - "past_key_values": past, - "position_ids": position_ids, - "attention_mask": attention_mask - } - else: - if attention_mask is not None and attention_mask.dtype != torch.bool: - logger.warning_once(f"The dtype of attention mask ({attention_mask.dtype}) is not bool") - attention_mask = None - if attention_mask is None: - attention_mask = self.get_masks( - input_ids, - device=input_ids.device - ) - if position_ids is None: - position_ids = self.get_position_ids( - input_ids, - device=input_ids.device, - mask_positions=mask_positions, - use_gmasks=use_gmasks - ) - - return { - "input_ids": input_ids, - "past_key_values": past, - "position_ids": position_ids, - "attention_mask": attention_mask - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - - lm_logits = self.lm_head(hidden_states).permute(1, 0, 2).contiguous() - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - 
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. - """ - return tuple( - ( - layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, response): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) - return response - - @torch.no_grad() - def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1, - do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if not history: - prompt = query - else: - prompt = "" - for i, (old_query, response) in enumerate(history): - prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response) - prompt += "[Round {}]\n问:{}\n答:".format(len(history), query) - inputs = tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(self.device) - outputs = self.generate(**inputs, **gen_kwargs) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):] - response = tokenizer.decode(outputs) - response = self.process_response(response) - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, - do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if not history: - prompt = query - else: - prompt = "" - for i, (old_query, response) in enumerate(history): - prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response) - prompt += "[Round {}]\n问:{}\n答:".format(len(history), query) - inputs = tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(self.device) - for outputs in self.stream_generate(**inputs, **gen_kwargs): - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):] - response = tokenizer.decode(outputs) - response = self.process_response(response) - new_history = history + [(query, response)] - yield response, new_history - - @torch.no_grad() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if 
generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long()) - - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - yield input_ids - - def quantize(self, bits: int, empty_init=False, **kwargs): - if bits == 0: - return - - from .quantization import quantize - - if self.quantized: - logger.info("Already quantized.") - return self - - self.quantized = True - - self.config.quantization_bit = bits - - self.transformer = quantize(self.transformer, bits, empty_init=empty_init, **kwargs) - return self diff --git a/transformers/llm/export/llm_models/chatglm2-6b/modeling_chatglm.py b/transformers/llm/export/llm_models/chatglm2-6b/modeling_chatglm.py deleted file mode 100644 index e9b5ca258..000000000 --- a/transformers/llm/export/llm_models/chatglm2-6b/modeling_chatglm.py +++ /dev/null @@ -1,1193 +0,0 @@ -""" PyTorch ChatGLM model. 
""" - -import math -import copy -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any - -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM2-6B" -_CONFIG_FOR_DOC = "ChatGLM6BConfig" - -CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "THUDM/chatglm2-6b", - # See all ChatGLM models at https://huggingface.co/models?filter=chatglm -] - - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -class PrefixEncoder(torch.nn.Module): - """ - The torch.nn model to encode the prefix - Input shape: (batch-size, prefix-length) - Output shape: (batch-size, prefix-length, 2*layers*hidden) - """ - - def __init__(self, config: ChatGLMConfig): - super().__init__() - self.prefix_projection = config.prefix_projection - if self.prefix_projection: - # Use a two-layer MLP to encode the prefix - kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 - self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) - self.trans = torch.nn.Sequential( - torch.nn.Linear(kv_size, config.hidden_size), - torch.nn.Tanh(), - torch.nn.Linear(config.hidden_size, kv_size) - ) - else: - self.embedding = torch.nn.Embedding(config.pre_seq_len, - config.num_layers * config.kv_channels * config.multi_query_group_num * 2) - - def forward(self, prefix: torch.Tensor): - if self.prefix_projection: - prefix_tokens = self.embedding(prefix) - past_key_values = self.trans(prefix_tokens) - else: - past_key_values = self.embedding(prefix) - return past_key_values - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, -) -> List[torch.Tensor]: - """Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. 
- if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class RotaryEmbedding(nn.Module): - def __init__(self, dim, original_impl=False, device=None, dtype=None): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) - self.register_buffer("inv_freq", inv_freq) - self.dim = dim - self.original_impl = original_impl - - def forward_impl( - self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 - ): - """Enhanced Transformer with Rotary Position Embedding. - - Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ - transformers/rope/__init__.py. MIT License: - https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. - """ - # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem)) - - # Create position indexes `[0, 1, ..., seq_len - 1]` - seq_idx = torch.arange(seq_len, dtype=dtype, device=device) - - # Calculate the product of position index and $\theta_i$ - idx_theta = torch.outer(seq_idx, theta).float() - - cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() - return cache - - def forward(self, max_seq_len, offset=0): - return self.forward_impl( - max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device - ) - - -@torch.jit.script -def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: - # x: [sq, b, np, hn] - sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) - rot_dim = rope_cache.shape[-2] * 2 - x, x_pass = x[..., :rot_dim], x[..., rot_dim:] - # truncate to support variable sizes - rope_cache = rope_cache[:sq] - xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) - rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) - x_out2 = torch.stack( - [ - xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], - xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], - ], - -1, - ) - x_out2 = x_out2.flatten(3) - return torch.cat((x_out2, x_pass), dim=-1) - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): - super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) - self.eps = eps - - def forward(self, hidden_states: torch.Tensor): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.eps) - - return (self.weight * hidden_states).to(input_dtype) - - -class CoreAttention(torch.nn.Module): - def __init__(self, config: ChatGLMConfig, layer_number): - super(CoreAttention, self).__init__() - - self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - self.layer_number = max(1, layer_number) - - projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per 
partition values. - self.hidden_size_per_partition = projection_size - self.hidden_size_per_attention_head = projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff - self.coeff = coeff - - self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - pytorch_major_version = int(torch.__version__.split('.')[0]) - if pytorch_major_version >= 2 and False: - query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] - if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - is_causal=True) - else: - if attention_mask is not None: - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - attention_mask) - context_layer = context_layer.permute(2, 0, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - else: - # Raw attention scores - - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, - device=query_layer.device - ) - - # Raw attention scores. [b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.attention_softmax_in_fp32: - attention_scores = attention_scores.float() - if self.coeff is not None: - attention_scores = attention_scores * self.coeff - if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: - attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], - device=attention_scores.device, dtype=torch.bool) - attention_mask.tril_() - attention_mask = ~attention_mask - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.type_as(value_layer) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. 
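
# --- Editor's sketch (not part of the deleted file) -------------------------------
# The manual path above (baddbmm with alpha = 1/norm_factor, mask fill, softmax, then
# the bmm with the values below) matches torch.nn.functional.scaled_dot_product_attention
# when dropout is off. A tiny self-contained check with made-up sizes; requires
# PyTorch >= 2.0 for scaled_dot_product_attention.
import math
import torch
import torch.nn.functional as F

b_np, sq, sk, hn = 3, 4, 4, 8                     # (batch * heads), q len, k len, head dim
q, k, v = (torch.randn(b_np, n, hn) for n in (sq, sk, sk))
mask = torch.triu(torch.ones(sq, sk, dtype=torch.bool), diagonal=1)   # True == masked, as above

scores = torch.baddbmm(torch.empty(b_np, sq, sk), q, k.transpose(1, 2),
                       beta=0.0, alpha=1.0 / math.sqrt(hn))           # beta=0 ignores the empty buffer
manual = torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1) @ v

sdpa = F.scaled_dot_product_attention(q, k, v, attn_mask=~mask)       # True == keep, hence the inversion
assert torch.allclose(manual, sdpa, atol=1e-5)
# -----------------------------------------------------------------------------------
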
- # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -class SelfAttention(torch.nn.Module): - """Parallel self-attention layer abstract class. - - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(SelfAttention, self).__init__() - self.layer_number = max(1, layer_number) - - self.projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. - self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - self.multi_query_attention = config.multi_query_attention - self.qkv_hidden_size = 3 * self.projection_size - if self.multi_query_attention: - self.num_multi_query_groups_per_partition = config.multi_query_group_num - self.qkv_hidden_size = ( - self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num - ) - self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, - bias=config.add_bias_linear or config.add_qkv_bias, - device=device, **_config_to_kwargs(config) - ) - - self.core_attention = CoreAttention(config, self.layer_number) - - # Output. - self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, - device=device, **_config_to_kwargs(config) - ) - - def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): - if self.multi_query_attention: - num_attention_heads = self.num_multi_query_groups_per_partition - else: - num_attention_heads = self.num_attention_heads_per_partition - return torch.empty( - inference_max_sequence_len, - batch_size, - num_attention_heads, - self.hidden_size_per_attention_head, - dtype=dtype, - device=device, - ) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True - ): - # hidden_states: [sq, b, h] - - # ================================================= - # Pre-allocate memory for key-values for inference. 
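
# --- Editor's sketch (not part of the deleted file) -------------------------------
# Multi-query attention sizing as configured above: queries keep one head per
# attention head, while keys and values share multi_query_group_num heads, so the
# fused projection is projection_size + 2 * head_dim * multi_query_group_num wide
# and is split as in the forward pass that follows. Config numbers are made up.
import torch

n_head, n_kv_group, head_dim = 8, 2, 8
projection_size = n_head * head_dim                              # 64
qkv_hidden_size = projection_size + 2 * head_dim * n_kv_group    # 64 + 32 = 96
qkv = torch.randn(5, 1, qkv_hidden_size)                         # [seq, batch, qkv_hidden_size]
q, k, v = qkv.split([n_head * head_dim,
                     n_kv_group * head_dim,
                     n_kv_group * head_dim], dim=-1)
assert (q.shape[-1], k.shape[-1], v.shape[-1]) == (64, 16, 16)
# -----------------------------------------------------------------------------------
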
- # ================================================= - # ===================== - # Query, Key, and Value - # ===================== - - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer = self.query_key_value(hidden_states) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - query_layer = query_layer.view( - query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - else: - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) - - # apply relative positional encoding (rotary embedding) - if rotary_pos_emb is not None: - query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) - - # adjust key and value for inference - if kv_cache is not None: - cache_k, cache_v = kv_cache - key_layer = torch.cat((cache_k, key_layer), dim=0) - value_layer = torch.cat((cache_v, value_layer), dim=0) - if use_cache: - kv_cache = (key_layer, value_layer) - else: - kv_cache = None - - if self.multi_query_attention: - key_layer = key_layer.unsqueeze(-2) - key_layer = key_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - key_layer = key_layer.contiguous().view( - key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.unsqueeze(-2) - value_layer = value_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - value_layer = value_layer.contiguous().view( - value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - - # ================================== - # core attention computation - # ================================== - - context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) - - # ================= - # Output. [sq, b, h] - # ================= - - output = self.dense(context_layer) - - return output, kv_cache - - -def _config_to_kwargs(args): - common_kwargs = { - "dtype": args.torch_dtype, - } - return common_kwargs - - -class MLP(torch.nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__(self, config: ChatGLMConfig, device=None): - super(MLP, self).__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - self.dense_h_to_4h = nn.Linear( - config.hidden_size, - config.ffn_hidden_size * 2, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def swiglu(x): - x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] - - self.activation_func = swiglu - - # Project back to h. - self.dense_4h_to_h = nn.Linear( - config.ffn_hidden_size, - config.hidden_size, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(torch.nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(GLMBlock, self).__init__() - self.layer_number = layer_number - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - - self.fp32_residual_connection = config.fp32_residual_connection - - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # Self attention. - self.self_attention = SelfAttention(config, layer_number, device=device) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # MLP - self.mlp = MLP(config, device=device) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output, kv_cache = self.self_attention( - layernorm_output, - attention_mask, - rotary_pos_emb, - kv_cache=kv_cache, - use_cache=use_cache - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + layernorm_input - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # MLP. - mlp_output = self.mlp(layernorm_output) - - # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) - output = residual + output - - return output, kv_cache - - -class GLMTransformer(torch.nn.Module): - """Transformer class.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(GLMTransformer, self).__init__() - - self.fp32_residual_connection = config.fp32_residual_connection - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. 
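
# --- Editor's sketch (not part of the deleted file) -------------------------------
# Minimal illustration of the swiglu activation defined in MLP above: dense_h_to_4h
# doubles its output width so it can be chunked into a gate half and a value half,
# and the MLP keeps silu(gate) * value before the down-projection. Sizes are made up.
import torch
import torch.nn.functional as F

ffn_hidden_size = 6
x = torch.randn(2, 1, 2 * ffn_hidden_size)     # [s, b, 2 * ffn], as produced by dense_h_to_4h
gate, value = torch.chunk(x, 2, dim=-1)
out = F.silu(gate) * value                     # [s, b, ffn], fed into dense_4h_to_h
assert out.shape[-1] == ffn_hidden_size
# -----------------------------------------------------------------------------------
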
- def build_layer(layer_number): - return GLMBlock(config, layer_number, device=device) - - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) - - if self.post_layer_norm: - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - self.gradient_checkpointing = False - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, - use_cache: Optional[bool] = True, - output_hidden_states: Optional[bool] = False, - ): - if not kv_caches: - kv_caches = [None for _ in range(self.num_layers)] - presents = () if use_cache else None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - all_self_attentions = None - all_hidden_states = () if output_hidden_states else None - for index in range(self.num_layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer = self._get_layer(index) - if self.gradient_checkpointing and self.training: - layer_ret = torch.utils.checkpoint.checkpoint( - layer, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_caches[index], - use_cache - ) - else: - layer_ret = layer( - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=kv_caches[index], - use_cache=use_cache - ) - hidden_states, kv_cache = layer_ret - if use_cache: - presents = presents + (kv_cache,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states, presents, all_hidden_states, all_self_attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) - full_attention_mask.tril_() - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[0] - if past_length: - full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) - if not past_length and padding_mask is not None: - full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GLMTransformer): - module.gradient_checkpointing = value - - -class Embedding(torch.nn.Module): - """Language model embeddings.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(Embedding, self).__init__() - - self.hidden_size = config.hidden_size - # Word embeddings (parallel). - self.word_embeddings = nn.Embedding( - config.padded_vocab_size, - self.hidden_size, - dtype=config.torch_dtype, - device=device - ) - self.fp32_residual_connection = config.fp32_residual_connection - - def forward(self, input_ids): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - embeddings = words_embeddings - # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. - embeddings = embeddings.transpose(0, 1).contiguous() - # If the input flag for fp32 residual connection is set, convert for float. 
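
# --- Editor's sketch (not part of the deleted file) -------------------------------
# Toy walk-through of get_masks above: a lower-triangular causal mask over the new
# tokens is prepended with all-ones columns for the cached past, then inverted so
# True marks positions that must NOT be attended. batch/seq/past sizes are made up.
import torch

batch, seq_length, past_length = 1, 3, 2
full = torch.ones(batch, seq_length, seq_length)
full.tril_()
full = torch.cat((torch.ones(batch, seq_length, past_length), full), dim=-1)
full = (full < 0.5)                            # True == masked
full.unsqueeze_(1)                             # [b, 1, sq, past + sq]
print(full[0, 0].int())
# tensor([[0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0]], dtype=torch.int32)
# -----------------------------------------------------------------------------------
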
- if self.fp32_residual_connection: - embeddings = embeddings.float() - return embeddings - - -class ChatGLMModel(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - init_kwargs = {} - if device is not None: - init_kwargs["device"] = device - self.embedding = init_method(Embedding, config, **init_kwargs) - self.num_layers = config.num_layers - self.multi_query_group_num = config.multi_query_group_num - self.kv_channels = config.kv_channels - - # Rotary positional embeddings - self.seq_length = config.seq_length - rotary_dim = ( - config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels - ) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, - dtype=config.torch_dtype) - self.encoder = init_method(GLMTransformer, config, **init_kwargs) - self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, - dtype=config.torch_dtype, **init_kwargs) - self.pre_seq_len = config.pre_seq_len - self.prefix_projection = config.prefix_projection - if self.pre_seq_len is not None: - for param in self.parameters(): - param.requires_grad = False - self.prefix_tokens = torch.arange(self.pre_seq_len).long() - self.prefix_encoder = PrefixEncoder(config) - self.dropout = torch.nn.Dropout(0.1) - - def get_input_embeddings(self): - return self.embedding.word_embeddings - - def get_prompt(self, batch_size, device, dtype=torch.half): - prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) - past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) - past_key_values = past_key_values.view( - batch_size, - self.pre_seq_len, - self.num_layers * 2, - self.multi_query_group_num, - self.kv_channels - ) - # seq_len, b, nh, hidden_size - past_key_values = self.dropout(past_key_values) - past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) - return past_key_values - - def forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if self.pre_seq_len is not None: - if past_key_values is None: - past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, - dtype=inputs_embeds.dtype) - if attention_mask is not None: - attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), - attention_mask], dim=-1) - - if full_attention_mask is None: - if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - - # Rotary 
positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() - - # Run encoder. - hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - def quantize(self, weight_bit_width: int): - from .quantization import quantize - quantize(self.encoder, weight_bit_width) - return self - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.max_sequence_length = config.max_length - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - self.config = config - self.quantized = False - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - model_kwargs["is_first_forward"] = False - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - is_first_forward: bool = True, - **kwargs - ) -> dict: - # only last token for input_ids if past is not None - if position_ids is None: - position_ids = self.get_position_ids(input_ids, device=input_ids.device) - if not is_first_forward: - position_ids = position_ids[..., -1:] - input_ids = input_ids[:, -1:] - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "position_ids": position_ids, - "attention_mask": attention_mask, - "return_last_logit": True - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = 
None, - return_dict: Optional[bool] = None, - return_last_logit: Optional[bool] = False, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - if return_last_logit: - hidden_states = hidden_states[-1:] - lm_logits = self.transformer.output_layer(hidden_states) - lm_logits = lm_logits.transpose(0, 1).contiguous() - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. 
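# --- Editor's note: illustrative sketch, not part of the original patch. ---
# The loss in forward() above shifts logits/labels by one position so the
# logit at step t is scored against the token at step t + 1, with -100 labels
# ignored. A self-contained toy check of that recipe:
import torch
from torch.nn import CrossEntropyLoss

lm_logits = torch.randn(2, 5, 11)            # [batch, seq, vocab]
labels = torch.randint(0, 11, (2, 5))        # [batch, seq]
labels[:, :2] = -100                         # e.g. mask out prompt positions

shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss = CrossEntropyLoss(ignore_index=-100)(
    shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
print(loss.item())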
- """ - return tuple( - ( - layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, response): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - return response - - def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None): - prompt = tokenizer.build_prompt(query, history=history) - inputs = tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(self.device) - return inputs - - def build_stream_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None): - if history: - prompt = "\n\n[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) - input_ids = tokenizer.encode(prompt, add_special_tokens=False) - input_ids = input_ids[1:] - inputs = tokenizer.batch_encode_plus([(input_ids, None)], return_tensors="pt", add_special_tokens=False) - else: - prompt = "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) - inputs = tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(self.device) - return inputs - - @torch.inference_mode() - def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 8192, num_beams=1, - do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - inputs = self.build_inputs(tokenizer, query, history=history) - outputs = self.generate(**inputs, **gen_kwargs) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):] - response = tokenizer.decode(outputs) - response = self.process_response(response) - history = history + [(query, response)] - return response, history - - @torch.inference_mode() - def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, past_key_values=None, - max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, - return_past_key_values=False, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if past_key_values is None and not return_past_key_values: - inputs = self.build_inputs(tokenizer, query, history=history) - else: - inputs = self.build_stream_inputs(tokenizer, query, history=history) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[0] - if self.transformer.pre_seq_len is not None: - past_length -= self.transformer.pre_seq_len - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) - inputs['attention_mask'] = attention_mask - for outputs in self.stream_generate(**inputs, past_key_values=past_key_values, - return_past_key_values=return_past_key_values, **gen_kwargs): - if return_past_key_values: - outputs, past_key_values = outputs - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):] - response = 
tokenizer.decode(outputs) - if response and response[-1] != "�": - response = self.process_response(response) - new_history = history + [(query, response)] - if return_past_key_values: - yield response, new_history, past_key_values - else: - yield response, new_history - - @torch.inference_mode() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - return_past_key_values=False, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long()) - if return_past_key_values: - yield input_ids, outputs.past_key_values - else: - yield input_ids - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - - def quantize(self, bits: int, empty_init=False, device=None, **kwargs): - if bits == 0: - return - - from .quantization import quantize - - if self.quantized: - logger.info("Already quantized.") - return self - - self.quantized = True - - self.config.quantization_bit = bits - - self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, - **kwargs) - return self diff --git a/transformers/llm/export/llm_models/chatglm3-6b/modeling_chatglm.py b/transformers/llm/export/llm_models/chatglm3-6b/modeling_chatglm.py deleted file mode 100755 index f887c44ce..000000000 --- a/transformers/llm/export/llm_models/chatglm3-6b/modeling_chatglm.py +++ /dev/null @@ -1,1293 +0,0 @@ -""" PyTorch ChatGLM model. 
""" - -import math -import copy -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any -from copy import deepcopy - -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" -_CONFIG_FOR_DOC = "ChatGLMConfig" - -CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "THUDM/chatglm3-6b", - # See all ChatGLM models at https://huggingface.co/models?filter=chatglm -] - - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -class PrefixEncoder(torch.nn.Module): - """ - The torch.nn model to encode the prefix - Input shape: (batch-size, prefix-length) - Output shape: (batch-size, prefix-length, 2*layers*hidden) - """ - - def __init__(self, config: ChatGLMConfig): - super().__init__() - self.prefix_projection = config.prefix_projection - if self.prefix_projection: - # Use a two-layer MLP to encode the prefix - kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 - self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) - self.trans = torch.nn.Sequential( - torch.nn.Linear(kv_size, config.hidden_size), - torch.nn.Tanh(), - torch.nn.Linear(config.hidden_size, kv_size) - ) - else: - self.embedding = torch.nn.Embedding(config.pre_seq_len, - config.num_layers * config.kv_channels * config.multi_query_group_num * 2) - - def forward(self, prefix: torch.Tensor): - if self.prefix_projection: - prefix_tokens = self.embedding(prefix) - past_key_values = self.trans(prefix_tokens) - else: - past_key_values = self.embedding(prefix) - return past_key_values - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, -) -> List[torch.Tensor]: - """Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. 
- if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class RotaryEmbedding(nn.Module): - def __init__(self, dim, original_impl=False, device=None, dtype=None): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) - self.register_buffer("inv_freq", inv_freq) - self.dim = dim - self.original_impl = original_impl - - def forward_impl( - self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 - ): - """Enhanced Transformer with Rotary Position Embedding. - - Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ - transformers/rope/__init__.py. MIT License: - https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. - """ - # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) - - # Create position indexes `[0, 1, ..., seq_len - 1]` - seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) - - # Calculate the product of position index and $\theta_i$ - idx_theta = torch.outer(seq_idx, theta).float() - - cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() - return cache - - def forward(self, max_seq_len, offset=0): - return self.forward_impl( - max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device - ) - - -@torch.jit.script -def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: - # x: [sq, b, np, hn] - sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) - rot_dim = rope_cache.shape[-2] * 2 - x, x_pass = x[..., :rot_dim], x[..., rot_dim:] - # truncate to support variable sizes - rope_cache = rope_cache[:sq] - xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) - rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) - x_out2 = torch.stack( - [ - xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], - xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], - ], - -1, - ) - x_out2 = x_out2.flatten(3) - return torch.cat((x_out2, x_pass), dim=-1) - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): - super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) - self.eps = eps - - def forward(self, hidden_states: torch.Tensor): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.eps) - - return (self.weight * hidden_states).to(input_dtype) - - -class CoreAttention(torch.nn.Module): - def __init__(self, config: ChatGLMConfig, layer_number): - super(CoreAttention, self).__init__() - - self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - self.layer_number = max(1, layer_number) - - projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and 
per partition values. - self.hidden_size_per_partition = projection_size - self.hidden_size_per_attention_head = projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff - self.coeff = coeff - - self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - pytorch_major_version = int(torch.__version__.split('.')[0]) - if pytorch_major_version >= 2 and False: - query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] - if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - is_causal=True) - else: - if attention_mask is not None: - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - attention_mask) - context_layer = context_layer.permute(2, 0, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - else: - # Raw attention scores - - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, - device=query_layer.device - ) - - # Raw attention scores. [b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.attention_softmax_in_fp32: - attention_scores = attention_scores.float() - if self.coeff is not None: - attention_scores = attention_scores * self.coeff - if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: - attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], - device=attention_scores.device, dtype=torch.bool) - attention_mask.tril_() - attention_mask = ~attention_mask - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.type_as(value_layer) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. 
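# --- Editor's note: illustrative sketch, not part of the original patch. ---
# CoreAttention.forward above computes attention "by hand": scaled scores via
# baddbmm, causal masking with -inf, softmax, then a bmm with the values. The
# same math in a simplified batch-first layout, compared against the fused
# torch.nn.functional.scaled_dot_product_attention kernel:
import math
import torch
import torch.nn.functional as F

b_h, sq, hn = 6, 5, 8                     # batch*heads, sequence length, head dim
q, k, v = (torch.randn(b_h, sq, hn) for _ in range(3))

scores = torch.bmm(q, k.transpose(1, 2)) / math.sqrt(hn)            # [b_h, sq, sq]
future = torch.triu(torch.ones(sq, sq), diagonal=1).bool()          # True = masked
masked = scores.masked_fill(future, float("-inf"))
context = torch.bmm(F.softmax(masked, dim=-1), v)                   # [b_h, sq, hn]

ref = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(context, ref, atol=1e-5))                      # expected: True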
- # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -class SelfAttention(torch.nn.Module): - """Parallel self-attention layer abstract class. - - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(SelfAttention, self).__init__() - self.layer_number = max(1, layer_number) - - self.projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. - self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - self.multi_query_attention = config.multi_query_attention - self.qkv_hidden_size = 3 * self.projection_size - if self.multi_query_attention: - self.num_multi_query_groups_per_partition = config.multi_query_group_num - self.qkv_hidden_size = ( - self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num - ) - self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, - bias=config.add_bias_linear or config.add_qkv_bias, - device=device, **_config_to_kwargs(config) - ) - - self.core_attention = CoreAttention(config, self.layer_number) - - # Output. - self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, - device=device, **_config_to_kwargs(config) - ) - - def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): - if self.multi_query_attention: - num_attention_heads = self.num_multi_query_groups_per_partition - else: - num_attention_heads = self.num_attention_heads_per_partition - return torch.empty( - inference_max_sequence_len, - batch_size, - num_attention_heads, - self.hidden_size_per_attention_head, - dtype=dtype, - device=device, - ) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True - ): - # hidden_states: [sq, b, h] - - # ================================================= - # Pre-allocate memory for key-values for inference. 
- # ================================================= - # ===================== - # Query, Key, and Value - # ===================== - - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer = self.query_key_value(hidden_states) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - query_layer = query_layer.view( - query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - else: - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) - - # apply relative positional encoding (rotary embedding) - if rotary_pos_emb is not None: - query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) - - # adjust key and value for inference - if kv_cache is not None: - cache_k, cache_v = kv_cache - key_layer = torch.cat((cache_k, key_layer), dim=0) - value_layer = torch.cat((cache_v, value_layer), dim=0) - if use_cache: - kv_cache = (key_layer, value_layer) - else: - kv_cache = None - - if self.multi_query_attention: - key_layer = key_layer.unsqueeze(-2) - key_layer = key_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - key_layer = key_layer.contiguous().view( - key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.unsqueeze(-2) - value_layer = value_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - value_layer = value_layer.contiguous().view( - value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - - # ================================== - # core attention computation - # ================================== - - context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) - - # ================= - # Output. [sq, b, h] - # ================= - - output = self.dense(context_layer) - - return output, kv_cache - - -def _config_to_kwargs(args): - common_kwargs = { - "dtype": args.torch_dtype, - } - return common_kwargs - - -class MLP(torch.nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__(self, config: ChatGLMConfig, device=None): - super(MLP, self).__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - self.dense_h_to_4h = nn.Linear( - config.hidden_size, - config.ffn_hidden_size * 2, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def swiglu(x): - x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] - - self.activation_func = swiglu - - # Project back to h. - self.dense_4h_to_h = nn.Linear( - config.ffn_hidden_size, - config.hidden_size, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(torch.nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(GLMBlock, self).__init__() - self.layer_number = layer_number - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - - self.fp32_residual_connection = config.fp32_residual_connection - - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # Self attention. - self.self_attention = SelfAttention(config, layer_number, device=device) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # MLP - self.mlp = MLP(config, device=device) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output, kv_cache = self.self_attention( - layernorm_output, - attention_mask, - rotary_pos_emb, - kv_cache=kv_cache, - use_cache=use_cache - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + layernorm_input - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # MLP. - mlp_output = self.mlp(layernorm_output) - - # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) - output = residual + output - - return output, kv_cache - - -class GLMTransformer(torch.nn.Module): - """Transformer class.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(GLMTransformer, self).__init__() - - self.fp32_residual_connection = config.fp32_residual_connection - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. 
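# --- Editor's note: illustrative sketch, not part of the original patch. ---
# MLP above uses a SwiGLU-style activation: dense_h_to_4h produces a
# double-width tensor, which is chunked in two with one half gating the other
# through SiLU. Minimal numeric illustration:
import torch
import torch.nn.functional as F

def swiglu(x):
    a, b = torch.chunk(x, 2, dim=-1)
    return F.silu(a) * b

x = torch.randn(3, 1, 16)        # e.g. [seq, batch, 2 * ffn_hidden_size]
print(swiglu(x).shape)           # -> torch.Size([3, 1, 8]); half the last dim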
- def build_layer(layer_number): - return GLMBlock(config, layer_number, device=device) - - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) - - if self.post_layer_norm: - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - self.gradient_checkpointing = False - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, - use_cache: Optional[bool] = True, - output_hidden_states: Optional[bool] = False, - ): - if not kv_caches: - kv_caches = [None for _ in range(self.num_layers)] - presents = () if use_cache else None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - all_self_attentions = None - all_hidden_states = () if output_hidden_states else None - for index in range(self.num_layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer = self._get_layer(index) - if self.gradient_checkpointing and self.training: - layer_ret = torch.utils.checkpoint.checkpoint( - layer, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_caches[index], - use_cache - ) - else: - layer_ret = layer( - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=kv_caches[index], - use_cache=use_cache - ) - hidden_states, kv_cache = layer_ret - if use_cache: - presents = presents + (kv_cache,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states, presents, all_hidden_states, all_self_attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) - full_attention_mask.tril_() - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[0] - if past_length: - full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) - if not past_length and padding_mask is not None: - full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GLMTransformer): - module.gradient_checkpointing = value - - -class Embedding(torch.nn.Module): - """Language model embeddings.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(Embedding, self).__init__() - - self.hidden_size = config.hidden_size - # Word embeddings (parallel). - self.word_embeddings = nn.Embedding( - config.padded_vocab_size, - self.hidden_size, - dtype=config.torch_dtype, - device=device - ) - self.fp32_residual_connection = config.fp32_residual_connection - - def forward(self, input_ids): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - embeddings = words_embeddings - # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. - embeddings = embeddings.transpose(0, 1).contiguous() - # If the input flag for fp32 residual connection is set, convert for float. 
- if self.fp32_residual_connection: - embeddings = embeddings.float() - return embeddings - - -class ChatGLMModel(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - init_kwargs = {} - if device is not None: - init_kwargs["device"] = device - self.embedding = init_method(Embedding, config, **init_kwargs) - self.num_layers = config.num_layers - self.multi_query_group_num = config.multi_query_group_num - self.kv_channels = config.kv_channels - - # Rotary positional embeddings - self.seq_length = config.seq_length - rotary_dim = ( - config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels - ) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, - dtype=config.torch_dtype) - self.encoder = init_method(GLMTransformer, config, **init_kwargs) - self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, - dtype=config.torch_dtype, **init_kwargs) - self.pre_seq_len = config.pre_seq_len - self.prefix_projection = config.prefix_projection - if self.pre_seq_len is not None: - for param in self.parameters(): - param.requires_grad = False - self.prefix_tokens = torch.arange(self.pre_seq_len).long() - self.prefix_encoder = PrefixEncoder(config) - self.dropout = torch.nn.Dropout(0.1) - - def get_input_embeddings(self): - return self.embedding.word_embeddings - - def get_prompt(self, batch_size, device, dtype=torch.half): - prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) - past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) - past_key_values = past_key_values.view( - batch_size, - self.pre_seq_len, - self.num_layers * 2, - self.multi_query_group_num, - self.kv_channels - ) - # seq_len, b, nh, hidden_size - past_key_values = self.dropout(past_key_values) - past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) - return past_key_values - - def forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if self.pre_seq_len is not None: - if past_key_values is None: - past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, - dtype=inputs_embeds.dtype) - if attention_mask is not None: - attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), - attention_mask], dim=-1) - - if full_attention_mask is None: - if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - - # Rotary 
positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() - - # Run encoder. - hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - def quantize(self, weight_bit_width: int): - from .quantization import quantize - quantize(self.encoder, weight_bit_width) - return self - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.max_sequence_length = config.max_length - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - self.config = config - self.quantized = False - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - model_kwargs["is_first_forward"] = False - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - is_first_forward: bool = True, - **kwargs - ) -> dict: - # only last token for input_ids if past is not None - if position_ids is None: - position_ids = self.get_position_ids(input_ids, device=input_ids.device) - if not is_first_forward: - if past_key_values is not None: - position_ids = position_ids[..., -1:] - input_ids = input_ids[:, -1:] - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "position_ids": position_ids, - "attention_mask": attention_mask, - "return_last_logit": True, - "use_cache": use_cache - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: 
Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - return_last_logit: Optional[bool] = False, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - if return_last_logit: - hidden_states = hidden_states[-1:] - lm_logits = self.transformer.output_layer(hidden_states) - lm_logits = lm_logits.transpose(0, 1).contiguous() - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. 
- """ - return tuple( - ( - layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, output, history): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content, history - - @torch.inference_mode() - def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user", - max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, - **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - inputs = tokenizer.build_chat_input(query, history=history, role=role) - inputs = inputs.to(self.device) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - history.append({"role": role, "content": query}) - response, history = self.process_response(response, history) - return response, history - - @torch.inference_mode() - def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user", - past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, - logits_processor=None, return_past_key_values=False, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if past_key_values is None: - inputs = tokenizer.build_chat_input(query, history=history, role=role) - else: - inputs = tokenizer.build_chat_input(query, role=role) - inputs = inputs.to(self.device) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[0] - if self.transformer.pre_seq_len is not None: - past_length -= self.transformer.pre_seq_len - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) - inputs['attention_mask'] = attention_mask - history.append({"role": role, "content": query}) - for outputs 
in self.stream_generate(**inputs, past_key_values=past_key_values, - eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, - **gen_kwargs): - if return_past_key_values: - outputs, past_key_values = outputs - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - if response and response[-1] != "�": - response, new_history = self.process_response(response, history) - if return_past_key_values: - yield response, new_history, past_key_values - else: - yield response, new_history - - @torch.inference_mode() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - return_past_key_values=False, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - model_kwargs["use_cache"] = generation_config.use_cache - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - if return_past_key_values: - yield input_ids, outputs.past_key_values - else: - yield input_ids - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - - def quantize(self, bits: int, empty_init=False, device=None, **kwargs): - if bits == 0: - return - - from .quantization import quantize - - if self.quantized: - logger.info("Already quantized.") - return self - - self.quantized = True - - self.config.quantization_bit = bits - - self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, - **kwargs) - return self - - -class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.num_labels = config.num_labels - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - - self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) - if config.classifier_dropout is not None: - self.dropout = nn.Dropout(config.classifier_dropout) - else: - self.dropout = None - self.config = config - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - full_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: 
Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - full_attention_mask=full_attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - pooled_hidden_states = hidden_states[-1] - if self.dropout is not None: - pooled_hidden_states = self.dropout(pooled_hidden_states) - logits = self.classifier_head(pooled_hidden_states) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze().float(), labels.squeeze()) - else: - loss = loss_fct(logits.float(), labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) - - if not return_dict: - output = (logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/codegeex2-6b/modeling_chatglm.py b/transformers/llm/export/llm_models/codegeex2-6b/modeling_chatglm.py deleted file mode 100755 index fdc619f81..000000000 --- a/transformers/llm/export/llm_models/codegeex2-6b/modeling_chatglm.py +++ /dev/null @@ -1,1092 +0,0 @@ -""" PyTorch ChatGLM model. 
""" - -import math -import copy -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any - -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B" -_CONFIG_FOR_DOC = "ChatGLM6BConfig" - -CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "THUDM/chatglm-6b", - # See all ChatGLM-6B models at https://huggingface.co/models?filter=chatglm -] - - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, -) -> List[torch.Tensor]: - """Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. - if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class RotaryEmbedding(nn.Module): - def __init__(self, dim, original_impl=False, device=None, dtype=None): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device, dtype=dtype) / dim)) - self.register_buffer("inv_freq", inv_freq) - self.dim = dim - self.original_impl = original_impl - - def forward_original_impl( - self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 - ): - """Enhanced Transformer with Rotary Position Embedding. - - Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ - transformers/rope/__init__.py. MIT License: - https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
- """ - # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=dtype, device=device) / n_elem)) - - # Create position indexes `[0, 1, ..., seq_len - 1]` - seq_idx = torch.arange(seq_len, dtype=dtype, device=device) - - # Calculate the product of position index and $\theta_i$ - idx_theta = torch.outer(seq_idx, theta).float() - - cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() - return cache - - def forward(self, max_seq_len, offset=0): - if self.original_impl: - return self.forward_original_impl( - max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device - ) - - -@torch.jit.script -def apply_rotary_pos_emb_original(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: - # x: [sq, b, np, hn] - sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) - rot_dim = rope_cache.shape[-2] * 2 - x, x_pass = x[..., :rot_dim], x[..., rot_dim:] - # truncate to support variable sizes - rope_cache = rope_cache[:sq] - xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) - rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) - x_out2 = torch.stack( - [ - xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], - xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], - ], - -1, - ) - x_out2 = x_out2.flatten(3) - return torch.cat((x_out2, x_pass), dim=-1) - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): - super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) - self.eps = eps - - def forward(self, input: torch.Tensor): - norm_x = torch.mean(input * input, dim=-1, keepdim=True) - x_normed = input * torch.rsqrt(norm_x + self.eps) - return self.weight * x_normed - - -class CoreAttention(torch.nn.Module): - def __init__(self, config: ChatGLMConfig, layer_number): - super(CoreAttention, self).__init__() - - self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - self.layer_number = max(1, layer_number) - - projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. 
- self.hidden_size_per_partition = projection_size - self.hidden_size_per_attention_head = projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff - self.coeff = coeff - - self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - pytorch_major_version = int(torch.__version__.split('.')[0]) - if pytorch_major_version >= 2 and False: - query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] - if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - is_causal=True) - else: - if attention_mask is not None: - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - attention_mask) - context_layer = context_layer.permute(2, 0, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - else: - # Raw attention scores - - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, - device=query_layer.device - ) - - # Raw attention scores. [b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.attention_softmax_in_fp32: - attention_scores = attention_scores.float() - if self.coeff is not None: - attention_scores = attention_scores * self.coeff - if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: - attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], - device=attention_scores.device, dtype=torch.bool) - attention_mask.tril_() - attention_mask = ~attention_mask - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.type_as(value_layer) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. 
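The fallback branch of `CoreAttention.forward` spells attention out with explicit `view`/`baddbmm` bookkeeping; functionally it is ordinary scaled-dot-product attention with a causal mask. A compact equivalent in batch-first layout (plain PyTorch sketch, dropout omitted):

```python
import math
import torch
import torch.nn.functional as F

def causal_attention(q, k, v):
    # q, k, v: [batch, heads, seq, head_dim]
    scores = q @ k.transpose(-1, -2) / math.sqrt(q.size(-1))   # [b, h, sq, sk]
    mask = torch.ones(q.size(-2), k.size(-2)).tril().bool()    # lower triangle = visible
    scores = scores.masked_fill(~mask, float("-inf"))          # block attention to future tokens
    return F.softmax(scores, dim=-1) @ v                       # [b, h, sq, head_dim]

q, k, v = (torch.randn(1, 2, 5, 4) for _ in range(3))
print(causal_attention(q, k, v).shape)                         # torch.Size([1, 2, 5, 4])
```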
- # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -class SelfAttention(torch.nn.Module): - """Parallel self-attention layer abstract class. - - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(SelfAttention, self).__init__() - self.layer_number = max(1, layer_number) - - self.projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. - self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - self.multi_query_attention = config.multi_query_attention - self.qkv_hidden_size = 3 * self.projection_size - if self.multi_query_attention: - self.num_multi_query_groups_per_partition = config.multi_query_group_num - self.qkv_hidden_size = ( - self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num - ) - self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, - bias=config.add_bias_linear or config.add_qkv_bias, - device=device, **_config_to_kwargs(config) - ) - - self.core_attention = CoreAttention(config, self.layer_number) - - # Output. - self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, - device=device, **_config_to_kwargs(config) - ) - - self.interleaved_qkv = config.interleaved_qkv - - def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): - if self.multi_query_attention: - num_attention_heads = self.num_multi_query_groups_per_partition - else: - num_attention_heads = self.num_attention_heads_per_partition - return torch.empty( - inference_max_sequence_len, - batch_size, - num_attention_heads, - self.hidden_size_per_attention_head, - dtype=dtype, - device=device, - ) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True - ): - # hidden_states: [sq, b, h] - - # ================================================= - # Pre-allocate memory for key-values for inference. 
- # ================================================= - # ===================== - # Query, Key, and Value - # ===================== - - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer = self.query_key_value(hidden_states) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - query_layer = query_layer.view( - query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - else: - if self.interleaved_qkv: - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) - - if not self.interleaved_qkv: - query_layer = query_layer.view( - query_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ).contiguous() - key_layer = key_layer.view( - key_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ).contiguous() - value_layer = value_layer.view( - value_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ).contiguous() - - # apply relative positional encoding (rotary embedding) - if rotary_pos_emb is not None: - query_layer = apply_rotary_pos_emb_original(query_layer, rotary_pos_emb) - key_layer = apply_rotary_pos_emb_original(key_layer, rotary_pos_emb) - - # adjust key and value for inference - if use_cache: - if kv_cache is not None: - cache_k, cache_v = kv_cache - key_layer = torch.cat((cache_k, key_layer), dim=0) - value_layer = torch.cat((cache_v, value_layer), dim=0) - kv_cache = (key_layer, value_layer) - else: - kv_cache = None - - if self.multi_query_attention: - key_layer = key_layer.unsqueeze(-2) - key_layer = key_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - key_layer = key_layer.contiguous().view( - key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.unsqueeze(-2) - value_layer = value_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - value_layer = value_layer.contiguous().view( - value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - - # ================================== - # core attention computation - # ================================== - - context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) - - # ================= - # Output. 
[sq, b, h] - # ================= - - output = self.dense(context_layer) - - return output, kv_cache - - -def _config_to_kwargs(args): - common_kwargs = { - "dtype": args.torch_dtype, - } - return common_kwargs - - -class MLP(torch.nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__(self, config: ChatGLMConfig, device=None): - super(MLP, self).__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - self.dense_h_to_4h = nn.Linear( - config.hidden_size, - config.ffn_hidden_size * 2, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def swiglu(x): - x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] - - self.activation_func = swiglu - - # Project back to h. - self.dense_4h_to_h = nn.Linear( - config.ffn_hidden_size, - config.hidden_size, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(torch.nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(GLMBlock, self).__init__() - self.layer_number = layer_number - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - - self.fp32_residual_connection = config.fp32_residual_connection - - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # Self attention. - self.self_attention = SelfAttention(config, layer_number, device=device) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # MLP - self.mlp = MLP(config, device=device) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output, kv_cache = self.self_attention( - layernorm_output, - attention_mask, - rotary_pos_emb, - kv_cache=kv_cache, - use_cache=use_cache - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + layernorm_input - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # MLP. - mlp_output = self.mlp(layernorm_output) - - # Second residual connection. 
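The `MLP` above is a SwiGLU feed-forward: one projection to twice the FFN width, split into a gated half and a linear half, then a projection back down. A minimal sketch (plain PyTorch; the weights are random placeholders):

```python
import torch
import torch.nn.functional as F

def swiglu_mlp(x, w_in, w_out):
    a, b = torch.chunk(x @ w_in, 2, dim=-1)   # split the widened projection
    return (F.silu(a) * b) @ w_out            # gate one half with silu, project back

hidden, ffn = 8, 16
x = torch.randn(3, hidden)
w_in = torch.randn(hidden, 2 * ffn)
w_out = torch.randn(ffn, hidden)
print(swiglu_mlp(x, w_in, w_out).shape)       # torch.Size([3, 8])
```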
- if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) - output = residual + output - - return output, kv_cache - - -class GLMTransformer(torch.nn.Module): - """Transformer class.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(GLMTransformer, self).__init__() - - self.fp32_residual_connection = config.fp32_residual_connection - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. - def build_layer(layer_number): - return GLMBlock(config, layer_number, device=device) - - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) - - if self.post_layer_norm: - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, - use_cache: Optional[bool] = True, - output_hidden_states: Optional[bool] = False, - ): - if not kv_caches: - kv_caches = [None for _ in range(self.num_layers)] - presents = () if use_cache else None - all_self_attentions = None - all_hidden_states = () if output_hidden_states else None - for index in range(self.num_layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer = self._get_layer(index) - - hidden_states, kv_cache = layer( - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=kv_caches[index], - use_cache=use_cache - ) - if use_cache: - presents = presents + (kv_cache,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states, presents, all_hidden_states, all_self_attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) - full_attention_mask.tril_() - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[0] - if past_length: - full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) - if not past_length and padding_mask is not None: - full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, ChatGLMModel): - module.gradient_checkpointing = value - - -class Embedding(torch.nn.Module): - """Language model embeddings.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(Embedding, self).__init__() - - self.hidden_size = config.hidden_size - # Word embeddings (parallel). - self.word_embeddings = nn.Embedding( - config.padded_vocab_size, - self.hidden_size, - dtype=config.torch_dtype, - device=device - ) - self.fp32_residual_connection = config.fp32_residual_connection - - def forward(self, input_ids): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - embeddings = words_embeddings - # Data format change to avoid explicit tranposes : [b s h] --> [s b h]. - embeddings = embeddings.transpose(0, 1).contiguous() - # If the input flag for fp32 residual connection is set, convert for float. 
- if self.fp32_residual_connection: - embeddings = embeddings.float() - return embeddings - - -class ChatGLMModel(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - init_kwargs = {} - if device is not None: - init_kwargs["device"] = device - self.embedding = init_method(Embedding, config, **init_kwargs) - - # Rotary positional embeddings - self.seq_length = config.seq_length - rotary_dim = ( - config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels - ) - - if config.rotary_percent < 1.0: - rotary_dim = int(rotary_dim * config.rotary_percent) - - # partial rotary embeddings, which is better than full rotary - # Wang and Komatsuzaki et al - # https://github.com/kingoflolz/mesh-transformer-jax/ - self.rotary_pos_emb = RotaryEmbedding(rotary_dim, original_impl=config.original_rope, device=device, - dtype=config.torch_dtype) - self.encoder = init_method(GLMTransformer, config, **init_kwargs) - self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, - dtype=config.torch_dtype, **init_kwargs) - self.gradient_checkpointing = False - - def forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if full_attention_mask is None and attention_mask is not None and not attention_mask.all(): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - - # Rotary positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() - - # Run encoder. 
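The encoder run below (`GLMTransformer`) is, at its core, a stack of pre-norm residual blocks followed by a final norm. A much-reduced sketch of that control flow (plain PyTorch; `TinyBlock` is an illustrative stand-in using LayerNorm and standard multi-head attention, with rotary embeddings, masks, and kv caching all omitted):

```python
import torch
from torch import nn

class TinyBlock(nn.Module):
    def __init__(self, hidden: int):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden)
        self.norm2 = nn.LayerNorm(hidden)
        self.attn = nn.MultiheadAttention(hidden, num_heads=2, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(), nn.Linear(4 * hidden, hidden))

    def forward(self, x):
        h = self.norm1(x)                                  # pre-norm
        x = x + self.attn(h, h, h, need_weights=False)[0]  # residual around attention
        return x + self.mlp(self.norm2(x))                 # residual around the MLP

hidden = 16
embed = nn.Embedding(100, hidden)
blocks = nn.ModuleList([TinyBlock(hidden) for _ in range(2)])
final_norm = nn.LayerNorm(hidden)

x = embed(torch.randint(0, 100, (1, 6)))                   # [batch, seq, hidden]
for block in blocks:
    x = block(x)
print(final_norm(x).shape)                                 # torch.Size([1, 6, 16])
```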
- hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - def quantize(self, weight_bit_width: int): - from .quantization import quantize - quantize(self.encoder, weight_bit_width) - return self - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.max_sequence_length = config.max_length - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - self.config = config - self.quantized = False - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - input_pos: int = None, - **kwargs - ) -> dict: - # only last token for input_ids if past is not None - if past_key_values is not None: - if position_ids is None: - position_ids = self.get_position_ids(input_ids, device=input_ids.device) - position_ids = position_ids[..., -1:] - input_ids = input_ids[:, -1:] - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "position_ids": position_ids, - "attention_mask": attention_mask - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - 
past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - - lm_logits = self.transformer.output_layer(hidden_states) - lm_logits = lm_logits.transpose(0, 1).contiguous() - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. - """ - return tuple( - ( - layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, response): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - return response - - def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None): - prompt = "" - for i, (old_query, response) in enumerate(history): - prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response) - prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) - inputs = tokenizer([prompt], return_tensors="pt") - inputs = inputs.to(self.device) - return inputs - - @torch.no_grad() - def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1, - do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - inputs = self.build_inputs(tokenizer, query, history=history) - outputs = self.generate(**inputs, **gen_kwargs) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):] - response = tokenizer.decode(outputs) - response = self.process_response(response) - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, - do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs): - if history is None: - history = [] - if logits_processor is None: 
- logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - inputs = self.build_inputs(tokenizer, query, history=history) - for outputs in self.stream_generate(**inputs, **gen_kwargs): - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):] - response = tokenizer.decode(outputs) - response = self.process_response(response) - new_history = history + [(query, response)] - yield response, new_history - - @torch.no_grad() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long()) - - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - yield input_ids - - def quantize(self, bits: int, empty_init=False, device=None, **kwargs): - if bits == 0: - return - - from .quantization import quantize - - if self.quantized: - logger.info("Already quantized.") - return self - - self.quantized = True - - self.config.quantization_bit = bits - - self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, - **kwargs) - return self diff --git a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/config.json b/transformers/llm/export/llm_models/deepseek-llm-7b-chat/config.json deleted file mode 100755 index 67a803b6a..000000000 --- a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/config.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "architectures": [ - "LlamaForCausalLM" - ], - "auto_map": { - "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM" - }, - "bos_token_id": 100000, - "eos_token_id": 100001, - "hidden_act": "silu", - "hidden_size": 4096, - "initializer_range": 0.02, - "intermediate_size": 11008, - "max_position_embeddings": 4096, - "model_type": "llama", - "num_attention_heads": 32, - "num_hidden_layers": 30, - "num_key_value_heads": 32, - "pretraining_tp": 1, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 10000.0, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", - "transformers_version": "4.33.1", - "use_cache": true, - "vocab_size": 102400 -} diff --git a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/configuration_llama.py 
b/transformers/llm/export/llm_models/deepseek-llm-7b-chat/configuration_llama.py deleted file mode 100644 index 1b0e9c357..000000000 --- a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/configuration_llama.py +++ /dev/null @@ -1,174 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" LLaMA model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -class LlamaConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA - model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the LLaMA-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`LlamaModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - pretraining_tp (`int`, *optional*, defaults to `1`): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. 
This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format - is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - - Example: - - ```python - >>> from transformers import LlamaModel, LlamaConfig - - >>> # Initializing a LLaMA llama-7b style configuration - >>> configuration = LlamaConfig() - - >>> # Initializing a model from the llama-7b style configuration - >>> model = LlamaModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "llama" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_scaling=None, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp - self.use_cache = use_cache - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the 
`rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}") diff --git a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py b/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py deleted file mode 100644 index 493b040b7..000000000 --- a/transformers/llm/export/llm_models/deepseek-llm-7b-chat/modeling_llama.py +++ /dev/null @@ -1,1040 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings -from .configuration_llama import LlamaConfig - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. 
- """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -class LlamaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
- # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.pretraining_tp = config.pretraining_tp - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.pretraining_tp > 1: - slice = self.intermediate_size // self.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.pretraining_tp)] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.pretraining_tp = config.pretraining_tp - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.pretraining_tp - query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.pretraining_tp, dim=0) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - ''' - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - else: - cos, sin = rotary_pos_emb - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = 
(key_states, value_states) if use_cache else None - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - ''' - #--------------- - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - #--------------- - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = LlamaAttention(config=config) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: 
Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - rotary_pos_emb=rotary_pos_emb, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LlamaModel): - module.gradient_checkpointing = value - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.pretraining_tp = config.pretraining_tp - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": 
kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py b/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py deleted file mode 100755 index e86f5a2f4..000000000 --- a/transformers/llm/export/llm_models/glm-4-9b-chat/modeling_chatglm.py +++ /dev/null @@ -1,1238 +0,0 @@ -""" PyTorch ChatGLM model. 
""" -import json -import math -import copy -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any -from copy import deepcopy - -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" -_CONFIG_FOR_DOC = "ChatGLMConfig" - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 198] = 5e4 - return scores - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, -) -> List[torch.Tensor]: - """Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. - if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class RotaryEmbedding(nn.Module): - def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) - self.register_buffer("inv_freq", inv_freq) - self.dim = dim - self.original_impl = original_impl - self.rope_ratio = rope_ratio - - def forward_impl( - self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 - ): - """Enhanced Transformer with Rotary Position Embedding. - - Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ - transformers/rope/__init__.py. MIT License: - https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
- """ - # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - base = base * self.rope_ratio - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) - - # Create position indexes `[0, 1, ..., seq_len - 1]` - seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) - - # Calculate the product of position index and $\theta_i$ - idx_theta = torch.outer(seq_idx, theta).float() - - cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() - return cache - - def forward(self, max_seq_len, offset=0): - return self.forward_impl( - max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device - ) - - -@torch.jit.script -def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: - # x: [b, np, sq, hn] - b, np, sq, hn = x.size(0), x.size(1), x.size(2), x.size(3) - rot_dim = rope_cache.shape[-2] * 2 - x, x_pass = x[..., :rot_dim], x[..., rot_dim:] - # truncate to support variable sizes - rope_cache = rope_cache[:, :sq] - xshaped = x.reshape(b, np, sq, rot_dim // 2, 2) - rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2) - x_out2 = torch.stack( - [ - xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], - xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], - ], - -1, - ) - x_out2 = x_out2.flatten(3) - return torch.cat((x_out2, x_pass), dim=-1) - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): - super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) - self.eps = eps - - def forward(self, hidden_states: torch.Tensor): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.eps) - - return (self.weight * hidden_states).to(input_dtype) - - -class CoreAttention(torch.nn.Module): - def __init__(self, config: ChatGLMConfig, layer_number): - super(CoreAttention, self).__init__() - - self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - self.layer_number = max(1, layer_number) - - projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. 
- self.hidden_size_per_partition = projection_size - self.hidden_size_per_attention_head = projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff - self.coeff = coeff - - self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - - def raw_atten(self, query_layer, key_layer, value_layer, attention_mask): - attn_weights = torch.matmul(query_layer, key_layer.transpose(-1, -2)) / self.norm_factor - if attention_mask is None: - seq_len = query_layer.shape[2] - attention_mask = ~torch.tril(torch.ones([1, 1, seq_len, seq_len], device=attn_weights.device).bool()) - attn_weights = attn_weights.masked_fill(attention_mask, float("-inf")) - #mask_value = torch.finfo(attn_weights.dtype).min - #attn_weights = torch.where(attention_mask, attn_weights.to(attn_weights.dtype), mask_value) - attn_weights = nn.functional.softmax(attn_weights, dim=-1) - context_layer = torch.matmul(attn_weights, value_layer) - return context_layer - context_layer = context_layer.transpose(1, 2).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - return context_layer - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - pytorch_major_version = int(torch.__version__.split('.')[0]) - if pytorch_major_version >= 2 and False: - if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - is_causal=True) - else: - if attention_mask is not None: - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - attention_mask) - context_layer = context_layer.transpose(1, 2).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - else: - # Raw attention scores - - # [b, np, sq, sk] - output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2)) - - # [b, np, sq, hn] -> [b * np, sq, hn] - query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1) - # [b, np, sk, hn] -> [b * np, sk, hn] - key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, - device=query_layer.device - ) - - # Raw attention scores. 
[b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer, # [b * np, sq, hn] - key_layer.transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.attention_softmax_in_fp32: - attention_scores = attention_scores.float() - if self.coeff is not None: - attention_scores = attention_scores * self.coeff - if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: - attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], - device=attention_scores.device, dtype=torch.bool) - attention_mask.tril_() - attention_mask = ~attention_mask - - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.type_as(value_layer) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. - # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - # change view [b * np, sk, hn] - #value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1) - # change view [b * np, sq, sk] - #attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - # matmul: [b * np, sq, hn] - # context_layer = torch.bmm(attention_probs, value_layer) - context_layer = torch.matmul(attention_probs, value_layer) - # change view [b, np, sq, hn] - # context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [b, sq, np, hn] - context_layer = context_layer.transpose(1, 2).contiguous() - # [b, sq, np, hn] --> [b, sq, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - - return context_layer - - -class SelfAttention(torch.nn.Module): - """Parallel self-attention layer abstract class. - - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(SelfAttention, self).__init__() - self.layer_number = max(1, layer_number) - - self.projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. 
- self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - self.multi_query_attention = config.multi_query_attention - self.qkv_hidden_size = 3 * self.projection_size - if self.multi_query_attention: - self.num_multi_query_groups_per_partition = config.multi_query_group_num - self.qkv_hidden_size = ( - self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num - ) - self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, - bias=config.add_bias_linear or config.add_qkv_bias, - device=device, **_config_to_kwargs(config) - ) - - self.core_attention = CoreAttention(config, self.layer_number) - - # Output. - self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, - device=device, **_config_to_kwargs(config) - ) - - def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): - if self.multi_query_attention: - num_attention_heads = self.num_multi_query_groups_per_partition - else: - num_attention_heads = self.num_attention_heads_per_partition - return torch.empty( - inference_max_sequence_len, - batch_size, - num_attention_heads, - self.hidden_size_per_attention_head, - dtype=dtype, - device=device, - ) - - def forward(self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True): - # hidden_states: [b, sq, h] - - # ================================================= - # Pre-allocate memory for key-values for inference. - # ================================================= - # ===================== - # Query, Key, and Value - # ===================== - - # Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)] - mixed_x_layer = self.query_key_value(hidden_states) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - query_layer = query_layer.view( - query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - else: - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) - - # [b, sq, np, hn] -> [b, np, sq, hn] - query_layer, key_layer, value_layer = [k.transpose(1, 2) for k in [query_layer, key_layer, value_layer]] - - # apply relative positional encoding (rotary embedding) - if rotary_pos_emb is not None: - query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) - - # adjust key and value for inference - if kv_cache is not None: - cache_k, cache_v = kv_cache - key_layer = torch.cat((cache_k, key_layer), dim=2) - value_layer = torch.cat((cache_v, value_layer), dim=2) - if 
use_cache: - ''' - if kv_cache is None: - kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)), dim=1) - else: - kv_cache = (key_layer, value_layer) - ''' - kv_cache = torch.stack([key_layer, value_layer], axis=0) - # ''' - else: - kv_cache = None - - if self.multi_query_attention: - key_layer = key_layer.unsqueeze(2) - key_layer = key_layer.expand( - -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1 - ) - key_layer = key_layer.contiguous().view( - key_layer.size()[:1] + (self.num_attention_heads_per_partition,) + key_layer.size()[3:] - ) - value_layer = value_layer.unsqueeze(2) - value_layer = value_layer.expand( - -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1 - ) - value_layer = value_layer.contiguous().view( - value_layer.size()[:1] + (self.num_attention_heads_per_partition,) + value_layer.size()[3:] - ) - - # ================================== - # core attention computation - # ================================== - - context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) - - # ================= - # Output. [sq, b, h] - # ================= - - output = self.dense(context_layer) - - return output, kv_cache - - -def _config_to_kwargs(args): - common_kwargs = { - "dtype": args.torch_dtype, - } - return common_kwargs - - -class MLP(torch.nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__(self, config: ChatGLMConfig, device=None): - super(MLP, self).__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - self.dense_h_to_4h = nn.Linear( - config.hidden_size, - config.ffn_hidden_size * 2, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def swiglu(x): - x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] - - self.activation_func = swiglu - - # Project back to h. - self.dense_4h_to_h = nn.Linear( - config.ffn_hidden_size, - config.hidden_size, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(torch.nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(GLMBlock, self).__init__() - self.layer_number = layer_number - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - - self.fp32_residual_connection = config.fp32_residual_connection - - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # Self attention. 
- self.self_attention = SelfAttention(config, layer_number, device=device) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # MLP - self.mlp = MLP(config, device=device) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, - ): - # hidden_states: [s, b, h] - hidden_states = hidden_states.view(1, -1, 4096) - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output, kv_cache = self.self_attention( - layernorm_output, - attention_mask, - rotary_pos_emb, - kv_cache=kv_cache, - use_cache=use_cache - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + layernorm_input - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # MLP. - mlp_output = self.mlp(layernorm_output) - - # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) - output = residual + output - - return output, kv_cache - - -class GLMTransformer(torch.nn.Module): - """Transformer class.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(GLMTransformer, self).__init__() - - self.fp32_residual_connection = config.fp32_residual_connection - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. - def build_layer(layer_number): - return GLMBlock(config, layer_number, device=device) - - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) - - if self.post_layer_norm: - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - self.gradient_checkpointing = False - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, - use_cache: Optional[bool] = True, - output_hidden_states: Optional[bool] = False, - ): - if not kv_caches: - kv_caches = [None for _ in range(self.num_layers)] - presents = () if use_cache else None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - all_self_attentions = None - all_hidden_states = () if output_hidden_states else None - for index in range(self.num_layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer = self._get_layer(index) - if self.gradient_checkpointing and self.training: - layer_ret = torch.utils.checkpoint.checkpoint( - layer, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_caches[index], - use_cache, - use_reentrant=False - ) - else: - layer_ret = layer( - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=kv_caches[index], - use_cache=use_cache - ) - hidden_states, kv_cache = layer_ret - if use_cache: - # token by token decoding, use tuple format - if kv_caches[0] is not None: - presents = presents + (kv_cache,) - # prefilling in decoding, use tensor format to save cuda memory - else: - if len(presents) == 0: - presents = kv_cache - else: - presents = torch.cat((presents, kv_cache), dim=0) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states, presents, all_hidden_states, all_self_attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. - """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) - full_attention_mask.tril_() - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[2] - if past_length: - full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) - if not past_length and padding_mask is not None: - full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): - if not self.supports_gradient_checkpointing: - raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") - - -class Embedding(torch.nn.Module): - """Language model embeddings.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(Embedding, self).__init__() - - self.hidden_size = config.hidden_size - # Word embeddings (parallel). - self.word_embeddings = nn.Embedding( - config.padded_vocab_size, - self.hidden_size, - dtype=config.torch_dtype, - device=device - ) - self.fp32_residual_connection = config.fp32_residual_connection - - def forward(self, input_ids): - # Embeddings. 
- words_embeddings = self.word_embeddings(input_ids) - embeddings = words_embeddings - # If the input flag for fp32 residual connection is set, convert for float. - if self.fp32_residual_connection: - embeddings = embeddings.float() - return embeddings - - -class ChatGLMModel(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - init_kwargs = {} - if device is not None: - init_kwargs["device"] = device - self.embedding = init_method(Embedding, config, **init_kwargs) - self.num_layers = config.num_layers - self.multi_query_group_num = config.multi_query_group_num - self.kv_channels = config.kv_channels - - # Rotary positional embeddings - self.seq_length = config.seq_length - rotary_dim = ( - config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels - ) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio, original_impl=config.original_rope, - device=device, dtype=config.torch_dtype) - self.encoder = init_method(GLMTransformer, config, **init_kwargs) - self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, - dtype=config.torch_dtype, **init_kwargs) - - def get_input_embeddings(self): - return self.embedding.word_embeddings - - def set_input_embeddings(self, value): - self.embedding.word_embeddings = value - - def forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if full_attention_mask is None: - if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - - # Rotary positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - - # Run encoder. 
- hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - if presents is not None and type(presents) is torch.Tensor: - presents = presents.split(1, dim=0) - presents = list(presents) - presents = [list(x.squeeze(0).split(1, dim=0)) for x in presents] - presents = [tuple([x.squeeze(0) for x in y]) for y in presents] - presents = tuple(presents) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.max_sequence_length = config.max_length - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - self.config = config - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - model_kwargs["is_first_forward"] = False - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - is_first_forward: bool = True, - **kwargs - ) -> dict: - # only last token for input_ids if past is not None - if position_ids is None: - position_ids = self.get_position_ids(input_ids, device=input_ids.device) - if not is_first_forward: - if past_key_values is not None: - position_ids = position_ids[..., -1:] - input_ids = input_ids[:, -1:] - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "position_ids": position_ids, - "attention_mask": attention_mask, - "return_last_logit": True, - "use_cache": use_cache - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - return_last_logit: Optional[bool] = False, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - 
return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - if return_last_logit: - hidden_states = hidden_states[:, -1:] - lm_logits = self.transformer.output_layer(hidden_states) - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. 
- """ - return tuple( - ( - layer_past[0].index_select(0, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(0, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, output, history): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - if "\n" in response: - metadata, content = response.split("\n", maxsplit=1) - else: - metadata, content = "", response - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - parameters = json.loads(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content, history - - @torch.inference_mode() - def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, - **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - history.append({"role": role, "content": query}) - inputs = tokenizer.apply_chat_template(history, add_generation_prompt=True, tokenize=True, - return_tensors="pt", return_dict=True) - inputs = inputs.to(self.device) - eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"), - tokenizer.convert_tokens_to_ids("<|observation|>")] - outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - response, history = self.process_response(response, history) - return response, history - - @torch.inference_mode() - def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - past_key_values=None, max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, - logits_processor=None, return_past_key_values=False, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|user|>"), - tokenizer.convert_tokens_to_ids("<|observation|>")] - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if past_key_values is None: - inputs = tokenizer.apply_chat_template(history + [{"role": role, "content": query}], - add_generation_prompt=True, tokenize=True, return_tensors="pt", - return_dict=True) - else: - inputs = tokenizer.apply_chat_template([{"role": role, "content": query}], add_special_tokens=False, - add_generation_prompt=True, tokenize=True, return_tensors="pt", - return_dict=True) - inputs = inputs.to(self.device) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - 
attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) - inputs['attention_mask'] = attention_mask - history.append({"role": role, "content": query}) - for outputs in self.stream_generate(**inputs, past_key_values=past_key_values, - eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, - **gen_kwargs): - if return_past_key_values: - outputs, past_key_values = outputs - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - if response and response[-1] != "�": - response, new_history = self.process_response(response, history) - if return_past_key_values: - yield response, new_history, past_key_values - else: - yield response, new_history - - @torch.inference_mode() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - return_past_key_values=False, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - model_kwargs["use_cache"] = generation_config.use_cache - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - if return_past_key_values: - yield input_ids, outputs.past_key_values - else: - yield input_ids - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - - -class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.num_labels = config.num_labels - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - - self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) - if config.classifier_dropout is not None: - self.dropout = nn.Dropout(config.classifier_dropout) - else: - self.dropout = None - self.config = config - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - full_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - 
full_attention_mask=full_attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - pooled_hidden_states = hidden_states[-1] - if self.dropout is not None: - pooled_hidden_states = self.dropout(pooled_hidden_states) - logits = self.classifier_head(pooled_hidden_states) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze().float(), labels.squeeze()) - else: - loss = loss_fct(logits.float(), labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) - - if not return_dict: - output = (logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py b/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py deleted file mode 100755 index b636e8716..000000000 --- a/transformers/llm/export/llm_models/internlm-chat-7b/modeling_internlm.py +++ /dev/null @@ -1,1046 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
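For orientation before the next deleted file: the ChatGLM and InternLM modeling files removed in this patch follow the same pre-norm decoder-layer pattern — RMSNorm, self-attention with rotary embeddings and a key/value cache, a residual add, a second RMSNorm, a gated (SwiGLU-style) MLP, and a second residual add. The sketch below is only a minimal, self-contained illustration of that pattern (rotary embeddings omitted for brevity, PyTorch >= 2.0 assumed); the class and parameter names are invented for the example and do not come from the patch.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyRMSNorm(nn.Module):
    # RMS normalization with a learned per-channel scale, as used by both deleted models.
    def __init__(self, hidden, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden))
        self.eps = eps

    def forward(self, x):
        variance = x.float().pow(2).mean(-1, keepdim=True)
        return self.weight * (x * torch.rsqrt(variance + self.eps)).to(x.dtype)

class ToyDecoderBlock(nn.Module):
    # Pre-norm block: norm -> attention (with KV cache) -> residual, norm -> gated MLP -> residual.
    def __init__(self, hidden=64, heads=4, ffn=128):
        super().__init__()
        self.heads, self.head_dim = heads, hidden // heads
        self.qkv = nn.Linear(hidden, 3 * hidden, bias=False)
        self.o_proj = nn.Linear(hidden, hidden, bias=False)
        self.gate_proj = nn.Linear(hidden, ffn, bias=False)
        self.up_proj = nn.Linear(hidden, ffn, bias=False)
        self.down_proj = nn.Linear(ffn, hidden, bias=False)
        self.input_norm = ToyRMSNorm(hidden)
        self.post_attn_norm = ToyRMSNorm(hidden)

    def forward(self, x, past_kv=None):
        b, t, _ = x.shape
        h = self.input_norm(x)
        q, k, v = self.qkv(h).chunk(3, dim=-1)
        # [b, t, hidden] -> [b, heads, t, head_dim]
        q, k, v = (y.view(b, t, self.heads, self.head_dim).transpose(1, 2) for y in (q, k, v))
        if past_kv is not None:
            # Token-by-token decoding: append the new key/value to the cached ones.
            k = torch.cat([past_kv[0], k], dim=2)
            v = torch.cat([past_kv[1], v], dim=2)
        attn = F.scaled_dot_product_attention(q, k, v, is_causal=past_kv is None)
        attn = attn.transpose(1, 2).reshape(b, t, -1)
        x = x + self.o_proj(attn)                # first residual connection
        h = self.post_attn_norm(x)
        mlp = self.down_proj(F.silu(self.gate_proj(h)) * self.up_proj(h))  # SwiGLU-style gating
        return x + mlp, (k, v)                   # second residual + updated cache

block = ToyDecoderBlock()
out, kv = block(torch.randn(1, 5, 64))           # prefill over a 5-token prompt
out, kv = block(torch.randn(1, 1, 64), kv)       # one decode step reusing the cache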
-""" PyTorch InternLM model.""" -import math -from typing import List, Optional, Tuple, Union -import threading, queue - -import torch -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.generation.streamers import BaseStreamer -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) -from .configuration_internlm import InternLMConfig - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "InternLMConfig" - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -class InternLMRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - InternLMRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - - # convert into half-precision if necessary - if self.weight.dtype in [torch.float16, torch.bfloat16]: - hidden_states = hidden_states.to(self.weight.dtype) - - return self.weight * hidden_states - - -class InternLMRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. 
- self.max_seq_len_cached = max_position_embeddings - t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. - if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False) - self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. - # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] - # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] - cos = torch.squeeze(cos) # [seq_len, dim] - sin = torch.squeeze(sin) # [seq_len, dim] - # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLMMLP(nn.Module): - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - ): - super().__init__() - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.act_fn = ACT2FN[hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -class InternLMAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: InternLMConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.max_position_embeddings = config.max_position_embeddings - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias) - self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias) - self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) - self.rotary_emb = InternLMRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: bool = False, - use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - ''' - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - if rotary_pos_emb is None: - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - else: - cos, sin = rotary_pos_emb - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - ''' - #--------------- - query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) - key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) - value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim) - - kv_seq_len = key_states.shape[1] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[1] - # rope - cos, sin = rotary_pos_emb - query_states = (query_states * cos) + (rotate_half(query_states) * sin) - key_states = (key_states * cos) + (rotate_half(key_states) * sin) - # kv cache - if past_key_value is not None: - past_key, past_value = past_key_value[0], past_key_value[1] - key_states = torch.cat((past_key, key_states), dim=1) - value_states = torch.cat((past_value, value_states), dim=1) - past_key_value = torch.stack((key_states, value_states)) - query_states = query_states.transpose(1, 2) - key_states = key_states.permute([0, 2, 3, 1]) - value_states = value_states.transpose(1, 2) - attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) - #--------------- - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if 
attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - # attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class InternLMDecoderLayer(nn.Module): - def __init__(self, config: InternLMConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = InternLMAttention(config=config) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - ) - self.input_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - rotary_pos_emb=rotary_pos_emb, - output_attentions=output_attentions, - use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -INTERNLM_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLMConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare InternLM Model outputting raw hidden-states without any specific head on top.", - INTERNLM_START_DOCSTRING, -) -class InternLMPreTrainedModel(PreTrainedModel): - config_class = InternLMConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["InternLMDecoderLayer"] - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, InternLMModel): - module.gradient_checkpointing = value - - -INTERNLM_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. 
See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare InternLM Model outputting raw hidden-states without any specific head on top.", - INTERNLM_START_DOCSTRING, -) -class InternLMModel(InternLMPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLMDecoderLayer`] - - Args: - config: InternLMConfig - """ - - _auto_class = "AutoModel" - - def __init__(self, config: InternLMConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([InternLMDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - seq_length_with_past = seq_length - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + 
past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - # embed positions - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class InternLMForCausalLM(InternLMPreTrainedModel): - _auto_class = "AutoModelForCausalLM" - - def __init__(self, config): - super().__init__(config) - self.model = InternLMModel(config) - - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - 
position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLMForCausalLM - - >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = 
position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) - return reordered_past - - def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []): - prompt = "" - for record in history: - prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}\n""" - if len(prompt) == 0: - prompt += "" - prompt += f"""<|User|>:{query}\n<|Bot|>:""" - return tokenizer([prompt], return_tensors="pt") - - @torch.no_grad() - def chat( - self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs, - ): - inputs = self.build_inputs(tokenizer, query, history) - inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} - outputs = self.generate( - **inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs, - ) - outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split("")[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat( - self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs, - ): - """ - Return a generator in format: (response, history) - Eg. 
- ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) - ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) - """ - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = "" - self.received_inputs = False - self.queue.put((self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError("ChatStreamer only supports batch size 1") - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - token = self.tokenizer.decode([value[-1]], skip_special_tokens=True) - if token.strip() != "": - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat( - tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs, - ) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is not None: - return - yield res - - return consumer() - - -@add_start_docstrings( - """ - The InternLM Model transformer with a sequence classification head on top (linear layer). - - [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - INTERNLM_START_DOCSTRING, -) -class InternLMForSequenceClassification(InternLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLMModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(INTERNLM_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not 
return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/transformers/llm/export/llm_models/phi-2/modeling_phi.py b/transformers/llm/export/llm_models/phi-2/modeling_phi.py deleted file mode 100644 index 30b7fc8fd..000000000 --- a/transformers/llm/export/llm_models/phi-2/modeling_phi.py +++ /dev/null @@ -1,989 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. -# -# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu. -# Licensed under the BSD 3-Clause License. - -from __future__ import annotations - -import math -from dataclasses import dataclass, field -from typing import Any, Dict, Optional, Tuple, Union - -import torch -import torch.nn as nn -from einops import rearrange, repeat -from transformers import PretrainedConfig, PreTrainedModel -from transformers.activations import ACT2FN -from transformers.modeling_outputs import CausalLMOutputWithPast - -from .configuration_phi import PhiConfig - -try: - from flash_attn.bert_padding import pad_input, unpad_input - from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding - from flash_attn.modules.mha import FlashCrossAttention, FlashSelfAttention - from flash_attn.ops.fused_dense import FusedDense -except: - pad_input, unpad_input = None, None - FlashRotaryEmbedding = None - FlashSelfAttention, FlashCrossAttention = None, None - FusedDense = None - - -@dataclass -class InferenceParams: - """Inference parameters passed to model to efficiently calculate - and store context during inference. - - Reference: - https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py. - - Args: - max_seqlen: Maximum sequence length. - max_batch_size: Maximum batch size. - seqlen_offset: Sequence length offset. - batch_size_offset: Batch size offset. - key_value_memory_dict: Key value memory dictionary. - lengths_per_sample: Lengths per sample. 
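# A minimal, runnable sketch (illustrative only, not part of the patch above) of how an
# InferenceParams-style container is typically advanced across decode steps: the prompt is
# processed once with seqlen_offset == 0, then seqlen_offset is bumped by the number of tokens
# already cached before each single-token step. `DemoParams` is a stand-in defined here purely
# for illustration; it only mirrors the fields documented above.
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class DemoParams:
    max_seqlen: int
    max_batch_size: int
    seqlen_offset: int = 0
    batch_size_offset: int = 0
    key_value_memory_dict: Dict[int, Any] = field(default_factory=dict)


params = DemoParams(max_seqlen=2048, max_batch_size=1)
prompt_len, new_tokens = 8, 4

# Prefill: the whole prompt is seen at offset 0.
assert params.seqlen_offset == 0
params.seqlen_offset += prompt_len

# Decode: one token per step; the offset tracks everything already in the KV cache.
for step in range(new_tokens):
    assert params.seqlen_offset == prompt_len + step
    params.seqlen_offset += 1

print(params.seqlen_offset)  # 12 == prompt_len + new_tokens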
- - """ - - max_seqlen: int = field(metadata={"help": "Maximum sequence length."}) - - max_batch_size: int = field(metadata={"help": "Maximum batch size."}) - - seqlen_offset: int = field(default=0, metadata={"help": "Sequence length offset."}) - - batch_size_offset: int = field(default=0, metadata={"help": "Batch size offset."}) - - key_value_memory_dict: Dict[str, Any] = field( - default_factory=dict, metadata={"help": "Key value memory dictionary."} - ) - - lengths_per_sample: torch.Tensor = field(default=None, metadata={"help": "Lengths per sample."}) - - -class Embedding(nn.Module): - """Token embedding with dropout.""" - - def __init__(self, config: PretrainedConfig) -> None: - super().__init__() - - self.wte = nn.Embedding(config.vocab_size, config.n_embd) - self.drop = nn.Dropout(config.embd_pdrop) - - def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor: - input_shape = input_ids.size() - input_ids = input_ids.view(-1, input_shape[-1]) - - hidden_states = self.wte(input_ids) - hidden_states = self.drop(hidden_states) - - return hidden_states - - -def _apply_rotary_emb( - x: torch.FloatTensor, - cos: torch.FloatTensor, - sin: torch.FloatTensor, -) -> torch.FloatTensor: - _, seqlen, _, _ = x.shape - _, rotary_dim = cos.shape - rotary_dim *= 2 - - x_rot = x[:, :, :, :rotary_dim] - x_pass = x[:, :, :, rotary_dim:] - - x1, x2 = x_rot.chunk(2, dim=-1) - c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d") - x1, x2, c, s = [t.to(dtype=torch.float32) for t in [x1, x2, c, s]] - - x_rot = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], axis=-1).to(x.dtype) - - return torch.cat([x_rot, x_pass], axis=-1) - - -def _apply_rotary_emb_kv( - kv: torch.FloatTensor, - cos: torch.FloatTensor, - sin: torch.FloatTensor, - cos_k: Optional[torch.FloatTensor] = None, - sin_k: Optional[torch.FloatTensor] = None, -) -> torch.FloatTensor: - _, seqlen, _, _, _ = kv.shape - _, rotary_dim = cos.shape - rotary_dim *= 2 - - k_rot = kv[:, :, 0, :, :rotary_dim] - k_pass = kv[:, :, 0, :, rotary_dim:] - - k1, k2 = k_rot.chunk(2, dim=-1) - c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d") - k1, k2, c, s = [t.to(dtype=torch.float32) for t in [k1, k2, c, s]] - - k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(kv.dtype) - - return torch.cat( - [ - torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2), - kv[:, :, 1:2, :, :], - ], - axis=2, - ) - - -def _apply_rotary_emb_qkv( - qkv: torch.FloatTensor, - cos: torch.FloatTensor, - sin: torch.FloatTensor, - cos_k: Optional[torch.FloatTensor] = None, - sin_k: Optional[torch.FloatTensor] = None, -) -> torch.FloatTensor: - _, seqlen, _, _, _ = qkv.shape - _, rotary_dim = cos.shape - rotary_dim *= 2 - - q_rot = qkv[:, :, 0, :, :rotary_dim] - q_pass = qkv[:, :, 0, :, rotary_dim:] - - k_rot = qkv[:, :, 1, :, :rotary_dim] - k_pass = qkv[:, :, 1, :, rotary_dim:] - - q1, q2 = q_rot.chunk(2, dim=-1) - k1, k2 = k_rot.chunk(2, dim=-1) - c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d") - q1, q2, k1, k2, c, s = [t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]] - - q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype) - k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype) - - return torch.cat( - [ - torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2), - torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2), - qkv[:, :, 2:3, :, :], - ], - axis=2, - ) - - -class RotaryEmbedding(nn.Module): - """Rotary 
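# A small self-contained sketch (assumed sizes, illustration only) of the RoPE tables used by
# the code above: inverse frequencies, an outer product with positions, then a pairwise rotation
# of the first `rotary_dim` channels in the same (x1*cos - x2*sin, x1*sin + x2*cos) convention
# as `_apply_rotary_emb`.
import torch

dim, rotary_dim, seqlen, base = 16, 8, 4, 10000.0

inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
t = torch.arange(seqlen, dtype=torch.float32)
freqs = torch.outer(t, inv_freq)          # (seqlen, rotary_dim // 2)
cos, sin = torch.cos(freqs), torch.sin(freqs)

x = torch.randn(1, seqlen, 2, dim)        # (batch, seqlen, heads, head_dim)
x_rot, x_pass = x[..., :rotary_dim], x[..., rotary_dim:]
x1, x2 = x_rot.chunk(2, dim=-1)
c, s = cos[:, None, :], sin[:, None, :]   # broadcast over the head axis
rotated = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], dim=-1)
out = torch.cat([rotated, x_pass], dim=-1)

# The rotation is orthogonal on each (x1, x2) pair, so the norm of the rotary slice is preserved.
assert torch.allclose(rotated.norm(dim=-1), x_rot.norm(dim=-1), atol=1e-5)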
positional embedding (RoPE). - - Reference: - RoFormer: Enhanced Transformer with Rotary Position Embedding. - https://arxiv.org/pdf/2104.09864.pdf. - - """ - - def __init__( - self, - dim: int, - base: int = 10000, - scale_base: Optional[float] = None, - pos_idx_in_fp32: bool = True, - max_position_embeddings: int = 2048, - device: Optional[str] = None, - **kwargs, - ) -> None: - super().__init__() - - if scale_base is not None: - raise NotImplementedError - - self.dim = dim - self.base = float(base) - self.scale_base = scale_base - self.pos_idx_in_fp32 = pos_idx_in_fp32 - self.max_position_embeddings = max_position_embeddings - self.device = device - - # Generate and save the inverse frequency buffer (non-trainable) - inv_freq = self._compute_inv_freq(device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Generate and save the scale buffer (non-trainable) - scale = ( - (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim) - if scale_base is not None - else None - ) - self.register_buffer("scale", scale, persistent=False) - - # Initialize cached attributes since ONNX can't rely on dynamic initialization - self._update_cos_sin_cache(max_position_embeddings, device=device, dtype=torch.float32) - - def _compute_inv_freq(self, device: Optional[str] = None) -> torch.FloatTensor: - return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)) - - def _update_cos_sin_cache( - self, - seqlen: int, - device: Optional[str] = None, - dtype: Optional[torch.dtype] = None, - ) -> None: - self._seq_len_cached = seqlen - - # fp32 is preferred since the output of `torch.arange` can be quite large - # and bf16 would lose a lot of precision - if self.pos_idx_in_fp32: - t = torch.arange(seqlen, device=device, dtype=torch.float32) - if self.inv_freq.dtype != torch.float32: - inv_freq = self._compute_inv_freq(device=device) - else: - inv_freq = self.inv_freq - else: - t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) - inv_freq = self.inv_freq - - # `torch.outer` is preferred since `torch.einsum` converts from fp32 to fp16 if used with AMP - freqs = torch.outer(t, inv_freq) - if self.scale is None: - self._cos_cached = torch.cos(freqs).to(dtype) - self._sin_cached = torch.sin(freqs).to(dtype) - else: - power = ( - torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2 - ) / self.scale_base - scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1") - - # Force the scale multiplication to happen in fp32 - self._cos_cached = (torch.cos(freqs) * scale).to(dtype) - self._sin_cached = (torch.sin(freqs) * scale).to(dtype) - self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype) - self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype) - - def forward( - self, - qkv: torch.Tensor, - kv: Optional[torch.Tensor] = None, - seqlen_offset: int = 0, - **kwargs, - ) -> Tuple[torch.Tensor, torch.Tensor]: - if ( - self._seq_len_cached < qkv.shape[1] + seqlen_offset - or self._cos_cached.device != qkv.device - or self._cos_cached.dtype != qkv.dtype - or (self.training and self._cos_cached.is_inference()) - ): - self._update_cos_sin_cache(qkv.shape[1] + seqlen_offset, device=qkv.device, dtype=qkv.dtype) - - if kv is None: - return _apply_rotary_emb_qkv( - qkv, - self._cos_cached[seqlen_offset:], - self._sin_cached[seqlen_offset:], - ) - else: - q = _apply_rotary_emb( - qkv, - self._cos_cached[seqlen_offset:], - self._sin_cached[seqlen_offset:], - ) - kv = 
_apply_rotary_emb_kv( - kv, - self._cos_cached[seqlen_offset:], - self._sin_cached[seqlen_offset:], - ) - - return q, kv - - -class MLP(nn.Module): - """Multi-Layer Perceptron. - - Reference: - Attention Is All You Need. - https://arxiv.org/pdf/1706.03762.pdf. - - """ - - def __init__( - self, - config: PretrainedConfig, - n_inner: Optional[int] = None, - act_fn: Optional[str] = None, - ) -> None: - super().__init__() - - act_fn = config.activation_function if act_fn is None else act_fn - - n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner - n_inner = n_inner if n_inner is not None else 4 * config.n_embd - - self.fc1 = nn.Linear(config.n_embd, n_inner) - self.fc2 = nn.Linear(n_inner, config.n_embd) - self.act = ACT2FN[act_fn] - - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.fc2(hidden_states) - - return hidden_states - - -class SelfAttention(nn.Module): - """Self-attention layer (compatible with PyTorch). - - Reference: - https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py. - - """ - - def __init__( - self, - causal: bool = True, - softmax_scale: Optional[float] = None, - attention_dropout: float = 0.0, - ) -> None: - super().__init__() - - self.causal = causal - self.softmax_scale = softmax_scale - self.drop = nn.Dropout(attention_dropout) - - @torch.autocast("cpu", enabled=False) - @torch.autocast("cuda", enabled=False) - def forward( - self, - qkv: torch.FloatTensor, - causal: bool = None, - key_padding_mask: Optional[torch.BoolTensor] = None, - **kwargs, - ) -> torch.FloatTensor: - batch_size, seqlen = qkv.shape[0], qkv.shape[1] - q, k, v = qkv.unbind(dim=2) - - q = q.to(torch.float32) - k = k.to(torch.float32) - - causal = self.causal if causal is None else causal - softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1]) - - # Autocast is manually disabled to avoid `torch.einsum` performing the operation - # using float16, which might lead to overflow - scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale) - - if key_padding_mask is not None: - padding_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device) - padding_mask.masked_fill_(key_padding_mask, 0.0) - - scores = scores + rearrange(padding_mask, "b s -> b 1 1 s") - - if causal: - causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) - scores = scores + causal_mask.to(dtype=scores.dtype) - - attention = torch.softmax(scores, dim=-1).to(v.dtype) - attention = self.drop(attention) - - output = torch.einsum("bhts,bshd->bthd", attention, v) - - return output - - -class CrossAttention(nn.Module): - """Cross-attention layer (compatible with PyTorch). - - Reference: - https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py. 
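# A compact restatement (illustrative sketch, arbitrary tensor sizes) of the causal attention
# math used by the attention modules above: scores from an einsum over (batch, time, heads, dim)
# tensors, an upper-triangular bias of -10000, softmax, then a second einsum with the values.
# It is checked against torch's scaled_dot_product_attention, which is assumed to be available
# (torch >= 2.0).
import math
import torch
import torch.nn.functional as F

b, s, h, d = 1, 5, 2, 8
q, k, v = (torch.randn(b, s, h, d) for _ in range(3))
scale = 1.0 / math.sqrt(d)

scores = torch.einsum("bthd,bshd->bhts", q, k * scale)
causal_bias = torch.triu(torch.full((s, s), -10000.0), diagonal=1)
attn = torch.softmax(scores + causal_bias, dim=-1)
out = torch.einsum("bhts,bshd->bthd", attn, v)

ref = F.scaled_dot_product_attention(
    q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=True
).transpose(1, 2)
assert torch.allclose(out, ref, atol=1e-5)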
- - """ - - def __init__( - self, - causal: bool = True, - softmax_scale: Optional[float] = None, - attention_dropout: float = 0.0, - ) -> None: - super().__init__() - - self.causal = causal - self.softmax_scale = softmax_scale - self.drop = nn.Dropout(attention_dropout) - - @torch.autocast("cpu", enabled=False) - @torch.autocast("cuda", enabled=False) - def forward( - self, - q: torch.FloatTensor, - kv: torch.FloatTensor, - causal: bool = None, - key_padding_mask: Optional[torch.BoolTensor] = None, - causal_mask: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.FloatTensor: - batch_size, seqlen_q = q.shape[0], q.shape[1] - seqlen_k = kv.shape[1] - - if kv.shape[3] != q.shape[2]: - kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3]) - k, v = kv.unbind(dim=2) - - q = q.to(torch.float32) - k = k.to(torch.float32) - - causal = self.causal if causal is None else causal - softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1]) - - # Autocast is manually disabled to avoid `torch.einsum` performing the operation - # using float16, which might lead to overflow - # scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale) - q = q.permute(0, 2, 1, 3) - k_ = (k * softmax_scale).permute(0, 2, 3, 1) - scores = torch.matmul(q, k_) - - if key_padding_mask is not None: - padding_mask = torch.full( - (batch_size, seqlen_k), - -10000.0, - dtype=scores.dtype, - device=scores.device, - ) - padding_mask.masked_fill_(key_padding_mask, 0.0) - - scores = scores + rearrange(padding_mask, "b s -> b 1 1 s") - - if causal_mask is not None: - scores = scores.masked_fill(causal_mask, -10000.0) - elif causal: - rows = rearrange(torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1") - cols = torch.arange(seqlen_k, device=k.device, dtype=torch.long) - causal_mask = cols > rows + seqlen_k - seqlen_q - - scores = scores.masked_fill(causal_mask, -10000.0) - - attention = torch.softmax(scores, dim=-1).to(v.dtype) - attention = self.drop(attention) - - # output = torch.einsum("bhts,bshd->bthd", attention, v) - v = v.permute(0, 2, 1, 3) - output = torch.matmul(attention, v).permute(0, 2, 1, 3) - return output - - -def _find_mha_dims( - config: PretrainedConfig, - n_head: Optional[int] = None, - n_head_kv: Optional[int] = None, - head_dim: Optional[int] = None, -) -> Tuple[int, int]: - if n_head is None and head_dim is None: - head_dim = config.n_embd // config.n_head - n_head = config.n_head - elif n_head is None or head_dim is None: - raise ValueError("`n_head` and `head_dim` must be both specified or `None`.") - - if n_head_kv is None: - n_head_kv = getattr(config, "n_head_kv", None) or n_head - - return n_head, n_head_kv, head_dim - - -def _update_kv_cache(kv: torch.FloatTensor, inference_params: InferenceParams, layer_idx: int) -> torch.FloatTensor: - num_heads, head_dim = kv.shape[-2:] - - if layer_idx not in inference_params.key_value_memory_dict: - inference_params.key_value_memory_dict[layer_idx] = torch.empty( - inference_params.max_batch_size, - inference_params.max_seqlen, - 2, - num_heads, - head_dim, - dtype=kv.dtype, - device=kv.device, - ) - - batch_start = inference_params.batch_size_offset - batch_end = batch_start + kv.shape[0] - - sequence_start = inference_params.seqlen_offset - sequence_end = sequence_start + kv.shape[1] - - # When the current sequence length is equal to or larger than the maximum sequence length, - # we need to concatenate the current `kv` with the cached `kv` to expand its length - if sequence_end >= 
inference_params.max_seqlen: - inference_params.key_value_memory_dict[layer_idx] = torch.concatenate((inference_params.key_value_memory_dict[layer_idx], kv), dim=1) - - inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, sequence_start:sequence_end, ...] = kv - kv = inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, :sequence_end, ...] - - return kv - - -class MHA(nn.Module): - """Multi-head attention layer.""" - - def __init__( - self, - config: PretrainedConfig, - dtype: Optional[torch.dtype] = None, - device: Optional[str] = None, - rotary_dim: Optional[int] = None, - rotary_base: float = 10000.0, - rotary_scale_base: Optional[float] = None, - n_head: Optional[int] = None, - n_head_kv: Optional[int] = None, - head_dim: Optional[int] = None, - bias: bool = True, - causal: bool = True, - softmax_scale: Optional[float] = None, - layer_idx: Optional[int] = None, - return_residual: bool = False, - checkpointing: bool = False, - ) -> None: - super().__init__() - - # Rotary embedding - self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0) - if self.rotary_dim > 0: - rotary_cls = FlashRotaryEmbedding if config.flash_rotary else RotaryEmbedding - if rotary_cls is None: - rotary_cls = RotaryEmbedding - - rotary_kwargs = {} - if rotary_cls is RotaryEmbedding: - rotary_kwargs["max_position_embeddings"] = config.n_positions - - self.rotary_emb = rotary_cls( - self.rotary_dim, - base=rotary_base, - scale_base=rotary_scale_base, - device=device, - **rotary_kwargs, - ) - - # MLP - self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims( - config, n_head=n_head, n_head_kv=n_head_kv, head_dim=head_dim - ) - op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv) - hidden_size = config.n_embd - - linear_cls = FusedDense if config.fused_dense else nn.Linear - if linear_cls is None: - linear_cls = nn.Linear - - self.Wqkv = linear_cls(hidden_size, op_size, bias=bias, device=device, dtype=dtype) - self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype) - - # Attention - attn_cls = FlashSelfAttention if config.flash_attn else SelfAttention - if attn_cls is None: - attn_cls = SelfAttention - - cross_attn_cls = FlashCrossAttention if config.flash_attn else CrossAttention - if cross_attn_cls is None: - cross_attn_cls = CrossAttention - - self.inner_attn = attn_cls( - causal=causal, - softmax_scale=softmax_scale, - attention_dropout=config.attn_pdrop, - ) - self.inner_cross_attn = cross_attn_cls( - causal=causal, - softmax_scale=softmax_scale, - attention_dropout=config.attn_pdrop, - ) - - self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention - self.layer_idx = layer_idx - self.return_residual = return_residual - self.checkpointing = checkpointing - - def _forward_self_attn( - self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor] - ) -> torch.FloatTensor: - qkv = self.Wqkv(x) - qkv = rearrange(qkv, "... (three h d) -> ... 
three h d", three=3, d=self.head_dim) - - if self.rotary_dim > 0: - qkv = self.rotary_emb(qkv) - - if self.flash_attn: - batch_size, seqlen = qkv.shape[0], qkv.shape[1] - - cu_seqlens, max_seqlen = None, None - if key_padding_mask is not None: - # If `key_padding_mask` is supplied, we need to unpad the input and retrieve - # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn` - qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask) - - if self.checkpointing: - attn_output = torch.utils.checkpoint.checkpoint( - self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen - ) - else: - attn_output = self.inner_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen).to(qkv.device) - - # If `key_padding_mask` is supplied, we need to pad the output back to the original shape - return pad_input(attn_output, indices, batch_size, seqlen) if key_padding_mask is not None else attn_output - - if self.checkpointing: - return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask) - - return self.inner_attn(qkv, key_padding_mask=key_padding_mask) - - def _forward_cross_attn( - self, - x: torch.FloatTensor, - past_key_values: Optional[Union[torch.Tensor, InferenceParams]], - key_padding_mask: Optional[torch.BoolTensor], - rotary_pos_emb: Optional[torch.Tensor] = None, - causal_mask: Optional[torch.Tensor] = None, - ) -> torch.FloatTensor: - batch_size = x.shape[0] - - qkv = self.Wqkv(x) - - q = qkv[..., : self.n_head * self.head_dim] - q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim) - - kv = qkv[..., self.n_head * self.head_dim :] - kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim) - - if rotary_pos_emb is None: - seqlen_offset = past_key_values.seqlen_offset if past_key_values is not None else 0 - causal = None if seqlen_offset == 0 else False - if self.rotary_dim > 0: - q, kv = self.rotary_emb(q, kv=kv, seqlen_offset=seqlen_offset) - else: - causal = False - cos_pos, sin_pos = rotary_pos_emb - q = _apply_rotary_emb(q, cos_pos, sin_pos) - kv = _apply_rotary_emb_kv(kv, cos_pos, sin_pos) - - if past_key_values is not None: - if type(past_key_values) is InferenceParams: - kv = _update_kv_cache(kv, past_key_values, self.layer_idx) - else: - # kv.shape is [1, past + 1, 2, 32, 80] - #print('kv.shape', kv.shape) - #print('past_key_values.shape', past_key_values.shape) - kv = torch.cat((past_key_values, kv), dim=1) - - if self.flash_attn: - batch_size, seqlen_q = q.shape[0], q.shape[1] - seqlen_k = kv.shape[1] - - cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = ( - None, - None, - None, - None, - ) - if key_padding_mask is not None: - kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask) - - if seqlen_q == 1: - key_padding_mask = torch.ones(batch_size, 1, device=q.device) - elif seqlen_q != seqlen_k: - key_padding_mask = key_padding_mask[:, -seqlen_q:] - - q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask) - - if self.checkpointing: - attn_output = torch.utils.checkpoint.checkpoint( - self.inner_cross_attn, - q, - kv, - causal=causal, - cu_seqlens=cu_seqlens_q, - max_seqlen=max_seqlen_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_k=max_seqlen_k, - ) - else: - attn_output = self.inner_cross_attn( - q, - kv, - causal=causal, - cu_seqlens=cu_seqlens_q, - max_seqlen=max_seqlen_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_k=max_seqlen_k, - ) - - return ( - pad_input(attn_output, indices_q, batch_size, max_seqlen_q) - if key_padding_mask is not None - else 
attn_output - ) - - if self.checkpointing: - return torch.utils.checkpoint.checkpoint( - self.inner_cross_attn, - q, - kv, - key_padding_mask=key_padding_mask, - causal=causal, - ) - output = self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal, causal_mask=causal_mask) - return output, kv - - def forward( - self, - x: torch.FloatTensor, - past_key_values: Optional[InferenceParams] = None, - attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - causal_mask: Optional[torch.Tensor] = None, - **kwargs, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: - if attention_mask is not None: - attention_mask = attention_mask.bool() - else: - attention_mask = None - - attention_mask = None - kv = None - # MHA - if self.n_head == self.n_head_kv: - if past_key_values is None and False: - # If `past_key_values` are not supplied, we run self-attention - attn_output = self._forward_self_attn(x, attention_mask) - else: - # If `past_key_values` are supplied, it means that we might have cached values and - # could take advantage of cross-attention - attn_output, kv = self._forward_cross_attn(x, past_key_values, attention_mask, rotary_pos_emb, causal_mask) - # MQA / GQA - else: - # Regardless of `past_key_values` being supplied or not, it always use cross-attention - # because `q` and `kv` lengths might be different - attn_output = self._forward_cross_attn(x, past_key_values, attention_mask) - - output = rearrange(attn_output, "... h d -> ... (h d)") - output = self.out_proj(output) - - # return output if not self.return_residual else (output, x) - return output, kv - - -class ParallelBlock(nn.Module): - """Parallel block. - - This block applies parallel mixer and MLP layers to the input (used in GPT-J and CodeGen). - - """ - - def __init__( - self, - config: PretrainedConfig, - block_idx: Optional[int] = None, - ) -> None: - super().__init__() - - self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) - self.resid_dropout = nn.Dropout(config.resid_pdrop) - self.block_idx = block_idx - - self.mixer = MHA(config, layer_idx=block_idx) - self.mlp = MLP(config) - - def forward( - self, - hidden_states: torch.FloatTensor, - past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None, - attention_mask: Optional[torch.BoolTensor] = None, - rotary_pos_emb: Optional[torch.Tensor] = None, - causal_mask: Optional[torch.Tensor] = None, - **kwargs, - ) -> torch.FloatTensor: - residual = hidden_states - hidden_states = self.ln(hidden_states) - - attn_outputs, kv = self.mixer( - hidden_states, - past_key_values=past_key_values, - attention_mask=attention_mask, - rotary_pos_emb=rotary_pos_emb, - causal_mask=causal_mask, - ) - - if isinstance(attn_outputs, tuple): - attn_outputs = attn_outputs[0] - - # attn_outputs = self.resid_dropout(attn_outputs) - # feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states)) - feed_forward_hidden_states = self.mlp(hidden_states) - hidden_states = attn_outputs + feed_forward_hidden_states + residual - return hidden_states, kv - - -class CausalLMHead(nn.Module): - """Causal Language Modeling head. - - Reference: - Improving Language Understanding by Generative Pre-Training. - https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf. 
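# A minimal illustration (placeholder module sizes, stand-in layers) of the parallel-block
# residual used by ParallelBlock above, GPT-J / CodeGen style: attention and the MLP both read
# the same pre-norm activations and their outputs are summed with the residual, instead of the
# usual sequential attn -> add -> norm -> mlp -> add arrangement.
import torch
import torch.nn as nn

n_embd = 16
ln = nn.LayerNorm(n_embd)
attn = nn.Linear(n_embd, n_embd)   # stand-in for the attention mixer
mlp = nn.Sequential(nn.Linear(n_embd, 4 * n_embd), nn.GELU(), nn.Linear(4 * n_embd, n_embd))

x = torch.randn(2, 3, n_embd)
normed = ln(x)
parallel_out = attn(normed) + mlp(normed) + x   # one shared LayerNorm, one residual add
print(parallel_out.shape)                       # torch.Size([2, 3, 16])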
- - """ - - def __init__(self, config: PretrainedConfig) -> None: - super().__init__() - - self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) - self.linear = nn.Linear(config.n_embd, config.vocab_size) - - def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: - hidden_states = self.ln(hidden_states) - logits = self.linear(hidden_states).to(torch.float32) - return logits - - -class CausalLMLoss(nn.Module): - """Causal Language Modeling loss. - - Reference: - Improving Language Understanding by Generative Pre-Training. - https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf. - - """ - - def __init__(self, shift_labels: bool = True) -> None: - super().__init__() - - self.shift_labels = shift_labels - self.loss_fct = nn.CrossEntropyLoss() - - def forward(self, logits: torch.FloatTensor, labels: torch.LongTensor) -> torch.FloatTensor: - if self.shift_labels: - logits = logits[..., :-1, :].contiguous() - labels = labels[..., 1:].contiguous() - - loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) - - return loss - - -class PhiPreTrainedModel(PreTrainedModel): - """Phi pre-trained model.""" - - config_class = PhiConfig - base_model_prefix = "transformer" - supports_gradient_checkpointing = False - _no_split_modules = ["ParallelBlock"] - - def __init__(self, *inputs, **kwargs) -> None: - super().__init__(*inputs, **kwargs) - - def _init_weights(self, module: nn.Module) -> None: - if isinstance(module, (nn.Linear,)): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - elif isinstance(module, nn.LayerNorm): - if module.bias is not None: - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None, - attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None, - **kwargs, - ) -> Dict[str, Any]: - if past_key_values is None or not (isinstance(past_key_values, InferenceParams)): - past_key_values = InferenceParams( - max_seqlen=self.config.n_positions, - max_batch_size=input_ids.shape[0], - seqlen_offset=0, - batch_size_offset=0, - key_value_memory_dict={}, - lengths_per_sample=None, - ) - else: - # Assume that `past_key_values` has cached all tokens up to the last token in `input_ids` - past_key_values.seqlen_offset = input_ids.shape[1] - 1 - input_ids = input_ids[:, -1].unsqueeze(-1) - - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "attention_mask": attention_mask, - } - - -class PhiModel(PhiPreTrainedModel): - """Phi model.""" - - _keys_to_ignore_on_load_missing = [""] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"] - - def __init__(self, config: PhiConfig) -> None: - super().__init__(config) - - self.embd = Embedding(config) - self.h = nn.ModuleList([ParallelBlock(config, block_idx=i) for i in range(config.n_layer)]) - self.gradient_checkpointing = False - self.post_init() - - def get_input_embeddings(self) -> nn.Embedding: - return self.embd.wte - - def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None: - self.embd.wte = new_embeddings - - def forward( - self, - input_ids: 
torch.LongTensor, - past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None, - attention_mask: Optional[torch.BoolTensor] = None, - ) -> torch.FloatTensor: - hidden_states = self.embd(input_ids) - - for layer in self.h: - hidden_states, _ = layer( - hidden_states, - past_key_values=past_key_values, - attention_mask=attention_mask, - ) - - return hidden_states - - -class PhiForCausalLM(PhiPreTrainedModel): - """Phi for Causal Language Modeling.""" - - _keys_to_ignore_on_load_missing = [""] - _keys_to_ignore_on_load_unexpected = [r"transformer\.h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"] - - def __init__(self, config: PhiConfig) -> None: - super().__init__(config) - - self.transformer = PhiModel(config) - self.lm_head = CausalLMHead(config) - self.loss = CausalLMLoss() - - self.post_init() - - def get_output_embeddings(self) -> nn.Linear: - return self.lm_head.linear - - def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: - self.lm_head.linear = new_embeddings - - def forward( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None, - attention_mask: Optional[torch.BoolTensor] = None, - labels: Optional[torch.LongTensor] = None, - **kwargs, - ) -> CausalLMOutputWithPast: - hidden_states = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask) - lm_logits = self.lm_head(hidden_states) - - loss = None - if labels is not None: - loss = self.loss(lm_logits, labels) - - return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values) diff --git a/transformers/llm/export/llmexport.py b/transformers/llm/export/llmexport.py new file mode 100644 index 000000000..17862a632 --- /dev/null +++ b/transformers/llm/export/llmexport.py @@ -0,0 +1,1705 @@ +import os +import sys +import math +import copy +import json +import time +import base64 +import logging +import warnings +import argparse +import functools +from typing import Optional, Tuple + +from yaspin import yaspin + +import onnx +import torch +import numpy as np +from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer + +RESET = "\033[0m" +GREEN = "\033[32;1m" +YELLOW = "\033[33;4m" +EXPORT_LOG = '.export.log' + +# ignore warnning info +warnings.filterwarnings("ignore") +logging.basicConfig(level=logging.ERROR) +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + +def spinner_run(text='Processing...'): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + with yaspin(text=text, color="cyan") as spinner: + start = time.time() + try: + result = func(*args, **kwargs) + except Exception as e: + spinner.fail("💥 Failed") + print(e) + exit(1) + end = time.time() + during = f'[{end-start:05.2f} s]'.replace('[0', '[ ') + padding = ' ' * (64 - len(spinner.text) - len(result)) + spinner.text = f'{spinner.text}{YELLOW}{result}{RESET}{padding}{GREEN}{during}{RESET}' + spinner.ok("✅ Done") + return result + return wrapper + return decorator + +class ModelMapper: + def __init__(self): + self.attrs = [] + self.mapper = dict() + self.regist_models() + + def get_map(self, config): + model_type = config.model_type + if model_type == 'chatglm': + if hasattr(config, 'vocab_size') and config.vocab_size == 130528: + model_type = 'chatglm' + else: + model_type = 'chatglm2' + if model_type in self.mapper: + return model_type, self.mapper[model_type] + return model_type, self.default_map + + def regist(self, model_type, model_map): + assert('config' in model_map and + 
'decoder' in model_map and + 'attention' in model_map) + self.mapper[model_type] = model_map + + def regist_models(self): + self.defualt_map() + # regist models + self.regist_llama() + self.regist_qwen() + self.regist_glm() + self.regist_glm2() + self.regist_phi() + self.regist_gemma2() + + def regist_llama(self): + llama_map = self.default_map + self.regist('llama', llama_map) + self.regist('qwen2', llama_map) + self.regist('internlm', llama_map) + baichuan_map = copy.deepcopy(self.default_map) + baichuan_map[self.attention_key] = { + 'qkv_proj': 'W_pack', + 'o_proj': 'o_proj' + } + self.regist('baichuan', baichuan_map) + + def regist_qwen(self): + qwen_map = { + 'config': { + 'hidden_size': 'hidden_size', + 'num_attention_heads': 'num_attention_heads', + 'num_hidden_layers': 'num_hidden_layers', + 'rope_theta': 'rotary_emb_base', + }, + 'model': { + 'lm_': 'lm_head', + 'embed_': 'transformer.wte', + 'blocks_': 'transformer.h', + 'final_layernorm_': 'transformer.ln_f', + 'visual': 'transformer.visual' + }, + 'decoder': { + 'self_attn': 'attn', + 'mlp': 'mlp', + 'input_layernorm': 'ln_1', + 'post_attention_layernorm': 'ln_2' + }, + 'attention': { + 'qkv_proj': 'c_attn', + 'o_proj': 'c_proj' + } + } + self.regist('qwen', qwen_map) + + def regist_glm(self): + glm_map = { + 'config': { + 'hidden_size': 'hidden_size', + 'num_attention_heads': 'num_attention_heads', + 'num_hidden_layers': 'num_layers' + }, + 'model': { + 'lm_': 'lm_head', + 'embed_': 'transformer.word_embeddings', + 'blocks_': 'transformer.layers', + 'final_layernorm_': 'transformer.final_layernorm', + }, + 'decoder': { + 'self_attn': 'attention', + 'mlp': 'mlp', + 'input_layernorm': 'input_layernorm', + 'post_attention_layernorm': 'post_attention_layernorm' + }, + 'attention': { + 'qkv_proj': 'query_key_value', + 'o_proj': 'dense' + } + } + self.regist('chatglm', glm_map) + + def regist_glm2(self): + glm2_map = { + 'config': { + 'hidden_size': 'hidden_size', + 'num_attention_heads': 'num_attention_heads', + 'num_key_value_heads': 'multi_query_group_num', + 'num_hidden_layers': 'num_layers', + }, + 'model': { + 'lm_': 'transformer.output_layer', + 'embed_': 'transformer.embedding.word_embeddings', + 'blocks_': 'transformer.encoder.layers', + 'final_layernorm_': 'transformer.encoder.final_layernorm', + }, + 'decoder': { + 'self_attn': 'self_attention', + 'mlp': 'mlp', + 'input_layernorm': 'input_layernorm', + 'post_attention_layernorm': 'post_attention_layernorm' + }, + 'attention': { + 'qkv_proj': 'query_key_value', + 'o_proj': 'dense' + } + } + self.regist('chatglm2', glm2_map) + + def regist_phi(self): + phi_map = { + 'config': { + 'hidden_size': 'n_embd', + 'num_attention_heads': 'n_head', + 'num_hidden_layers': 'n_layer', + 'rotary_dim': 'rotary_dim' + }, + 'model': { + 'lm_': 'lm_head.linear', + 'embed_': 'transformer.embd.wte', + 'blocks_': 'transformer.h', + 'final_layernorm_': 'lm_head.ln', + }, + 'decoder': { + 'self_attn': 'mixer', + 'mlp': 'mlp', + 'input_layernorm': 'ln', + }, + 'attention': { + 'qkv_proj': 'Wqkv', + 'o_proj': 'out_proj' + } + } + self.regist('phi-msft', phi_map) + + def regist_gemma2(self): + gemma2_config = copy.deepcopy(self.default_config) + gemma2_config['head_dim'] = 'head_dim' + gemma2_decoder = copy.deepcopy(self.default_decoder) + gemma2_decoder['pre_feedforward_layernorm'] = 'pre_feedforward_layernorm' + gemma2_decoder['post_feedforward_layernorm'] = 'post_feedforward_layernorm' + gemma2_map = { + 'config': gemma2_config, + 'model': self.defualt_model, + 'decoder': gemma2_decoder, + 
'attention': self.default_attention + } + self.regist('gemma2', gemma2_map) + + def defualt_map(self): + # default map is `LlamaForCausalLM` + self.config_key = 'config' + self.model_key = 'model' + self.decoder_key = 'decoder' + self.attention_key = 'attention' + self.default_config = { + 'hidden_size': 'hidden_size', + 'num_attention_heads': 'num_attention_heads', + 'num_hidden_layers': 'num_hidden_layers', + 'num_key_value_heads': 'num_key_value_heads', + 'rope_theta': 'rope_theta' + } + self.defualt_model = { + 'lm_': 'lm_head', + 'embed_': 'model.embed_tokens', + 'blocks_': 'model.layers', + 'final_layernorm_': 'model.norm', + } + self.default_decoder = { + 'self_attn': 'self_attn', + 'mlp': 'mlp', + 'input_layernorm': 'input_layernorm', + 'post_attention_layernorm': 'post_attention_layernorm' + } + self.default_attention = { + 'q_proj': 'q_proj', + 'k_proj': 'k_proj', + 'v_proj': 'v_proj', + 'o_proj': 'o_proj' + } + self.default_map = { + 'config': self.default_config, + 'model': self.defualt_model, + 'decoder': self.default_decoder, + 'attention': self.default_attention + } + + @staticmethod + def do_map(dst, src, map): + for dst_attr, src_attr in map.items(): + attributes = src_attr.split('.') + obj = src + for attr in attributes: + if hasattr(obj, attr): + obj = getattr(obj, attr) + else: + obj = None + break + setattr(dst, dst_attr, obj) + + +# Export class +class LlmExporterOp(torch.autograd.Function): + @staticmethod + def symbolic(g, input, in_features, out_features, has_bias, name): + args = [input] + # These become the operator attributes. + kwargs = { + "in_features_i": in_features, + "out_features_i": out_features, + "has_bias_i": has_bias, + "name_s": name + } + from torch.onnx.symbolic_helper import _get_tensor_sizes + out_sizes = _get_tensor_sizes(input)[:-1] + [out_features] + output_type = input.type().with_sizes(out_sizes) + return g.op("LlmExporter::FakeLinear", input, **kwargs).setType(output_type) + + @staticmethod + def forward(ctx, input, in_features, out_features, has_bias, name): + out_shape = list(input.shape)[:-1] + [out_features] + return input.new_zeros(out_shape) + +class FakeLinear(torch.nn.Module): + def __init__(self, in_features, out_features, has_bias, name): + super(FakeLinear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.has_bias = has_bias + self.name = name + + def forward(self, x): + return LlmExporterOp.apply(x, self.in_features, self.out_features, self.has_bias, self.name) + +class OnnxRebuilder: + def __init__(self, onnx_path, weight_ops): + self.weight_ops = weight_ops + self.onnx_model = onnx.load(onnx_path) + self.dst_path = onnx_path + self.onnx_weight_path = f'{onnx_path}.data' + self.onnx_weight_offset = 0 + + def make_external(self, name, data, shape): + # write to external weight + length = self.onnx_weight.write(data.tobytes()) + location = os.path.basename(self.onnx_weight_path) + offset = self.onnx_weight_offset + self.onnx_weight_offset += length + tensor = onnx.TensorProto() + tensor.name = name + tensor.data_type = onnx.TensorProto.FLOAT + tensor.dims.extend(shape) + # external info + tensor.data_location = onnx.TensorProto.EXTERNAL + for k, v in { "location": location, "offset": offset, "length": length }.items(): + entry = tensor.external_data.add() + entry.key = k + entry.value = str(v) + self.onnx_model.graph.initializer.append(tensor) + + def build_weight(self, name, has_bias, ic, oc): + assert(name in self.weight_ops) + linear = self.weight_ops[name] + 
assert(linear.in_features == ic and + linear.out_features == oc and + (linear.bias is not None) == has_bias) + weight_name, bias_name = f'{name}_weight', f'{name}_bias' + weight = linear.weight.data.transpose(1, 0).flatten().numpy() + self.make_external(weight_name, weight, [ic, oc]) + if has_bias: + bias = linear.bias.data.flatten().numpy() + self.make_external(bias_name, bias, [oc]) + return weight_name, bias_name + + def rebuild(self): + from onnx import helper + new_nodes = [] + self.onnx_weight = open(self.onnx_weight_path, 'wb') + for node in self.onnx_model.graph.node: + if node.op_type == 'FakeLinear': + attributes = {a.name: a for a in node.attribute} + name = attributes.get('name').s.decode('utf-8') + has_bias = attributes.get('has_bias').i + ic = attributes.get('in_features').i + oc = attributes.get('out_features').i + weight, bias = self.build_weight(name, has_bias, ic, oc) + if has_bias: + # fakelinear -> matmul + add + middle_tensor = f'{name}_matmul' + new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], [middle_tensor], name)) + new_nodes.append(helper.make_node('Add', [middle_tensor, bias], node.output, name)) + else: + # fakelinear -> matmul + new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], node.output, name)) + else: + new_nodes.append(node) + self.onnx_weight.close() + del self.onnx_model.graph.node[:] + self.onnx_model.graph.node.extend(new_nodes) + onnx.save(self.onnx_model, self.dst_path) + return self.onnx_weight_path + +class MNNConveter: + def __init__(self, onnx_path, weight_ops, config): + self.weight_ops = weight_ops + self.quant_block = config.quant_block + self.quant_bit = config.quant_bit + self.lm_quant_bit = config.lm_quant_bit + self.mnn_weight_offset = 0 + self.onnx_model_path = onnx_path + self.mnn_model_path = onnx_path.replace('.onnx', '.mnn') + self.mnn_weight_path = f'{self.mnn_model_path}.weight' + if os.path.exists(config.mnnconvert): + self.mnnconvert = config.mnnconvert + else: + self.mnnconvert = None + + def convert(self, convert_args): + sfd = os.dup(1) + log_fp = open(EXPORT_LOG, "a") + log_fd = log_fp.fileno() + # mnnconvert ... 
> .convert_mnn.log + os.dup2(log_fd, 1) + try: + sys.argv = convert_args + sys.argc = len(convert_args) + if self.mnnconvert is None: + from MNN.tools import mnnconvert + mnnconvert.main() + else: + convert_args[0] = self.mnnconvert + cmd = ' '.join(convert_args) + message = os.popen(cmd).read() + print(message) + sys.argv = [] + finally: + os.dup2(sfd, 1) + os.close(log_fd) + + @spinner_run(f'convert onnx model to ') + def onnx2mnn(self, onnx_path, mnn_path, args = []): + convert_args = [ + '', + '-f', + 'ONNX', + '--modelFile', + str(onnx_path), + '--MNNModel', + str(mnn_path), + '--transformerFuse', + '--allowCustomOp' + ] + convert_args += args + self.convert(convert_args) + return mnn_path + + def mnn2json(self, mnn_path, json_path): + convert_args = [ + '', + '-f', + 'MNN', + '--modelFile', + str(mnn_path), + '--JsonFile', + str(json_path) + ] + self.convert(convert_args) + return json_path + + def json2mnn(self, json_path, mnn_path): + convert_args = [ + '', + '-f', + 'JSON', + '--modelFile', + str(json_path), + '--MNNModel', + str(mnn_path) + ] + self.convert(convert_args) + return mnn_path + + def export(self): + if self.weight_ops is None: + quant_args = [ + '--weightQuantBits', + str(self.quant_bit), + '--weightQuantBlock', + str(self.quant_block) + ] + self.onnx2mnn(self.onnx_model_path, self.mnn_model_path, quant_args) + else: + mnn_json = f'{self.mnn_model_path}.json' + self.onnx2mnn(self.onnx_model_path, self.mnn_model_path) + self.mnn2json(self.mnn_model_path, mnn_json) + self.rebuild(mnn_json) + self.json2mnn(mnn_json, self.mnn_model_path) + + @spinner_run(f'quant model weight to ') + def rebuild(self, json_path): + self.mnn_weight = open(self.mnn_weight_path, 'wb') + mnn_graph = json.load(open(json_path, 'rt')) + new_ops = [] + for op in mnn_graph['oplists']: + if op['type'] == 'Extra': + new_ops += self.rebuild_op(op, mnn_graph) + else: + new_ops.append(op) + mnn_graph['oplists'] = new_ops + with open(json_path, 'w', encoding='utf-8') as file: + json.dump(mnn_graph, file, ensure_ascii=False, indent=4) + return self.mnn_weight_path + + def quant(self, weight, quant_bit, quant_block): + weight = weight.numpy() + oc, ic = weight.shape + if quant_block == 0: + block_size = ic + else: + block_size = quant_block + block_num = ic // block_size + weight = weight.reshape(oc, block_num, block_size) + max_val = np.max(weight, axis=-1, keepdims=True) + min_val = np.min(weight, axis=-1, keepdims=True) + offset = 1 << (quant_bit - 1) + clip_max = offset - 1 + clip_min = -offset + scale = (max_val - min_val) / (clip_max - clip_min) + q_weight = np.round((weight - min_val) / scale) + clip_min + q_weight = (np.clip(q_weight.flatten(), clip_min, clip_max) + offset).astype(np.uint8) + q_weight = q_weight.reshape(-1, 2) + if quant_bit == 4: + q_weight = q_weight[:, 0] * 16 + q_weight[:, 1] + alpha = np.stack([min_val.flatten(), scale.flatten()], axis=-1).flatten() + return q_weight, alpha, clip_min + + def write_npy(self, data): + return self.mnn_weight.write(data.tobytes()) + + def write_header(self, ic, oc, quant_bit): + dim_num = self.mnn_weight.write(b'\x02') + shape_dtype = np.int16 + if oc > 65535 or ic > 65535: + shape_dtype = np.int32 + dim_length = self.write_npy(np.array([oc, ic]).astype(shape_dtype)) + offset = 1 << (quant_bit - 1) + weight_map = [i for i in range(-offset, offset)] + if len(weight_map) == 256: + weight_map.insert(0, 0) + else: + weight_map.insert(0, len(weight_map)) + map_length = self.write_npy(np.array(weight_map, dtype=np.int8)) + header_length = dim_num + 
dim_length + map_length + return header_length, shape_dtype == np.int32 + + def build_weight(self, linear, quant_bit, quant_block): + ic, oc = linear.in_features, linear.out_features + q_weight, alpha, q_min = self.quant(linear.weight.data, quant_bit, quant_block) + header_len, shape_int32 = self.write_header(ic, oc, quant_bit) + weight_len = self.write_npy(q_weight) + header_len + alpha_len = self.write_npy(alpha) + if linear.bias is not None: + bias = linear.bias.data.flatten().numpy() + bias_length = self.write_npy(bias) + else: + bias_length = 0 + # bias = np.zeros([oc], dtype=np.float32) + # bias_length = self.write_npy(bias) + external = [self.mnn_weight_offset, weight_len, alpha_len, bias_length, 0] + self.mnn_weight_offset += (weight_len + alpha_len + bias_length) + return external, q_min, shape_int32 + + def build_tensor(self, graph, tensor_name): + tensor_idx = [len(graph['tensorName'])] + graph['tensorName'].append(tensor_name) + return tensor_idx + + def rebuild_op(self, op, graph): + attrs = op['main']['attr'] + for attr in attrs: + if attr['key'] == 'name': + name = attr['s'] + elif attr['key'] == "in_features": + ic = attr["i"] + elif attr['key'] == "out_features": + oc = attr["i"] + elif attr['key'] == "has_bias": + has_bias = attr["i"] + linear = self.weight_ops[name] + assert(linear.in_features == ic and + linear.out_features == oc and + (linear.bias is not None) == has_bias) + + + quant_bit = self.lm_quant_bit if 'lm_head' in name else self.quant_bit + external, q_min, shape_int32 = self.build_weight(linear, quant_bit, self.quant_block) + + origin_input = op['inputIndexes'] + origin_output = op['outputIndexes'] + # build new tensor + pre_reshape_name = f'{name}/pre_reshape' + pre_convert_name = f'{name}/pre_convert' + conv_name = name + post_convert_name = f'{name}/post_convert' + post_reshape_name = f'{name}/post_reshape' + pre_reshape_output = self.build_tensor(graph, pre_reshape_name) + pre_convert_output = self.build_tensor(graph, pre_convert_name) + conv_output = self.build_tensor(graph, conv_name) + post_convert_output = self.build_tensor(graph, post_convert_name) + # [batch, seq, hidden_size_i] -[Linear] -> [batch, seq, hidden_size_o] + # [1, seq, hidden_size_i] ->[Reshape]-> [seq, hidden_size_i, 1, 1] + # -[Convert]-[Convolution]-[Convert]-> [Reshape] -> [1, seq, hidden_size_o] + pre_reshape = { + "name": pre_reshape_name, + "type": "Reshape", + "inputIndexes": origin_input, + "outputIndexes": pre_reshape_output, + "main_type": "Reshape", + "main": { + "dims": [-1, ic, 1, 1], + "dimType": "NCHW" + }, + "defaultDimentionFormat": "NHWC" + } + pre_convert = { + "name": pre_convert_name, + "inputIndexes": pre_reshape_output, + "outputIndexes": pre_convert_output, + "type": "ConvertTensor", + "main_type": "TensorConvertInfo", + "main": { + "source": "NCHW", + "dest": "NC4HW4" + }, + "defaultDimentionFormat": "NHWC" + } + conv_op = { + "name": conv_name, + "inputIndexes": pre_convert_output, + "outputIndexes": conv_output, + "type": "Convolution", + "main_type": "Convolution2D", + "main": { + 'common': { + 'dilateX': 1, 'dilateY': 1, 'strideX': 1, 'strideY': 1, + 'kernelX': 1, 'kernelY': 1, 'padX': 0, 'padY': 0, 'group': 1, + 'outputCount': oc, 'relu': False, 'padMode': 'CAFFE', + 'relu6': False, 'inputCount': ic, 'hasOutputShape': False + }, + "quanParameter": { + "quantScale": 1.0, "scaleIn": 0.0, "scaleOut": 0.0, + "useInt32": False, "has_scaleInt": False, "shapeInt32": shape_int32, + "type": 1, "aMax": 0, "aMin": q_min, "readType": oc * (ic // self.quant_block), 
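# A self-contained numpy sketch of the asymmetric per-block weight quantization implemented in
# `quant()` above: each block of `quant_block` input channels gets its own (min, scale) pair,
# values are rounded into the signed range of `quant_bit` bits and shifted to unsigned storage.
# Sizes are arbitrary, and the packing of two 4-bit values per byte done later is omitted;
# the dequantization check is only illustrative.
import numpy as np

quant_bit, quant_block = 4, 8
oc, ic = 2, 16
rng = np.random.default_rng(0)
weight = rng.standard_normal((oc, ic)).astype(np.float32)

block_num = ic // quant_block
w = weight.reshape(oc, block_num, quant_block)
max_val = w.max(axis=-1, keepdims=True)
min_val = w.min(axis=-1, keepdims=True)
offset = 1 << (quant_bit - 1)
clip_max, clip_min = offset - 1, -offset
scale = (max_val - min_val) / (clip_max - clip_min)

q = np.round((w - min_val) / scale) + clip_min
q = np.clip(q, clip_min, clip_max) + offset      # unsigned storage in [0, 2**quant_bit - 1]

# Dequantize: stored value * per-block scale + per-block minimum.
w_hat = q * scale + min_val
max_err = np.abs(w_hat - w).max()
assert max_err <= scale.max() / 2 + 1e-6         # error bounded by half a quantization step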
"weightSize": 0 + }, + "external": external + }, + "defaultDimentionFormat": "NHWC" + } + post_convert = { + "name": post_convert_name, + "inputIndexes": conv_output, + "outputIndexes": post_convert_output, + "type": "ConvertTensor", + "main_type": "TensorConvertInfo", + "main": { + "source": "NC4HW4", + "dest": "NCHW" + }, + "defaultDimentionFormat": "NHWC" + } + post_reshape = { + "name": post_reshape_name, + "type": "Reshape", + "inputIndexes": post_convert_output, + "outputIndexes": origin_output, + "main_type": "Reshape", + "main": { + "dims": [1, -1, oc], + "dimType": "NCHW" + }, + "defaultDimentionFormat": "NHWC" + } + return [pre_reshape, pre_convert, conv_op, post_convert, post_reshape] + +# some wrapper class for export +class Embedding(torch.nn.Module): + def __init__(self, embed, config): + super().__init__() + self.hidden_size = config.hidden_size + self.embed = embed + if config.model_type == 'gemma2': + normalizer = torch.tensor(self.hidden_size**0.5) + self.embed.weight.data *= normalizer + + def forward(self, input_ids): + inputs_embeds = self.embed(input_ids).view(-1, 1, self.hidden_size) + return inputs_embeds + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + +class Attention(torch.nn.Module): + def __init__(self, attn, config): + super().__init__() + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.rotary = config.rotary + ModelMapper.do_map(self, attn, config.model_map['attention']) + if hasattr(self, 'qkv_proj') and self.qkv_proj is not None: + # split qkv linear to q, k, v + split_sizes = [self.hidden_size] * 3 + if self.qkv_proj.weight.shape[0] != self.hidden_size * 3: + # M/GQA + qkv_hidden_size = self.qkv_proj.weight.shape[0] + kv_hidden_size = (qkv_hidden_size - self.hidden_size) // 2 + split_sizes = [self.hidden_size, kv_hidden_size, kv_hidden_size] + self.q_proj = torch.nn.Linear(self.hidden_size, split_sizes[0]) + self.k_proj = torch.nn.Linear(self.hidden_size, split_sizes[1]) + self.v_proj = torch.nn.Linear(self.hidden_size, split_sizes[2]) + if config.model_type == 'chatglm': + # chatglm-6b + qkv_weight = self.qkv_proj.weight.data.view(self.num_heads, 3, self.head_dim, self.hidden_size) + self.q_proj.weight.data = qkv_weight[:, 0, :, :].reshape(self.hidden_size, self.hidden_size) + self.k_proj.weight.data = qkv_weight[:, 1, :, :].reshape(self.hidden_size, self.hidden_size) + self.v_proj.weight.data = qkv_weight[:, 2, :, :].reshape(self.hidden_size, self.hidden_size) + qkv_bias = self.qkv_proj.bias.data.view(self.num_heads, 3, self.head_dim) + self.q_proj.bias.data = qkv_bias[:, 0, :].reshape(self.hidden_size) + self.k_proj.bias.data = qkv_bias[:, 1, :].reshape(self.hidden_size) + self.v_proj.bias.data = qkv_bias[:, 2, :].reshape(self.hidden_size) + else: + # other + qw, kw, vw = torch.split(self.qkv_proj.weight, split_sizes) + self.q_proj.weight.data = qw + self.k_proj.weight.data = kw + self.v_proj.weight.data = vw + if self.qkv_proj.bias is not None: + qb, kb, vb = torch.split(self.qkv_proj.bias, split_sizes) + self.q_proj.bias.data = 
qb + self.k_proj.bias.data = kb + self.v_proj.bias.data = vb + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + kv_seq_len = key_states.shape[1] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[1] + + # rope + cos, sin = rotary_pos_emb[0], rotary_pos_emb[1] + query_states = self.rotary.apply_rotary_pos(query_states, cos, sin) + key_states = self.rotary.apply_rotary_pos(key_states, cos, sin) + # kv cache + if past_key_value is not None: + past_key, past_value = past_key_value[0], past_key_value[1] + key_states = torch.cat((past_key, key_states), dim=1) + value_states = torch.cat((past_value, value_states), dim=1) + + past_key_value = torch.stack((key_states, value_states)) + query_states = query_states.transpose(1, 2) + key_states = key_states.permute([0, 2, 3, 1]) + value_states = value_states.transpose(1, 2) + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + #------- attention ---------- + # query_states @ key_states + attn_weights = torch.matmul(query_states, key_states) / math.sqrt(self.head_dim) + # attention_mask + if attention_mask.dtype in (torch.bool, torch.int32): + # chatglm + attn_weights.masked_fill_(attention_mask, -10000.0) + else: + attn_weights = attn_weights + attention_mask + # upcast softmax to fp32 + attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + # attn_weights @ value_states + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.o_proj(attn_output) + return attn_output, past_key_value + +def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + +class Rotary(torch.nn.Module): + def __init__(self, config): + super().__init__() + self.rope_theta = config.rope_theta + self.rotary_dim = config.head_dim + self.model_type = config.model_type + if hasattr(config, 'rotary_dim'): + self.rotary_dim = config.rotary_dim + if self.model_type == 'chatglm': + self.rotary_dim = config.head_dim // 2 + + def forward(self, position_ids): + theta = 1.0 / (self.rope_theta ** (torch.arange(0, self.rotary_dim, 2, dtype=torch.float32) / self.rotary_dim)) + position_ids = position_ids.float().reshape(-1, 1) + idx_theta = position_ids * theta + rotary_pos_emb = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)]) + if self.model_type != 'chatglm2': + rotary_pos_emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + rotary_pos_emb = rotary_pos_emb.unsqueeze(2).unsqueeze(1) + return rotary_pos_emb + + def apply_rotary_pos(self, x, cos, sin): + if self.model_type == 'chatglm': + return self.chatglm_rotary_pos(x, cos, sin) + if self.model_type == 'chatglm2': + return 
self.chatglm2_rotary_pos(x, cos, sin) + if self.model_type == 'phi-msft': + return self.phi_rotary_pos(x, cos, sin) + return self.llama_rotary_pos(x, cos, sin) + + def llama_rotary_pos(self, x, cos, sin): + x = (x * cos) + (rotate_half(x) * sin) + return x + + def phi_rotary_pos(self, x, cos, sin): + x, x_pass = x[..., :self.rotary_dim], x[..., self.rotary_dim:] + x = (x * cos) + (rotate_half(x) * sin) + return torch.cat((x, x_pass), dim=-1) + + def chatglm2_rotary_pos(self, x, cos, sin): + x, x_pass = x[..., :self.rotary_dim], x[..., self.rotary_dim:] + b, s, n, h = x.shape + xshaped = x.view(b, s, n, h//2, 2) + x = torch.concat( + [ + xshaped[..., 0] * cos - xshaped[..., 1] * sin, + xshaped[..., 1] * cos + xshaped[..., 0] * sin, + ], + -1, + ) + return torch.cat((x, x_pass), dim=-1) + + def chatglm_rotary_pos(self, x, cos, sin): + seq = x.shape[1] + x1, x2 = x[..., :self.rotary_dim], x[..., self.rotary_dim:] + cos1, sin1 = cos[:, :seq, ...], sin[:, :seq, ...] + cos2, sin2 = cos[:, seq:, ...], sin[:, seq:, ...] + x1 = (x1 * cos1) + (rotate_half(x1) * sin1) + x2 = (x2 * cos2) + (rotate_half(x2) * sin2) + return torch.cat((x1, x2), dim=-1) + +class Decoder(torch.nn.Module): + def __init__(self, decoder, config): + super().__init__() + ModelMapper.do_map(self, decoder, config.model_map['decoder']) + self.hidden_size = config.hidden_size + self.self_attn = Attention(self.self_attn, config) + # chatglm + self.alpha = (2 * config.num_hidden_layers) ** 0.5 if config.model_type == 'chatglm' else 1.0 + + def forward( + self, + hidden_states: torch.Tensor, + rotary_pos_emb: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + hidden_states = hidden_states.view(1, -1, self.hidden_size) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + norm_hidden_states = hidden_states + # Self Attention + hidden_states, present_key_value = self.self_attn( + hidden_states=hidden_states, + rotary_pos_emb=rotary_pos_emb, + attention_mask=attention_mask, + past_key_value=past_key_value, + ) + # Fully Connected + if not hasattr(self, 'post_attention_layernorm'): + # phi + feed_forward_hidden_states = self.mlp(norm_hidden_states) + hidden_states = hidden_states + feed_forward_hidden_states + residual + elif self.alpha != 1.0: + # chatglm-6b + hidden_states = norm_hidden_states * self.alpha + hidden_states + mlp_input = self.post_attention_layernorm(hidden_states) + mlp_output = self.mlp(mlp_input) + hidden_states = mlp_input * self.alpha + mlp_output + elif hasattr(self, 'pre_feedforward_layernorm'): + # gemma2 + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + else: + # general + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states, present_key_value + +class Lm(torch.nn.Module): + def __init__(self, lm_, final_layernorm_, config): + super().__init__() + self.final_layernorm = final_layernorm_ + self.lm = lm_ + self.hidden_size = 
config.hidden_size + + def forward(self, hidden_states): + hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) + hidden_states = self.final_layernorm(hidden_states) + m_logits = self.lm(hidden_states) + return m_logits + +class LlmExporter(torch.nn.Module): + ''' + Base class for all llm model export. Inherits from [`torch.nn.Module`]. + ''' + + def __init__(self, args): + super().__init__() + self.init_from_args(args) + self.load_model(args.path) + + def init_from_args(self, args): + self.max_length = 1024 + self.stop_ids = [] + self.visual = None + self.dst_name = 'llm' + # load config from args + self.path = args.path + self.dst_path = args.dst_path + self.lora_path = args.lora_path + self.skip_slim = args.skip_slim + self.quant_bit = args.quant_bit + self.quant_block = args.quant_block + self.mnnconvert = args.mnnconvert + if args.lm_quant_bit is not None: + self.lm_quant_bit = args.lm_quant_bit + else: + self.lm_quant_bit = self.quant_bit + # init export dst dir + if not os.path.exists(self.dst_path): + os.makedirs(self.dst_path) + + def load_pretrained(self, model_path: str): + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + try: + self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval() + except: + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() + self.config = self.model.config + if self.lora_path is not None: + from peft import PeftModel + adapter = PeftModel.from_pretrained(self.model, model_id=self.lora_path) + self.model = adapter.merge_and_unload(progressbar=True) + + @spinner_run(f'load pretrained model ') + def load_model(self, model_path): + self.load_pretrained(model_path) + self.attention_mask_type = 'float' + # load tokenizer info + self.stop_ids.append(self.tokenizer.eos_token_id) + if hasattr(self.tokenizer, 'im_end_id'): + self.stop_ids.append(self.tokenizer.im_end_id) + eot_id = self.tokenizer.encode('<|eot_id|>') + if len(eot_id) == 1: + self.stop_ids.append(eot_id[0]) + if hasattr(self.model, 'generation_config'): + eos_token_id = self.model.generation_config.eos_token_id + from collections.abc import Iterable + if isinstance(eos_token_id, int): + self.stop_ids.append(eos_token_id) + elif isinstance(eos_token_id, Iterable): + for id in eos_token_id: + self.stop_ids.append(id) + self.stop_ids = [stop_id for stop_id in self.stop_ids if stop_id is not None] + self.stop_ids = list(set(self.stop_ids)) + model_mapper = ModelMapper() + + self.model_type, self.model_map = model_mapper.get_map(self.config) + # print(self.model) + # print(self.model_type, self.model_map) + # load config info + ModelMapper.do_map(self, self.config, self.model_map['config']) + if not hasattr(self, 'num_key_value_heads') or self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + if not hasattr(self, 'rope_theta') or self.rope_theta is None: + self.rope_theta = 10000.0 + if not hasattr(self, 'head_dim') or self.head_dim is None: + self.head_dim = self.hidden_size // self.num_attention_heads + # some export info + self.past_kv_shape = [self.num_hidden_layers, 2, 1, 0, self.num_key_value_heads, self.head_dim] + self.block_dynamic_axes = { + "inputs_embeds" : { 0: "seq_len" }, + "attention_mask" : { 2: "seq_len", 3: "seq_len" }, + "position_ids" : { 0: "seq_len" }, + "past_key_values" : { 1: "history_len" } + } + self.model_dynamic_axes = { + "input_ids" : { 0: "seq_len" }, + "attention_mask" : { 2: 
"seq_len", 3: "seq_len" }, + "position_ids" : { 0: "seq_len" }, + "past_key_values" : { 2: "history_len" } + } + self.llm_config = { + 'hidden_size' : self.hidden_size, + 'layer_nums' : self.num_hidden_layers, + 'attention_mask': self.attention_mask_type, + 'key_value_shape': self.past_kv_shape[1:], + "prompt_template": self.build_prompt('%s'), + 'is_visual': False + } + # load modules + ModelMapper.do_map(self, self.model, self.model_map['model']) + # rebuild modules + if self.embed_.weight is self.lm_.weight: + import copy + embed_copy = copy.deepcopy(self.embed_) + self.embed = Embedding(embed_copy, self) + else: + self.embed = Embedding(self.embed_, self) + # Rotary + self.rotary = Rotary(self) + self.blocks = [] + for block in self.blocks_.children(): + self.blocks.append(Decoder(block, self)) + self.lm = Lm(self.lm_, self.final_layernorm_, self) + # visual model + if self.visual is not None: + self.image_start_id = self.config.visual['image_start_id'] + self.image_size = self.config.visual['image_size'] + self.llm_config['is_visual'] = True + self.llm_config['img_size'] = self.image_size + self.llm_config['imgpad_len'] = 256 + self.llm_config['img_start'] = self.tokenizer.img_start_id + self.llm_config['img_end'] = self.tokenizer.img_end_id + self.llm_config['img_pad'] = self.tokenizer.img_pad_id + return model_path + + def get_attention_mask(self) -> torch.Tensor: + if self.model_type == 'chatglm': + return self.chatglm_attention_mask() + if self.token_len: + return torch.zeros([1, 1, 1, self.seq_len], dtype=torch.float32) + return (1 - torch.tril(torch.ones([1, 1, self.seq_len, self.seq_len]))) * torch.finfo(torch.float32).min + + def get_position_ids(self) -> torch.Tensor: + if self.model_type == 'chatglm': + return self.chatglm_position_ids() + if self.token_len: + return torch.tensor([[self.seq_len - 1]], dtype=torch.long) + return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) + + def chatglm_attention_mask(self): + if self.token_len: + return torch.zeros([1]).bool().reshape([1, 1, 1, 1]) + attention_mask = torch.zeros([self.seq_len, self.seq_len], dtype=torch.bool) + for i in range(self.seq_len - 1): + attention_mask[i][-1] = True + attention_mask = attention_mask.reshape([1, 1, self.seq_len, self.seq_len]) + return attention_mask + + def chatglm_position_ids(self): + if self.token_len: + return torch.tensor([self.context_len, self.token_len + 1]).reshape([1, 2, 1]) + position_ids_0 = torch.arange(self.seq_len, dtype=torch.long) + position_ids_1 = torch.zeros(self.seq_len, dtype=torch.long) + position_ids_0[-1] = position_ids_0[-2] + position_ids_1[-1] = 1 + position_ids = torch.stack([position_ids_0, position_ids_1]).view(1, 2, -1) + return position_ids + + def visual_embed(self, input_ids): + if not torch.any(input_ids == self.image_start_id): + return self.embed(input_ids) + bos_pos = torch.where(input_ids == self.image_start_id) + eos_pos = torch.where(input_ids == self.image_start_id + 1) + img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1) + images = [] + for i, a, b in img_pos: + image = input_ids[i][a + 1 : b - 1].tolist() + image = image[ : image.index(self.image_start_id + 2)] + images.append(bytes(image).decode('utf-8')) + images = self.visual.encode(images) + hidden_states = self.embed(input_ids).view(1, -1, self.hidden_size) + for idx, (i, a, b) in enumerate(img_pos): + hidden_states[i][a + 1 : b] = images[idx] + return hidden_states.view(-1, 1, self.hidden_size) + + def embedding(self, input_ids): + if self.visual is not None and 
self.token_len == 0: + input_embeds = self.visual_embed(input_ids) + else: + input_embeds = self.embed(input_ids) + return input_embeds + + def forward(self, input_ids, attention_mask, position_ids, past_key_values): + hidden_states = input_ids # llm forward without embedding + presents = [] + rotary_pos_emb = self.rotary(position_ids) + for i in range(self.num_hidden_layers): + hidden_states, kv = self.blocks[i](hidden_states, rotary_pos_emb, attention_mask, past_key_values[i]) + presents.append(kv) + logits = self.lm(hidden_states).reshape(-1) + presents = torch.stack(presents) + self.seq_len += 1 + self.token_len += 1 + return logits, presents + + # some test functions + def build_prompt(self, query): + # just for test + if 'Qwen2' in self.path: + return f'<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n' + if 'Qwen' in self.path: + return f'\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n' + if 'Baichuan2' in self.path: + return f'{query}' + if 'internlm' in self.path: + return f'<|User|>:{query}\n<|Bot|>:' + if 'TinyLlama' in self.path: + return f'<|system|>\nYou are a friendly chatbot who always responds in the style of a pirate\n<|user|>\n{query}\n<|assistant|>\n' + if 'Yi' in self.path: + return f'<|im_start|> user\n{query}<|im_end|>\n<|im_start|> assistant\n' + if 'deepseek' in self.path: + return f'<|begin_of_sentence|>User: {query}\n\nAssistant:' + if 'Llama-3.1' in self.path: + return f'<|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' + if 'Llama-3' in self.path: + return f'<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' + if 'Llama-2' in self.path: + return f'[INST]{query}[/INST]' + if 'chatglm2' in self.path: + return f'[Round 1]\n\n问:{query}\n\n答:' + if 'chatglm3' in self.path or 'glm-4' in self.path: + return f'<|user|>\n{query}\n<|assistant|>\n' + if 'chatglm' in self.path: + return f'{query}[gMASK]' + if 'phi-2' in self.path: + return f'Instruct: {query}\nOutput:' + if 'gemma-2' in self.path: + return f'user\n{query}\nmodel\n' + return query + + def str_to_ids(self, prompt): + input_ids = self.tokenizer(prompt, return_tensors="pt")['input_ids'] + return input_ids + + def id_to_str(self, token_id): + word = self.tokenizer._convert_id_to_token(int(token_id)) + word = self.tokenizer.convert_tokens_to_string([word]) + return word + + def response(self, query): + self.imitate_quant() + prompt = self.build_prompt(query) + input_ids = self.str_to_ids(prompt) + # print(f'prompt = {prompt}, ids = {input_ids}') + self.seq_len = input_ids.numel() + self.context_len = self.seq_len - 2 + self.token_len = 0 + past_key_values = [None for i in range(self.num_hidden_layers)] + token_id = input_ids + while self.token_len < self.max_length: + attention_mask = self.get_attention_mask() + position_ids = self.get_position_ids() + input_ids = self.embed(token_id) + logits, past_key_values = self.forward(input_ids, attention_mask, position_ids, past_key_values) + token_id = torch.argmax(logits) + if token_id in self.stop_ids: + print("", end='\n') + break + word = self.id_to_str(token_id) + print(word, end="", flush=True) + + def export_visual(self): + if self.visual is None: + return + input_images = torch.randn((1, 3, self.image_size, self.image_size)) + model = self.visual + onnx_model = f'{self.dst_path}/visual.onnx' + torch.onnx.export(model, (input_images), + onnx_model, + input_names=['input_images'], + 
output_names=['image_embeds'], + dynamic_axes={"input_images": { + 0: "size" + }}, + do_constant_folding=True, + opset_version=15) + return onnx_model + if not self.skip_slim: + slim(onnx_model, output_model=onnx_model) + + @spinner_run(f'export embedding to ') + def export_embed(self): + import ctypes + if hasattr(self, 'word_embeddings'): + # embedding model's embed + tensor_data = self.word_embeddings.weight.data.bfloat16() + else: + tensor_data = self.embed.embed.weight.data.bfloat16() + data_ptr = tensor_data.untyped_storage().data_ptr() + buffer = (ctypes.c_byte * (tensor_data.numel() * 2)).from_address(data_ptr) + embedding_file = f'{self.dst_path}/embeddings_bf16.bin' + with open(embedding_file, 'wb') as f: + f.write(buffer) + return embedding_file + + @spinner_run(f'export config to ') + def export_config(self, mnn_config = False): + config_json = f'{self.dst_path}/llm_config.json' + with open(config_json, 'w', encoding='utf-8') as f: + json.dump(self.llm_config, f, ensure_ascii=False, indent=4) + if not mnn_config: + return config_json + with open(f'{self.dst_path}/config.json', 'w', encoding='utf-8') as f: + config = { + "llm_model": f"{self.dst_name}.mnn", + "llm_weight": f"{self.dst_name}.mnn.weight", + "backend_type": "cpu", + "thread_num": 4, + "precision": "low", + "memory": "low" + } + json.dump(config, f, ensure_ascii=False, indent=4) + return config_json + + def quant(self, weight, quant_bit, quant_block): + weight = weight.numpy() + oc, ic = weight.shape + if quant_block == 0: + block_size = ic + else: + block_size = quant_block + block_num = ic // block_size + weight = weight.reshape(oc, block_num, block_size) + max_val = np.max(weight, axis=-1, keepdims=True) + min_val = np.min(weight, axis=-1, keepdims=True) + offset = 1 << (quant_bit - 1) + clip_max = offset - 1 + clip_min = -offset + scale = (max_val - min_val) / (clip_max - clip_min) + q_weight = np.round((weight - min_val) / scale) + clip_min + q_weight = (np.clip(q_weight.flatten(), clip_min, clip_max) + offset).astype(np.uint8) + q_weight = q_weight.reshape(-1, 2) + if quant_bit == 4: + q_weight = q_weight[:, 0] * 16 + q_weight[:, 1] + alpha = np.stack([min_val.flatten(), scale.flatten()], axis=-1).flatten() + return q_weight, alpha, clip_min + + def imitate_quant(self): + def quant_dequant(linear, quant_bit = self.quant_bit, quant_block = self.quant_block): + weight = linear.weight.data + oc, ic = weight.shape + if quant_block == 0: + block_size = ic + else: + block_size = quant_block + block_num = ic // block_size + weight = weight.reshape(oc, block_num, block_size) + max_val = torch.max(weight, axis=-1, keepdims=True).values + min_val = torch.min(weight, axis=-1, keepdims=True).values + offset = 1 << (quant_bit - 1) + clip_max = offset - 1 + clip_min = -offset + scale = (max_val - min_val) / (clip_max - clip_min) + q_weight = torch.round((weight - min_val) / scale) + clip_min + q_weight = torch.clip(q_weight, clip_min, clip_max) + dq_weight = (q_weight - clip_min) * scale + min_val + dq_weight = dq_weight.reshape(oc, ic).float() + linear.weight.data = dq_weight + return linear + with torch.no_grad(): + for i in range(self.num_hidden_layers): + for name, child in self.blocks[i].self_attn.named_children(): + if isinstance(child, torch.nn.Linear): + setattr(self.blocks[i].self_attn, name, quant_dequant(child)) + for name, child in self.blocks[i].mlp.named_children(): + if isinstance(child, torch.nn.Linear): + setattr(self.blocks[i].mlp, name, quant_dequant(child)) + self.lm.lm = quant_dequant(self.lm.lm) + + 
def unload_param(self): + self.unloaded_ops = {} + def build_faker(real, name): + faker = FakeLinear(real.in_features, real.out_features, real.bias is not None, name) + self.unloaded_ops[name] = real + return faker + # replace linear with fakelinear to save export memory and time + with torch.no_grad(): + for i in range(self.num_hidden_layers): + for name, child in self.blocks[i].self_attn.named_children(): + if isinstance(child, torch.nn.Linear): + setattr(self.blocks[i].self_attn, name, build_faker(child, f'/layers.{i}/self_attn/{name}/Linear')) + for name, child in self.blocks[i].mlp.named_children(): + if isinstance(child, torch.nn.Linear): + setattr(self.blocks[i].mlp, name, build_faker(child, f'/layers.{i}/mlp/{name}/Linear')) + self.lm.lm = build_faker(self.lm.lm, f'/lm/lm_head/Linear') + + @spinner_run(f'export model weight to ') + def onnx_load_param(self, onnx_path): + return OnnxRebuilder(onnx_path, self.unloaded_ops).rebuild() + + @spinner_run(f'slim the graph of ') + def onnx_slim(self, onnx_model): + import onnxslim + model = onnxslim.slim(onnx_model) + onnx.save(model, onnx_model) + return onnx_model + + @spinner_run(f'export onnx model to ') + def export_onnx(self): + # unload linear weight to save export memory + self.unload_param() + model = self + self.seq_len = 3 + self.token_len = 0 + input_ids = torch.arange(3, dtype=torch.long) + attention_mask = self.get_attention_mask() + position_ids = self.get_position_ids() + past_key_values = torch.zeros(self.past_kv_shape) + onnx_model = f'{self.dst_path}/{self.dst_name}.onnx' + input_ids = self.embedding(input_ids) + # export to onnx + torch.onnx.export( + model, (input_ids, attention_mask, position_ids, past_key_values), + onnx_model, + input_names=[ + 'input_ids', 'attention_mask', 'position_ids', 'past_key_values' + ], + output_names=['logits', 'presents'], + dynamic_axes=self.model_dynamic_axes, + do_constant_folding=True, + opset_version=15) + return onnx_model + + def export(self, export_type): + export_mnn = export_type == 'mnn' + # export tokenizer + self.export_tokenizer() + self.export_config(export_mnn) + self.export_embed() + if self.visual: + self.export_visual() + # export graph to llm.onnx + onnx_model = self.export_onnx() + if not self.skip_slim: + self.onnx_slim(onnx_model) + if export_mnn: + # convert onnx to mnn and quant weight + MNNConveter(onnx_model, self.unloaded_ops, self).export() + else: + # export weight to llm.onnx.data + self.onnx_load_param(onnx_model) + + @spinner_run(f'export tokenizer to ') + def export_tokenizer(self): + # load tokenizer file + tokenizer_model = os.path.join(self.path, 'tokenizer.model') + ice_text_model = os.path.join(self.path, 'ice_text.model') + try: + import sentencepiece as spm + if os.path.exists(tokenizer_model): + self.sp_model = spm.SentencePieceProcessor(tokenizer_model) + elif os.path.exists(ice_text_model): + self.sp_model = spm.SentencePieceProcessor(ice_text_model) + else: + self.sp_model = None + except: + self.sp_model = None + merge_file = os.path.join(self.path, 'merges.txt') + if os.path.exists(merge_file): + self.merge_txt = merge_file + else: + self.merge_txt = None + # TOKENIZER MAGIC NUMBER + MAGIC_NUMBER = 430 + # TOKENIZER TYPE + SENTENCEPIECE = 0; TIKTOIKEN = 1; BERT = 2; HUGGINGFACE = 3 + def write_line(fp, *args): + for arg in args: + for token in arg: + fp.write(str(token) + ' ') + fp.write('\n') + def write_header(fp, type, speicals, prefix = []): + fp.write(f'{MAGIC_NUMBER} {type}\n') + fp.write(f'{len(speicals)} {len(self.stop_ids)} 
{len(prefix)}\n') + write_line(fp, speicals, self.stop_ids, prefix) + + file_path = os.path.join(self.dst_path, "tokenizer.txt") + special_list = list(self.tokenizer.added_tokens_decoder.keys()) + if hasattr(self.tokenizer, 'special_tokens'): + for k, v in self.tokenizer.special_tokens.items(): + special_list.append(v) + if hasattr(self.tokenizer, 'gmask_token_id'): + special_list.append(self.tokenizer.gmask_token_id) + vocab_list = [] + prefix_list = [] + if hasattr(self.tokenizer, 'get_prefix_tokens'): + prefix_list = self.tokenizer.get_prefix_tokens() + if self.sp_model is not None: + # senetencepiece + NORMAL = 1; UNKNOWN = 2; CONTROL = 3 + USER_DEFINED = 4; UNUSED = 5; BYTE = 6 + for i in range(self.sp_model.GetPieceSize()): + token = self.sp_model.IdToPiece(i) + score = self.sp_model.GetScore(i) + token_type = NORMAL + if self.sp_model.IsUnknown(i): + token_type = UNKNOWN + elif self.sp_model.IsControl(i): + token_type = CONTROL + elif self.sp_model.IsUnused(i): + token_type = UNUSED + elif self.sp_model.IsByte(i): + token_type = BYTE + if self.path == 'Chatglm_6b': + if '' in token: token = '\n' + if '<|tab|>' in token: token = '\t' + if '<|blank_' in token: token = ' ' * int(token[8:token.find('|>')]) + if '▁' in token: token = token.replace('▁', ' ') + token_encode = base64.b64encode(token.encode("utf-8")).decode("utf8") + vocab_list.append(f'{token_encode} {score} {token_type}\n') + with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, SENTENCEPIECE, special_list, prefix_list) + fp.write(f'{len(vocab_list)}\n') + for vocab in vocab_list: + fp.write(vocab) + elif hasattr(self.tokenizer, 'mergeable_ranks'): + # tikton + vocab_list = [] + for k, v in self.tokenizer.mergeable_ranks.items(): + line = base64.b64encode(k).decode("utf8") + "\n" + vocab_list.append(line) + if hasattr(self.tokenizer, 'special_tokens'): + for k, v in self.tokenizer.special_tokens.items(): + line = base64.b64encode(k.encode("utf-8")).decode("utf8") + "\n" + vocab_list.append(line) + if hasattr(self.tokenizer, 'added_tokens_decoder'): + for k, v in self.tokenizer.added_tokens_decoder.items(): + line = base64.b64encode(v.__str__().encode("utf-8")).decode("utf8") + "\n" + vocab_list.append(line) + with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, TIKTOIKEN, special_list, prefix_list) + fp.write(f'{len(vocab_list)}\n') + for vocab in vocab_list: + fp.write(vocab) + elif self.merge_txt is not None: + # huggingface tokenizer + merge_list = [] + vocab = self.tokenizer.get_vocab() + special_list = list(self.tokenizer.added_tokens_decoder.keys()) + vocab_list = ['' for i in range(len(vocab))] + # load vocab + for k, v in vocab.items(): + vocab_list[int(v)] = k + # load merge + with open(self.merge_txt, 'rt') as merge: + for line in merge.readlines(): + merge_list.append(line) + # write to tokenizer.txt + with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, HUGGINGFACE, special_list) + fp.write(f'{len(vocab_list)} {len(merge_list)}\n') + for v in vocab_list: + fp.write(v + '\n') + for m in merge_list: + fp.write(m) + else: + # tiktoken or bert + if 'bert' in type(self.tokenizer).__name__.lower(): + tokenizer_type = BERT + else: + tokenizer_type = TIKTOIKEN + # bert tokenizer + def unicode_to_byte(u: int): + if u >= 256 and u <= 288: + return u - 256 + if u >= 289 and u <= 322: + return u - 162 + if u == 323: + return 173 + if u == 65372: # | + return 124 + if u == 9601: # _ + return 95 + return u + vocab = self.tokenizer.get_vocab() + vocab_list = ['' for i in 
range(len(vocab))] + for k, v in vocab.items(): + try: + vocab_list[int(v)] = bytes([unicode_to_byte(ord(c)) for c in k]).decode('utf-8', errors='ignore') + except: + vocab_list[int(v)] = k + special_list = list(self.tokenizer.added_tokens_decoder.keys()) + with open(file_path, "w", encoding="utf8") as fp: + write_header(fp, tokenizer_type, special_list) + fp.write(f'{len(vocab_list)}\n') + for v in vocab_list: + line = base64.b64encode(v.encode('utf-8')).decode("utf8") + "\n" + fp.write(line) + return file_path + + +class EmbeddingExporter(LlmExporter): + def __init__(self, args): + super().__init__(args) + self.dst_name = 'embedding' + + def word_embed(self, input_ids): + return self.word_embeddings(input_ids.view(1, -1)) + + def bge_forward(self, inputs_embeds, position_ids, attention_mask): + # bert absolute position + inputs_embeds = inputs_embeds.reshape(1, -1, self.hidden_size) + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + position_embeddings + self.token_type_embeddings + hidden_states = self.embedding_layernorm(embeddings) + for i in range(self.num_hidden_layers): + hidden_states = self.blocks[i](hidden_states, attention_mask)[0] + sentence_embeddings = hidden_states[:, 0] + sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) + return sentence_embeddings + + def gte_forward(self, inputs_embeds, position_ids, attention_mask): + # rope position + inputs_embeds = inputs_embeds.reshape(1, -1, self.hidden_size) + freqs = position_ids.float().reshape(-1, 1) * self.inv_freq + emb = torch.cat((freqs, freqs), dim=-1) + rope_embeds = torch.stack([emb.cos(), emb.sin()]).unsqueeze(-2).unsqueeze(1) + attention_bias = 1 - attention_mask.float() + hidden_states = self.embedding_layernorm(inputs_embeds + self.token_type_embeddings) + for i in range(self.num_hidden_layers): + hidden_states = self.blocks[i](hidden_states, attention_bias, rope_embeds)[0] + sentence_embeddings = hidden_states[:, 0] + sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1) + return sentence_embeddings + + def forward(self, inputs_embeds, position_ids, attention_mask): + if self.model_type == 'bert': + return self.bge_forward(inputs_embeds, position_ids, attention_mask) + if self.model_type == 'new': + return self.gte_forward(inputs_embeds, position_ids, attention_mask) + raise RuntimeError(f'Not support embedding model: {self.model_type}!') + + def response(self, query): + self.eval() + input_ids = self.tokenizer(query)['input_ids'] + self.seq_len = len(input_ids) + input_ids = torch.tensor(input_ids) + position_ids = self.get_position_ids() + attention_mask = self.get_attention_mask() + inputs_embeds = self.word_embed(input_ids) + res = self.forward(inputs_embeds, position_ids, attention_mask) + print(res) + return res + + @spinner_run(f'load pretrained model ') + def load_model(self, model_path): + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval() + self.config = self.model.config + transformer = self.model.encoder + self.model_type = self.config.model_type + self.lm_ = self.model.pooler + self.embed_ = self.model.embeddings + self.word_embeddings = self.embed_.word_embeddings + self.token_type_embeddings = self.embed_.token_type_embeddings.weight.data[0] + self.embedding_layernorm = self.embed_.LayerNorm + if hasattr(self.embed_, 'position_embeddings'): + self.position_embeddings = 
self.embed_.position_embeddings + self.hidden_size = self.word_embeddings.weight.shape[-1] + self.blocks = transformer.layer + if self.model_type == 'new': + self.inv_freq = self.embed_.rotary_emb.inv_freq + # some wrapper + self.stop_ids = [] + self.num_hidden_layers = len(self.blocks) + self.embed = self.embed_ + self.lm = self.lm_ + # some config for export + self.model_dynamic_axes = { + "input_ids" : { 1: "seq_len" }, + "position_ids" : { 1: "seq_len" }, + "attention_mask" : { 3: "seq_len" } + } + self.attention_mask_type = 'int' + self.llm_config = { + 'hidden_size' : self.hidden_size, + 'layer_nums' : self.num_hidden_layers, + 'attention_mask': self.attention_mask_type, + 'key_value_shape': [], + "prompt_template": self.build_prompt('%s'), + 'is_visual': False + } + return model_path + + @spinner_run(f'export onnx model to ') + def export_onnx(self): + model = self.eval() + self.seq_len = 3 + input_ids = torch.arange(3, dtype=torch.long) + position_ids = self.get_position_ids() + attention_mask = self.get_attention_mask() + inputs_embeds = self.word_embed(input_ids) + onnx_model = f'{self.dst_path}/{self.dst_name}.onnx' + torch.onnx.export( + model, (inputs_embeds, position_ids, attention_mask), + onnx_model, + input_names=[ + 'input_ids', + 'position_ids', + 'attention_mask' + ], + output_names=['sentence_embeddings'], + dynamic_axes=self.model_dynamic_axes, + do_constant_folding=True, + opset_version=15) + return onnx_model + + def export(self, export_type): + export_mnn = 'mnn' in export_type + self.export_tokenizer() + self.export_config(export_mnn) + self.export_embed() + onnx_model = self.export_onnx() + if not self.skip_slim: + self.onnx_slim(onnx_model) + if export_mnn: + MNNConveter(onnx_model, None, self).export() + + def build_prompt(self, query): + if self.model_type == 'bert': + return f'[CLS]{query}[SEP]' + if self.model_type == 'new': + return f' {query}' + + def get_position_ids(self) -> torch.Tensor: + return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) + + def get_attention_mask(self) -> torch.Tensor: + return torch.ones([1, 1, 1, self.seq_len], dtype=torch.long) + +def export(path, + type = None, + lora_path = None, + dst_path = './model', + export = 'onnx', + skip_slim = False, + quant_bit = 4, + quant_block = 128, + lm_quant_bit = None): + args = argparse.Namespace() + for k, v in { + 'path': path, + 'type': type, + 'lora_path': lora_path, + 'dst_path': dst_path, + 'export': export, + 'skip_slim': skip_slim, + 'quant_bit': quant_bit, + 'quant_block': quant_block, + 'lm_quant_bit': lm_quant_bit + }.items(): + setattr(args, k, v) + if 'bge' in path: + llm_exporter = EmbeddingExporter(args) + else: + llm_exporter = LlmExporter(args) + # export + llm_exporter.export(export) + +def main(): + parser = argparse.ArgumentParser(description='llm_exporter', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--path', type=str, required=True, + help='path(`str` or `os.PathLike`):\nCan be either:' + '\n\t- A string, the *model id* of a pretrained model like `THUDM/chatglm-6b`. [TODO]' + '\n\t- A path to a *directory* clone from repo like `../chatglm-6b`.') + parser.add_argument('--type', type=str, default=None, + help='type(`str`, *optional*):' + '\n\tThe pretrain llm model type.' 
+                        )
+    parser.add_argument('--lora_path', type=str, default=None, help='lora path, default is `None`, meaning LoRA is not applied.')
+    parser.add_argument('--dst_path', type=str, default='./model', help='export onnx/mnn model to path, default is `./model`.')
+    parser.add_argument('--test', type=str, help='test model inference with query `TEST`.')
+    parser.add_argument('--export', type=str, default=None, help='export model to an onnx/mnn model.')
+    parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.')
+    parser.add_argument('--quant_bit', type=int, default=4, help='mnn quant bit, 4 or 8, default is 4.')
+    parser.add_argument('--quant_block', type=int, default=128, help='mnn quant block, default is 128, 0 means channel-wise.')
+    parser.add_argument('--lm_quant_bit', type=int, default=None, help='mnn lm_head quant bit, 4 or 8, default is `quant_bit`.')
+    parser.add_argument('--mnnconvert', type=str, default='../../../build/MNNConvert', help='local MNNConvert path; if invalid, pymnn is used.')
+
+    args = parser.parse_args()
+
+    model_path = args.path
+    model_type = args.type
+
+    if 'gte' in model_path or 'bge' in model_path:
+        llm_exporter = EmbeddingExporter(args)
+    else:
+        llm_exporter = LlmExporter(args)
+
+    # some actions
+    if args.test is not None:
+        llm_exporter.response(args.test)
+
+    if args.export is not None:
+        llm_exporter.export(args.export)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file

literal 0
HcmV?d00001

diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.pbxproj b/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.pbxproj
new file mode 100644
index 000000000..7672178ca
--- /dev/null
+++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.pbxproj
@@ -0,0 +1,453 @@
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 56; + objects = { + +/* Begin PBXBuildFile section */ + 4D5B978C2B2B21D3003AF2F1 /* mnn_llmApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4D5B978B2B2B21D3003AF2F1 /* mnn_llmApp.swift */; }; + 4D5B978E2B2B21D3003AF2F1 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4D5B978D2B2B21D3003AF2F1 /* ContentView.swift */; }; + 4D5B97902B2B21D5003AF2F1 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 4D5B978F2B2B21D5003AF2F1 /* Assets.xcassets */; }; + 4D5B97932B2B21D5003AF2F1 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 4D5B97922B2B21D5003AF2F1 /* Preview Assets.xcassets */; }; + 4D5B97C42B2B29CF003AF2F1 /* LLMInferenceEngineWrapper.mm in Sources */ = {isa = PBXBuildFile; fileRef = 4D5B97C32B2B29CF003AF2F1 /* LLMInferenceEngineWrapper.mm */; }; + CE1A4A5D2C8596D900A62A4F /* MNN.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = CE1A4A5C2C8596D900A62A4F /* MNN.framework */; }; + CE1A4A7C2C85B69800A62A4F /* config.json in Resources */ = {isa = PBXBuildFile; fileRef = CE1A4A722C85B69800A62A4F /* config.json */; }; + CE1A4A7D2C85B69800A62A4F /* embeddings_bf16.bin in Resources */ = {isa = PBXBuildFile; fileRef = CE1A4A732C85B69800A62A4F /* embeddings_bf16.bin */; }; + CE1A4A7E2C85B69800A62A4F /* llm_config.json in Resources */ = {isa = PBXBuildFile; fileRef = CE1A4A742C85B69800A62A4F /* llm_config.json */; }; + CE1A4A7F2C85B69800A62A4F /* llm.mnn in Resources */ = {isa = PBXBuildFile; fileRef = CE1A4A752C85B69800A62A4F /* llm.mnn */; }; + CE1A4A802C85B69800A62A4F /* llm.mnn.weight in Resources */ = {isa = PBXBuildFile; fileRef = CE1A4A762C85B69800A62A4F /* llm.mnn.weight */; }; + CE1A4A842C85B69800A62A4F /* tokenizer.txt in Resources */ = {isa = PBXBuildFile; fileRef = CE1A4A7A2C85B69800A62A4F /* tokenizer.txt */; }; + CE1A4A862C85D43E00A62A4F /* bench.txt in Resources */ = {isa = PBXBuildFile; fileRef = CE1A4A852C85D43E00A62A4F /* bench.txt */; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 4D7E1C0A2C40C6530004DA17 /* Embed Watch Content */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = "$(CONTENTS_FOLDER_PATH)/Watch"; + dstSubfolderSpec = 16; + files = ( + ); + name = "Embed Watch Content"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ + 4D5B97882B2B21D3003AF2F1 /* mnn-llm.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "mnn-llm.app"; sourceTree = BUILT_PRODUCTS_DIR; }; + 4D5B978B2B2B21D3003AF2F1 /* mnn_llmApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = mnn_llmApp.swift; sourceTree = ""; }; + 4D5B978D2B2B21D3003AF2F1 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + 4D5B978F2B2B21D5003AF2F1 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + 4D5B97922B2B21D5003AF2F1 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; + 4D5B97992B2B263D003AF2F1 /* LLMInferenceEngineWrapper.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LLMInferenceEngineWrapper.h; sourceTree = ""; }; + 4D5B979A2B2B2677003AF2F1 /* mnn-llm-Bridging-Header.h 
*/ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "mnn-llm-Bridging-Header.h"; sourceTree = ""; }; + 4D5B97C32B2B29CF003AF2F1 /* LLMInferenceEngineWrapper.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = LLMInferenceEngineWrapper.mm; sourceTree = ""; }; + CE1A4A5C2C8596D900A62A4F /* MNN.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MNN.framework; path = ../MNN.framework; sourceTree = ""; }; + CE1A4A722C85B69800A62A4F /* config.json */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.json; path = config.json; sourceTree = ""; }; + CE1A4A732C85B69800A62A4F /* embeddings_bf16.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; path = embeddings_bf16.bin; sourceTree = ""; }; + CE1A4A742C85B69800A62A4F /* llm_config.json */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.json; path = llm_config.json; sourceTree = ""; }; + CE1A4A752C85B69800A62A4F /* llm.mnn */ = {isa = PBXFileReference; lastKnownFileType = file; path = llm.mnn; sourceTree = ""; }; + CE1A4A762C85B69800A62A4F /* llm.mnn.weight */ = {isa = PBXFileReference; lastKnownFileType = file; path = llm.mnn.weight; sourceTree = ""; }; + CE1A4A7A2C85B69800A62A4F /* tokenizer.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = tokenizer.txt; sourceTree = ""; }; + CE1A4A852C85D43E00A62A4F /* bench.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = bench.txt; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 4D5B97852B2B21D3003AF2F1 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + CE1A4A5D2C8596D900A62A4F /* MNN.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 4D5B977F2B2B21D3003AF2F1 = { + isa = PBXGroup; + children = ( + CE1A4A7B2C85B69800A62A4F /* model */, + 4D5B978A2B2B21D3003AF2F1 /* mnn-llm */, + 4D5B97892B2B21D3003AF2F1 /* Products */, + 4D5B97C52B2B2C26003AF2F1 /* Frameworks */, + ); + sourceTree = ""; + }; + 4D5B97892B2B21D3003AF2F1 /* Products */ = { + isa = PBXGroup; + children = ( + 4D5B97882B2B21D3003AF2F1 /* mnn-llm.app */, + ); + name = Products; + sourceTree = ""; + }; + 4D5B978A2B2B21D3003AF2F1 /* mnn-llm */ = { + isa = PBXGroup; + children = ( + 4D5B978B2B2B21D3003AF2F1 /* mnn_llmApp.swift */, + 4D5B978D2B2B21D3003AF2F1 /* ContentView.swift */, + 4D5B978F2B2B21D5003AF2F1 /* Assets.xcassets */, + 4D5B97912B2B21D5003AF2F1 /* Preview Content */, + 4D5B97992B2B263D003AF2F1 /* LLMInferenceEngineWrapper.h */, + 4D5B97C32B2B29CF003AF2F1 /* LLMInferenceEngineWrapper.mm */, + 4D5B979A2B2B2677003AF2F1 /* mnn-llm-Bridging-Header.h */, + ); + path = "mnn-llm"; + sourceTree = ""; + }; + 4D5B97912B2B21D5003AF2F1 /* Preview Content */ = { + isa = PBXGroup; + children = ( + 4D5B97922B2B21D5003AF2F1 /* Preview Assets.xcassets */, + ); + path = "Preview Content"; + sourceTree = ""; + }; + 4D5B97C52B2B2C26003AF2F1 /* Frameworks */ = { + isa = PBXGroup; + children = ( + CE1A4A5C2C8596D900A62A4F /* MNN.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; + CE1A4A7B2C85B69800A62A4F /* model */ = { + isa = PBXGroup; + children = ( + CE1A4A852C85D43E00A62A4F /* bench.txt */, + CE1A4A722C85B69800A62A4F /* config.json */, + CE1A4A732C85B69800A62A4F /* embeddings_bf16.bin */, + 
CE1A4A742C85B69800A62A4F /* llm_config.json */, + CE1A4A752C85B69800A62A4F /* llm.mnn */, + CE1A4A762C85B69800A62A4F /* llm.mnn.weight */, + CE1A4A7A2C85B69800A62A4F /* tokenizer.txt */, + ); + name = model; + path = ../../model; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + 4D5B97872B2B21D3003AF2F1 /* mnn-llm */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4D5B97962B2B21D5003AF2F1 /* Build configuration list for PBXNativeTarget "mnn-llm" */; + buildPhases = ( + 4D5B97842B2B21D3003AF2F1 /* Sources */, + 4D5B97852B2B21D3003AF2F1 /* Frameworks */, + 4D5B97862B2B21D3003AF2F1 /* Resources */, + 4D7E1C0A2C40C6530004DA17 /* Embed Watch Content */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "mnn-llm"; + productName = "mnn-llm"; + productReference = 4D5B97882B2B21D3003AF2F1 /* mnn-llm.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 4D5B97802B2B21D3003AF2F1 /* Project object */ = { + isa = PBXProject; + attributes = { + BuildIndependentTargetsInParallel = 1; + LastSwiftUpdateCheck = 1540; + LastUpgradeCheck = 1410; + TargetAttributes = { + 4D5B97872B2B21D3003AF2F1 = { + CreatedOnToolsVersion = 14.1; + LastSwiftMigration = 1410; + }; + }; + }; + buildConfigurationList = 4D5B97832B2B21D3003AF2F1 /* Build configuration list for PBXProject "mnn-llm" */; + compatibilityVersion = "Xcode 14.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = 4D5B977F2B2B21D3003AF2F1; + productRefGroup = 4D5B97892B2B21D3003AF2F1 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 4D5B97872B2B21D3003AF2F1 /* mnn-llm */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + 4D5B97862B2B21D3003AF2F1 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + CE1A4A862C85D43E00A62A4F /* bench.txt in Resources */, + CE1A4A842C85B69800A62A4F /* tokenizer.txt in Resources */, + 4D5B97932B2B21D5003AF2F1 /* Preview Assets.xcassets in Resources */, + 4D5B97902B2B21D5003AF2F1 /* Assets.xcassets in Resources */, + CE1A4A7E2C85B69800A62A4F /* llm_config.json in Resources */, + CE1A4A802C85B69800A62A4F /* llm.mnn.weight in Resources */, + CE1A4A7F2C85B69800A62A4F /* llm.mnn in Resources */, + CE1A4A7D2C85B69800A62A4F /* embeddings_bf16.bin in Resources */, + CE1A4A7C2C85B69800A62A4F /* config.json in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 4D5B97842B2B21D3003AF2F1 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 4D5B97C42B2B29CF003AF2F1 /* LLMInferenceEngineWrapper.mm in Sources */, + 4D5B978E2B2B21D3003AF2F1 /* ContentView.swift in Sources */, + 4D5B978C2B2B21D3003AF2F1 /* mnn_llmApp.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 4D5B97942B2B21D5003AF2F1 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = 
YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + "USING_DISK_EMBED=1", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.1; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + 4D5B97952B2B21D5003AF2F1 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_PREPROCESSOR_DEFINITIONS = " USING_DISK_EMBED=1"; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.1; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT 
= iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + 4D5B97972B2B21D5003AF2F1 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + ASSETCATALOG_COMPILER_INCLUDE_ALL_APPICON_ASSETS = NO; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"mnn-llm/Preview Content\""; + DEVELOPMENT_TEAM = 6G7464HHUS; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/mnn-llm", + "$(PROJECT_DIR)/../", + "$(PROJECT_DIR)/../../", + ); + GCC_PREPROCESSOR_DEFINITIONS = ( + "MNN_ARM82=1", + "MNN_SUPPORT_TRANSFORMER_FUSE=1", + "MNN_LOW_MEMORY=1", + ); + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = "com.zhaode.mnn-llm1"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "mnn-llm/mnn-llm-Bridging-Header.h"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + 4D5B97982B2B21D5003AF2F1 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_EMBED_SWIFT_STANDARD_LIBRARIES = YES; + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + ASSETCATALOG_COMPILER_INCLUDE_ALL_APPICON_ASSETS = NO; + CLANG_CXX_LANGUAGE_STANDARD = "c++17"; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_STYLE = Automatic; + CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_ASSET_PATHS = "\"mnn-llm/Preview Content\""; + DEVELOPMENT_TEAM = 6G7464HHUS; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/mnn-llm", + "$(PROJECT_DIR)/../", + "$(PROJECT_DIR)/../../", + ); + GCC_PREPROCESSOR_DEFINITIONS = ( + "MNN_ARM82=1", + "MNN_SUPPORT_TRANSFORMER_FUSE=1", + "MNN_LOW_MEMORY=1", + ); + GENERATE_INFOPLIST_FILE = YES; + INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; + INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; + INFOPLIST_KEY_UILaunchScreen_Generation = YES; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; + IPHONEOS_DEPLOYMENT_TARGET = 16.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + MARKETING_VERSION = 1.0; + PRODUCT_BUNDLE_IDENTIFIER = 
"com.zhaode.mnn-llm1"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_EMIT_LOC_STRINGS = YES; + SWIFT_OBJC_BRIDGING_HEADER = "mnn-llm/mnn-llm-Bridging-Header.h"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + 4D5B97832B2B21D3003AF2F1 /* Build configuration list for PBXProject "mnn-llm" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4D5B97942B2B21D5003AF2F1 /* Debug */, + 4D5B97952B2B21D5003AF2F1 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4D5B97962B2B21D5003AF2F1 /* Build configuration list for PBXNativeTarget "mnn-llm" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 4D5B97972B2B21D5003AF2F1 /* Debug */, + 4D5B97982B2B21D5003AF2F1 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 4D5B97802B2B21D3003AF2F1 /* Project object */; +} diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 000000000..919434a62 --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist new file mode 100644 index 000000000..18d981003 --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist @@ -0,0 +1,8 @@ + + + + + IDEDidComputeMac32BitWarning + + + diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AccentColor.colorset/Contents.json b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AccentColor.colorset/Contents.json new file mode 100644 index 000000000..eb8789700 --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AccentColor.colorset/Contents.json @@ -0,0 +1,11 @@ +{ + "colors" : [ + { + "idiom" : "universal" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AppIcon.appiconset/Contents.json b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 000000000..a657e3367 --- /dev/null +++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,14 @@ +{ + "images" : [ + { + "filename" : "icon.png", + "idiom" : "universal", + "platform" : "ios", + "size" : "1024x1024" + } + ], + "info" : { + "author" : "xcode", + "version" : 1 + } +} diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AppIcon.appiconset/icon.png b/transformers/llm/engine/ios/mnn-llm/mnn-llm/Assets.xcassets/AppIcon.appiconset/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..824ebb7abe51767656c59a74589c2c8dc7799f6f GIT binary patch literal 370381 zcmeFYdpOkl{x{4tY6h#uIG2?%1~U?p9hJqH9L6cfz0v`Zid`#-6(UB495a|g2qE?B zAgW!6S&14`VOhJP1Isy)V@_e_et$-LKl}Mzzx#Tg`@XOHxt>4n>)Jad&3wP_&+Gj< zeP<=P?6&;wgnVM=b`g0lg 
zSgj=bz{B~yj6nm7Pdg?)>d?khG#`b=r@IUb{im%hC~YrUZVpNC3wIjJpIGhPhlg$M z{r#A;$N<=G^{v7T%jNbzPVrRl8_85g!W+XtdW+V!_r}CiyoEOmV2@D(q1Sh@P zC(GmJ&{*IEH+5SPj<;E_{wMJL)PdSOK z&p9%0d&cy-xV$&ga5m-j6%2M0UdQCBRV5D=mIDK(K@sPG(5)0*f*#48l;Maz56O~X|tgI|>4BTLbwJ>$3g zA7hHK?k;pDm~P0n7VPOgr>SwbL%WmJYBF#j&_(;&BgW3)2vKo)^nh>@cdb!W)UN7n zWeCL}^hzP0AYT8zWg7=pEE?Dj`k7NZfGPBm&U4Fly2793X*XQq=L#JYZ&*Vw4h)q% zjlYR13VoCqk1VksOnovV;k~3(-=_XdwtVixR!%`S!uL1`!tCtsh10NO6;L%yXz|6D zP%8mP2S2&4eRhnz&sCR00H5{m@2M=UsHL|zNNKf5_4 zE=regK1?c7)~b_;5wXLMZq$xNF5P)y!d?GX*QGk1U2Z@e#rP_8N8%UFhvCqk+{`WD z+w6wWAxl9DQfF{m(%$9+2@?yD;H9(%-Zci-0ul0q$)^N4UEK@4HB)q2a9b!Vq}ap+ znywmak=5vklw5=+5m<{ki`zM+nUNebE5$_R7iq&=3V1u0NnN_AxqypWT*mB_eIXoT z@ZTNc3CqnK?%_~QQvsg3ETNd5*4^} z5vW)7s*?B{dFMqw@hd5EDHO`gPw~c~8@2q4z(NesXtvaxMp`iBD6u>U8Q@^!qMpGQ z@CCxMtxlb6+~NAH&kxr}3J-t^ge=QVeJNy^2(Usdf;)>hXNh4AS}0U61J4QhE&(!m23`_={=?gRA3Uurv*SMp5*61 zWVa+s)Y8)?3r4Q$XS-+alLR2SQRa1BFTnF5ev)$62A-9tCZ1_)**-EX)uA&hHs5wq zC{+OTv0UQ(@|hqo^b?iMFQ&ekyP>jtase13OQ6sERWiGF;`2&AFs1laUKhZr{AjW4 zd)+RGj1(LN?w_ZYyARXblV_n!FNL)3Tj@U}JRhlG)zUt3V3%@%CpE31gGHwj?(!sO z2;nF^2`M*sP~A3&JBfKYazn^)!f~d8z)F%7BY!E`3QXK#jHTAgi~2Cv*=t<*6%8O8 z>G@-FJx4GBp^XM&gm5Sii=7$ja|8~HK#%@j1!HYr$zEc3AA9+B!~x+6}{^09=H7=y9!IbRa9Iq{nB}lK=M;byiB)z?>8e=4{&loT zWH$DL1Zrf>6p)^j>lBT`u_rchkv^7B%3pkL=URX_{J1N5$B29!v6PTBH6%j`gR)tN zxZj8W9)-6h!#21k52nbiwRDPC)b$<&?8O7Fi3DCx0;`c+GC zyRXU_boY2$4dH-ge3#})F-!!U=Y8+k<8u_HJW2?D^8EqnJxd7Ho5?AxonV8jZpoUx zphR>9coDMH+hNr9tMd03p&Y(NoKo3n2sWZttL_2G>(F zate<|nYSJ}?3*OddtqVnM#_8V7(&bqVQw2Val+?=GS_7xD_4+mCk~U`ja@&aY7yzm z!@M@iI7TlV3Q2oJg+)CqydAlvxd2B3m(O=x8NP8K{i2ZQiaizHCCZXOrr%}c2`1kM zfwvdx1Ahxzl^joP@hq$_$`&cF`!_TP{x+4#F%RTvTkf5+nr)Ji9jCQG&R)#4vS4y6 zpu`&+1FlJv1yxDldE9F$YP0YihkKK#N4F&z^_=fw@bML|o8*D?F2%s$@PUbs^kI)) zJ3-A-W(^)4FqZ7?3S1o0&TB|&7|iCp);XnrsdQzc&yE75l_3Bg0K6aT4!;Jc1JIsE zp@Qdfs3JDsbSx>SytB433JbGsm0~+zMKmPFi!Yog5b_$;ICk$=@MESP{t3kp0eV}f z9=WnU>v~yeVTO&|W^P6FNrx4cF$qr0l`*=uuF9k{zfwbKX(0jY~nguFAsqQS?_AftEPS;Jx{qYqibhx z&V9*GVu}PqTbPZ=x%$p4MWQO{8IR^SxhOC30a`uE5f0~0_A?VE^47-<2Om#IDbJ-* z%t8)QN{=G<)b1$hud;03*AB>hb_1U58VgZ~&jul>ak+@)Q!1YzZcV?uf@NBbl%XylE{$J&R_ z$|(!ac#-PfDK(s@7H;GYLqjSWWJMuA8OGL3s%_JX=jl^-wh@1mjh@&Tp6t5XMl<+^ zsp-u6u5S(}AdqWqq!S~?1WP=Sxv~8P1J^dyuPACcz{Df&Vhmuep7^9d;4~0g$ zH0&vS=iBoRTsa%;Hvhop`v-|&G&L=Rthc9~unRA>m<9kgHLSS^{KJbQ)P$3!{?Cu> zv{cjYQeT3oaUoU|GDi|qx~YJtZ4x;YZ~xrM3`_(7Eu{evQN23U*etdJB}8m?Fi$H_{CW^ z=!v~)oA7N#|NJr$ejn`^0ExZ617qQ?<5C+(`hY_4a71Zj6|qf?=J~1t4SJ0@M~}#C zrCv#60g_xV##53+h3^2xigHfi%uc7H9e@LxtapJ_=QM%f*|K>?_!Ywc6N091olE$_?RR$ukd`$(-OuCon>i1@bDnU@@) z{hH1YsA-ZbhvT~AwlxHc!4IWF=G{jqhCGA82J;Xa zo=#j6{SQkNWm(uL1jM*}F5vn=CL^|o!MJ=Yz={inR7~ag65RlJv55VI@4#fB75Q(gHRUEHG@`ZHK_e7EVmh|6JHnddqWlPmbKH`wbPmr?Lrz-^@CWEYEgJFzPh(JwK zY>UOj@&r5-afp-3iVQ|g?;K0X^c3*oZ)JAyv+Mg0Xv1iAUo$$6kNGeUp{r)s4Fd`= zPDt?;%DK9U`mMZg5=opSr#xS?=oq8`COpdQdwF)g*jxW>FkA8&p^+tB@*FVbq#;D` zFNQGbYiohOIU;%>6nNr>4|vb*tPz9*cCnbbOUx8WOkxktti?;#c%i?sw5~SR`f8{y zbYA;S_#&mppR_=VR$&QU)o@cQO=JP2?DNVXEn{Y;@$_8N}E zs}k|V31_AlX$t^xNGArGN@6It6>1XhSh7X(@lNk8^q7>x=gh3yQggmhilH51aDkf+kL?)rC*Y>IKgC?X zg{A&^11m~GSF&B#9+;^-IstllwczSgve-Q!Tq8-a@s5~LhU|xCW3MM?7fM;8mtHY8 zud%9@KqNi0OGK@d9%Rw}sN^+Ug>?#p;(gl0LJ9Sa{5E537O1J}-$Bz=teRUMPm7o9 zneDqMBbF(14sEM+PkJqCjcp)mMbG`?x|_yuU-+^*KK4PzSkQ8|-Z?(X5o`EeYbYRx z7x=D(M$e+Xu$Z9o8wHcehm8vv5Fz29-=?05yc$>b)(H{j7n zgBb+&mQc5|@d6^XihT~=QUb^FEYnm>_h|Y=yHH_%3Jh5&%GE>O{i0wZWzkqJ!)TB# z24zuHXkc}{Tc8EWRD58TvnI~S!k}Wc(@fj|FpL#If@!Pmc@&1R-M)+KyXrlwi*|UA zrmn32TkSWlFc$3qCU2i{gdG!oKRi0K|8p)t)mxPQA4};U6S`=%_)i<2csC6_)}aav zqzE8V=LTBD4!RJVs1;Z$s%}#(zUW_Vl)06HZAeRpdkPTjm!c=2Fgg?@7=mAdW!lBX 
z52N8bv$)oFUj>0pIx2;?Fhk%+XKb*7;4J+kY2h_d;rDQty_oZ^i)Mk;yB-^z870NJ~~iNkakpXK~PHEGHDLS-k3< zwF8@?c@SX9!*E5G!F&c3~4u4ftA;;cbolT;5u-c0Ov zu9)#8T+hm)b>72iuTu5rg)hjNxkBge)FY&`A??pY;b5yZ)c135tab2>;N$tlJ4&ky zHdi`_7sHz6-C`L{y!hoSL7DKci)Gxk2z0hNQO_y@Ucxjk;{3vF6UUs zf7dlMABdUj^5rq1M<9#?k_}{&^8Qzj3DzLTtaWc(<; z>ZBN;SiQP|uY+{RG0j!qUi|>~jNrUJz5kOD+@(880(sJ<1>IRc zDDi!KZEz@j-4M{Lb6RttB?(j%JSGRD25hkLIPu?Ke313XlCMC8GnrnB6Lvn}J0;VQ z?+K)#)K$7oO;zF-K&GwYwW{Q4Qu31pGq@xIZlMIjpiYWKp$W)pYFTzbx`Cp=-u77N zXAj7BIzUO8r6LGnOC>PuUPi0D5l)$ZQk5^s6P^@ryL>5gbDOWKzlJ|br73Eg0lAlQ zwoqq{lrE!^zNW~lf~uAZCb&w2B64LyyGgl!!F4I6Z)qWl$wK-K=cId=2{bWf#C)h! z4wwdnNzxJk{!S6*O-&BG!h1?bf!7r(8u~u=sA0yvq>zml;#V(%9P$+e!_|B`VVyA% zyw7*B8YUOdp;s#ET(NW9RJ?a%+xyCwXO?eE=d!|?-ouB_RWq`*MQz)C}5d7Z%3l$!Wi&8Z#q#0Zm|rQimGlBE=$%qMnaz3e>g6ONUo8L|;m8 zXZM^C@Qi0BE;b@}3;#%|)@09&%nyWoDY7X6O`!Fu^;9|FH8jdR#Iho}QF2bf>z7B2 zB~Q8heGK_3poeuWirITue%R{m^owL7xaD9y;y>5N-yA%FpEl>P$RudbF$tjqh2G7g z5OfCh5@9GJ1&pG2ih5v;1U{L92Wei!1Odh}rtW501Ig%%HExh@HZmMAQ{5-_1RqDN zc{FLbh~-KRo$p7@IbCQ8J!c7gfla9dWz{~pWVybJK5Qtw{e>O%+sI2xvr=0M60F$t z2jen8IUpya3#WTrw^~NRyKUw{L=g}jLBY$V9-;wa!2wZ@*_fasVC9tc>R0$hU`G`Z z8OEPclS*nxMk+D|Q+Zu-%a=MRt_g`s=G*&eWCa2dKP<)lU4M4nxSVr9bCtNH-Z-eI z_bo@dn(Ey*%4-4Ynn~a5y^9+O#AL(tnaV%z~I!O^M~& zNDiaY7>So|2lY9n1fLHnrK<9=`T#QIKakjJ8ZmB;Wj|yBZ6HHbAqh=MUd(IK9VL^t zjNV7@^M2rydqzN<6#+pa9hu)63RZ9)#eGK%ZOn19FZ7a@qn1PDIg!c^&O_F%?>ny^@<1x z#Clj&2NAX^j?yqgBZ_OalPoLNA&30MQZf#5n8B;ab<&+{h3+MjaHIy5Yj=c(1VA2| z(1GN@gko|=W&xjz?9_)*vY>sevO*xNn81~C`_AWC0I@7ra&A0zfOPid@lh007|P6s zK`@U&@xu#ngP^8YlfprmSu*KI^=Y36#K$e2iC$fs0K)jK4Ft^}^q~Q+*8qNz1=9#~e(1t8G z{f6<=2F+WgZU0`Tci&G@3zpUt0E1KCKzd)G7u3RZ7o11AG2(b(>f1!q@lKKZNK_rt zj(5(@MdSY!VVeN_%%dr?NJV2*iCh{}e~pWkpphMA$af&>&Jpg?7DiLDpA6xS8V<0t zDd}TvD0);_|NnQ}h4EQGne5s#`PhYwjN`A(UjOx%Hyqz>`DO0zzE*o8^y9dDDqr98t>0B$GPyo@#_IoV-_c{)KYt^l$pXZ*_KksF z^NiB#tU@-H9ZrvS9T(Gn`IM@O=b8o{-AeRDnIjY-VGbEnYa7)kJu5V#HrCfDe}hV2 zndLnLT7ii_>P!9%$ssV`;LILHB*B~?4<}O{qY!^BvPB>0cp3(+ZdN%L0=K)52=qm* zQW^M}T__LbCIulFT4I6yq5p47Z#3^=DLhG1s)6|YHf(U1>Q{F9mgPAcN&}B=^)bN{ z1|@T#(Xx{Q&ZGA3^lcohdOr;^X4>@FIS=)$+UYu=5nqQb;~(8}Tzk%NxBodq6!f4G zL->X@_^u;xCb6Z!G&kd1(%4)~>EJ0~vmuk(902STVAK%_HZopZ$cwi*6GeW}qddaQ zwE{#jh&IZSI?S~Ps?!L-MoELkp852nXzR8Xx##=HBm!yS>zv-`R{nkxH^K?4=8viwK|NVap6(z6l_QR;725duc1Hl7>)GJFHSPAkQ&Rb`2A~iGy25$PRrbYGJc<##zA8Ot9>De*%B5UJD zHq57#T$}WRl4}(o=eW!hJ&y2x%k{xC=DGb#8DX!rL^n|Kc z(6y0gh>3Qt;J+Z)nVvgKSW!|)3^Fo%F7qYt6ycR%{_I;*)CjimRu9QWpJQU%f>}_z zTq{5bQ7DVj1Gm0qLP?PcH1BQZIeTlOEURHDMW$^I3950<3|0&vIpM_5hSyl~TsOM& zg74r%a=mJ?E_qEf3+AQPjRmkbe`KF3gmH0x?M*r+?D)c>_~E3w(&h?hw;lZkKDNvt zO^qFjg`g~T(;ArF>H2GMGsOcn8UXoV5|L_MZ+~gpfcq&=G2^x9jB~ktb89P_H#Y6R zx0XoYt37ejj?`!NqW5wOM!z?B%2lGZ8ZfL=x;f^iTT zNNSUbpxvduW=V*+YEXB1;1s~WWxY@VKO4RcjqRn)dhyln@h;?=2?e%Azo}_IwS}6D z@QT72ztaw+(0M|p%BI*q9DJV2j`mQL?bv<`Gl691PL}dlw(mB?@dls);F2M<6Ao+T z9A_&AgmiXrE(J%bP8y4}28^6xu=5@Rc@+(CBaNP`MF;D{Hlu~Ho9jIe>q>?DqiGFn zp@1vNTwUx`|A7^7sc!`ctLQ#)x#_? 
zpUj}Jki~VF63YYyI2%qQ5r<6>#T%h%V#YxNHuBR#YbA@BHwgzZV)Bku$Fq(q8LM4&JR}4P@6oIG8|_=fd4dk6(tao5H|F-;b_5{3>MN#7930=_LhkX=jaf_9ER#JWxsRnCu(iBy>{epU*YbVLJ*^PSZ^#P}A{9 zrG3uI$8Zbg!B>#6BoBcuo5)(SHU=EC`EF8$onU5E^H9$%mW&B$l_@#5wb9_F*JV3R}qtM<`u zTo-ia7Y1ZWg}#$aWLBM7yYD5*GlFPmbF1F6Ct912@X_+AZLq7`R$2s)+|%1?TEtF* zPCSG+jH-(!1R)`uonm=;GJ7RNwTw%@H#v~8TKWTiuvD^oKWtb8SOuH=?5)xgJ=Ec2}3 zsEoRhIvJWh{$M`Tlf**Hqh}x~o){?(G`{-<< z_b6DWFKNt2)+D<3!Y=7q1uuvntHM;L12KgMPp|=Rn1KPk7oHDwhC-CyIi&EktWt&N+hvH_;gAt(yZJrMDjc~Z2YT*sNv2tHDrNj`%-=4L1lx>tl zV!4ls7MycPSR{l@VwawWsE`@DdyL+BGl->9nyT%6EHsEUvF~|_yHMrE^tRiri zHCIgW|FF$@wyE6*{fcHU?clV&H`dkKTZi0N#h7+KZ!!&@wq|mFO^TG7GSMT^+26{| z7W-Tx*yc*#J7j2(^r(3`<-rAcy$yj!Q(M}(fl-~92Q?p_`=R3l#|PmN8h}>qFO6C| z2@ncc_CVnyC~z-LLVN0= zjwR9`1C%J<6iu6K2YL4#Mqfx1FBONHfYwwG%Dj;Xn5Ym)ItPPzCYY&uc*4EAK!*WY zrCtOz+<{LkI%4iA;ZAmUkxIEZcisT6K+vi-fpRO)T$-VJD-MKHlillJB(<_Ht3d9q zI?u>0R8%42x9CJv^}5h7pHK3x;d0Cs?4SM;bM-9Cpt8w#5l>*e6sXwc?iIAT>(77m2?QLpmRAriumw@B5a?&r= zFwJPG;B%57PgbYI#tX9gXvuxtxU?nuz=qY)33Wl@!iC)ahOt7lxA*ATp*1^5T1 zsn%Zy-j4Z6-U0uoIN4c)S6T~m>gGJXV8HxMtgYwFeAC{KxyXknr%G60$lbJz^V2J&rByqzTf8i=0J6d zQ0lN0#ta({6M{l&i|JB_=cR0|vzjw%rb{Vk+Anf#-x2x%{JK+M#Y*Y&U=IRcTRdhd z4SFd#ax(`u@G&GOj?x+)@2rg>8oK8h(ZZ>D=}G%!Q}*5kg|Z1_?e^X(7_9EBSZu(3 z)JmIXXKvp;7%5Ip+$l~hH@&kgca_{2gcLh$@nyNpXJ6=i^Zq%oi}({5x5^e&2(hhc z4V|k<)v!NT=w`lIV;1RI=5+tP)EI(2RiBLN^sYNhmGBYsTq-VNjqIE&h2|!<@%@`M3rdX6v4}2 zX$aQI>GGa%7P5hZk~|(%RUoya!GBi$K(6Omscx-8)8!iyZl}oI^A>AIEl0=3|7Zxm zv;W4e8^{rK$)y8l*Hct zM@#DU+eh8BHtQa@^;Kn5KDw;C$a8kg;=d$*G!vo1;8$-`P>_!l*YQ2JB~Ogv;4p?7 zge{uzr_L2oZaJ5J<)l)blXw(q@3BHNvy#z2Oe>OY3K4-_Zk}fW1BrE($5L0$fcVaX zT|kAN!-^9ciQzO^r=i1w=w)(?tyA^iTRi@|0$jp-S5Ge zJJJt~x27Msb-m zb(LIe)8_Oh?mW*?x_;j#fUg$-FxDS7rNp4gOjGg-LW_+&>^yakOu38-lWR2fVL*eW zFf)V%h=eZ%|Cw8Nm0p)726Vha$<`{QWdrSl^yA;DiVQ0W8@wE3lX`$E(de%w2ME5$ zSycJpse%(r=`2ENnz434q&T;{mRM_FO_~&J1W|XdkshuxqvUDnp=qa|g|*M7ORkhY zcqDw-$RUaQ(^H`X;h#~1u)GBK!AB3Lmb16sw*k}<+~xI}j$Sr+`o1}QGOD4Xxp6%S zWt4m*Y=QRN)@q%vHie@*S90pf=RW5GXpMx_kuM4$-K1$JUVn^RIA?I9ta zDP5^4C3dW+%#EPpHJ+N3X|cw^4VwYT4kezxHljePD|bMuq0!)! 
zws<)Q#6#h0qYqhkGZ$9CR8vaez$#m8{7U2D4F3;d(D57r;OIKe11`w2A@x`u>S3eX zmq+AIHZ?YyNZq32^(9cOf$n)RJ%g>Px-xiJ+4Q;>ha$C}Hh~1dyZ=UJIFeGK9Gs?bLY>IBxI4>!spf?_h^6jXUINN*eb4aKUy&+B4rzi+?3; zpfwF-f?C|UloZZLX00r3pnHsLba0*}Yr!b^JDW4xCgy=+4-p{3u|6A2R*86{Nk}xb zBBE}T5>0?8sT0n7d)EqgjygIHcrvo|MTR}#?oy($?uS=`FO?woAupZ!7b*f$qwI1O z99JrQiK6n2S#Or>S*OIiRF05U*D7$f+S&sDgYUnw#37)euxrnI85~4k{p2U#l)=F> zyJt_I``$Ne9;`jI_a~F96JFg}**R(13!VSxukLxQ|G|ec{^r)-{Qe>3^`^V5F( z&M#k@{D*xX-+a@3<>TL6`Xpw^{~2pg$HqQ6{_)8tkG}!=kx&C2P#2ht!sv)&>|@vU zYx?w?DL1Z;uS>4qM#~QdPjTTHG%sUrAIFMvOb~LhGBUQVUrGq^K7v{CObnR0hf)D< z1aYL^Wt-u-X?xld+za)ct5)w#$0-QgWS}t5ucHpW@{aeBuIl#YM0VwsKV7+mHXFpL ze%q*fl4qgeuE}*X-(2IFE1UE7P{3}lU9orKFR&#C9TTT@F3Y7^Qs-;4&Hhh2z)E~l zMRvI5P3T+Si_KKxE2g@WzMav&dbotu`*{=Q%)sGd&g0@0%3*^gi9P`vKtSx@sn}8h z2~XfijzEGd5mZF8rtD}{6NpCpxPVyT$Aab?Wx=MrBc?i~b5@IFm{7zSeh^t=qY@!E zOV|ppPX~CCSg;}g0j?@r625#BbZjndVEQ5`T12^+f|>jY-qyX*GNz3n`EwT;ekh0} zNR#h%5iQ9djihe=#t#z6e%JI9w})d4B}enlxO^LWG1e29OUyr0{C(gq2i-ehX;4c& zxfR6hZ5pelMD{D>Y$iyFh%Gs=(6fWfNE^9!GW(E?IPs8SGi)a3GonEgq|4CzZg2cP zOqB7HpUFu4@VUKG;&hgqrRuk_0}{*e+%2_9ucUc!|D>Ls`Tq8sDK5*cOP0W{m@e&i z`Vy*1ZRu_A2%Ekgx8)L_=sdKof5T=lv^P?pt~>}uc?kV!H#;V{kDlF4R-Dd&*GCs; z6-}?hJ(On%UIPo7#O7R^Uf0h5XSr{QFv0SoBo*n6@s5;L2hxx+MBzytKDn+|S(N0# zHc-h=wl&d5wK*GQUcH|kYDtEdg7x-nij_|9d?`1%bI%CkoD6xP5Bh^bQnG+jzY-AK z3$4iA0CBudX$%@LZF!mPgKyd05|a5|qk_Z*pABz~8UHlT{7Gi$zdH7;mgR^*H?w_f zqKghw@PPFd-3 z>Xzxsnt6GEd}o&LjtPHu)}WoRg#I`&7W|XM*qCPvyFwRYwC60;{*IP{)HkEbq(c?G zBpXw#ecja+of(}xa_74|I)?EDM|?Xm=Woi*6fRvQA61T(vYgdu{J-HLhin=O#;M9Z za=qAJ6Xm(84upDxHTAV`rO`?S9wQ0x@a~A_K-UHLIBKtBzQc>!?*4`Ru`50oA&!bW zN$VV_bJ0X(b->h=3b8atOn{96{`IuH>byQoDhC=;JbQ3_otMXb;(PhN&zmyu701<& z^WM+11aF^-nb3WD=EXGlXq*$`B&ki+pXk73Oev%R;$c!&w>ypivw`d&YG_ov-BSdb1o3})>Yc|oZpxBZ0^8vhum zm|`WTcyN9`-<@xLG--Q+7b8M8%*)7izxOy885QljNv1iIW%#fY5k6ffR^kGLtn1CL zoa{-)+ATpqKK=@box5IC-+wK6Ym+`q_0181DV+&fLY>a#TD)t=X$))2#+qvI1fmSWgO-B(osC|2-=%4u!yD4+ ztam9h1qyX5(4MartR3))`}Y-r+8$E02-J9x0!{#tW_o_j@iXp}_L$zgZ@o0t7ziTg zf23=XE)eq7OUVsYTr%Z{tVrX1!5>hqUg!{kJ_01bjfH6bxwQnl4zOr1;+{vg>6EcZjJ&)G*BF#RO*yD={TeJ>bqO=>`x2P~fjzERw_P@?DZ1X>7 zJ!2Wq3pZ+?IKsGzUUG~Zy8_nmZ!9;h9p09>N6&4a!E;~%^bz|L$RUr)xSY2qYELEm zFdGD$$fVd|)%gR*^B>@$5428U6EQi0P|{M7kzR+m!pQWC4Z^Z8?MI&ZC=**Gtcl?R zI5G@;mT!uooD7yL2D2-9xfxinK8Ir3xq|7(mr}K6dv<=SUSt8^r{I8T$1bM9VQy)J zAj^VazGaf$L}4mPhW}uiBe!9O)5xRi1N-M=G;Aw*g#Y?2(dEnc=%gC#(1mWgN70#OmHeyRuF?*JNdAk*u~JN`}zjp(SvOU zeh=xSkH%Tg;5h-8AJFvpVb&(|%8yAA)LJyq$V4wU)}KJJv&&b;F$FSuYY2dEoZK*Y zYF=MMjC*#`j@lJ0ki2S`7?G=n6R76tuFxv-g3n`UHX81+_#8nohU_r}IW{`)3Rk$h z=R$6AY2UjiKR>m!H#X!$Y>;^@74YCGwrP{-SLiw=A8n|de&OUnXLhuAU0LAKHrTGB z2F@h!q3{AvZ1*kKcOqB&rSIN|zR+6D_&SC1kOUw_A_pK3xZArhOX%Mmd14{`AUs(y zz)^UvkaI?log8eW88gkNo0EZCrY+Dy`X^$y`tY5$A7cs(MOMCE1cvS%lK9syFhHJR z5?9}DVnj?%WHq6151is$*ZKQf2ZH_S;fbmPw?F9cZ%MCh-?u?X;tlz$OuM*25V*b0 zR$*4H*vrU-vVFRkW~r8yb6X3+xJ{Jn!VpA7NJ%0k;OFNi7oLArMW4O#mH;8^Bo)px z_UnWdngCR9>V3x3De4^^ZMtQdYwR!xhzYM2-}B+d&ay_OO zn2$pxuUnx%W(5tH63g`;O!T-KOLCDacvnk;CW%>py=cc=A4lO zzZFre{RmBz8r4V?!-|@S{6vDxWPC+DRkh#?c3K}2dETVEZ?!2 zvQ4MGvt#q-*1pATLQgaf#A9Dm-p~$pu6KUmFf-PQddjAww=OXR(1@`kHrKrBLQdU)I(G+c*&$@ z1YXRoi(OL=A+|Z56?vQgY)B2CxA^`z)-|Ew`G3ixozeC?ZqMNe?V&%OvG}sI5krUy z|I?3qCTDtkTbJr+@I{wkqX@M2i^U1sU)ZjLG>_m(i=8H@$O=4jN8&xI>WSmb;d<`#9NwsOzdimK8QdqflW66_JWMBFZMmLVWlImr2a$<&v4e2wz>*lq_b(13@@S0_0l9%kf z=Ych^fzwk%4CZ>CmZA`Ju^kwTpecT+%r+vHkvi&ON*~q=ppbyUB2g=yqLkz+V@_Y+ z%KMt%N{DAR$BWMvg5FG?$KtHA?|EZ&56rwj@bLBzZ%(qDlxT@G!x|cN9iW3&Ij#^x zEqHRl^VItdRWTR5O$D(zEPDpXj~D-$6p`a?X;XwbRyM%ogI#)2NXTt`PjX&gBiDn?nVi`%4Bt-E^6yY|llocb$0iKF6DB$Cd+blzc8@ra% 
ze6LSk1$$?2X?6Z+sRFGSAuP2Db%=XQWYGB%vizF@zk*m(djyFDbKerFjA#a-8l#1S z)n<(!n;v*kW{ORwY`C&)@j0L7{&msdd=~QB&iQI@acDMf>B~IGyMBiDZL=*Z)0Uh1 z+n5h3#vBuV%fv2=KeWH(TJlQAZPSF;s`bT$`qeE9+}^OSGPc-epAmZ8vc`%`Q1+MU~#{{84fiQRKOO$9~nqxm*A+BB`Nl!TqY6i>>X?_)73cO;ka zu)Qg%F20V)Sknk$ceEcGBnmVt$0!(b!8rH)`djh~bTCQ&mxJ|WG5NXFmU)}0j#}*h z1cwj`2_Qpg@ObK2GteB%vR#sWn|Ng&s`M%%p_mv?EEYNJUUrJ8PT4=#3V8R5RuDv52wp?7%sRSMd zylEFq7oZ(2V1;F2OLvb9n@NZqhH&ykl*va`kXM$p+xAlmEGc%oa2N9W-fe*ao z)@lZ3$B#xY9gR;NdAj%n4HQ2bo!tW;pKW_PHT0{)wn(<^b`0YKx z?@hh{YRD9Rq4A`U3Yo7^&{HT-^Qvj22&7?!;y^&_QbXq^CsODL<@(;_{Kn*HPa#JW z!hm8G2w#}7Ck!5}u46~S_d6#019-ynmh%y+Y3=CDh9v!Y4Ad-VLN$C4S#R05A(w-C z`_ng%+J-MU9nN!k{pR31)@5`QWHa~CamzDuC%iRjX|@0uVC1Z?`H_qQ3%L@AyiQi> z`&7Y!qE*NNjEMFTRE4y&*>Z}fWET3*o06cUwuqBZpYO`{p0rLSsMCSYD0<691>aEq zJlh2p{j#_A+4ITvL@Z@ zF}5h4r10MS)q1c$UmwVZ^t({K)uJaKGfiqlDHK&bUvCvmLkdbtE=(7<7CYm*xhqc* z)xe5n0h{oVQEKzua1P+hj?IHTlpVBm(hB6CCUat$DG(#zXb_4(qyQ%ER?uf?!D_8=cr82E6HeI*lHW z%Z^{mj77$pt7*9?0Zot?I_(9W*x6)g33-i`q8vt~{Yt>z)AnV}TC?f~uu?e1(2|BAR+Sl-{^LdjRSt|IhZTKp@lbKtfzmK6hORLy-k z#j4Sxh=i&ndyR~S3^r5AmZGL1I03j;`nqcBWgjAPE_r1WYp!w&iGTM|ljzcYql|bl zVdyyEHZGLzyhFTO2U2Q>yuWQ5UVTIx>iq21$WN`mp2)xDNIS$fNEi+Ry|7%21hmMo zJTN7YALItGoQM5;4j~Ru6cYdAnv32-f={Fu06W!7%Xk8}Og7jR^<9Y5)>mj}0V|ox zKu!fLKz_f@-e#R|mS`t?Jt-ZYguAV5eyR$dr}%fo^|U#)YkByMggG(aK4U(9pO^Bi z!5&)>2c~#iaKrN?(4rMhbLde}MDKN^BxMZNCFYF>44T|?#|vD>Qo#Fhfe^!peDVov zl;1C#Fekw;zP(+hx2!O`12xu1IT=CYu(cK%wUkdv)kfiUjBrD8lC7_j0Asswcp$QF zk3Yai5)c#FtZYFz%9bWM_=}>3`WjiAtYa2A5(EZOLkY>)i}x8z$;Cn>RiHqLt7hIG zk&&{nhi@#%ezA$llGrN5a+aTB<0>`70CH)}iPg05gFDc$MtbU<4w25hIpNSXQ+_d) zcfOh0Vu-0k4{cXNCX8-Mlu9CzINnvY_V%ZdK{nD0Fep7Kc^siOA48WswXR%-7dT7s z8S`0@?$F1B7*kcghSZZqu4JDU_^XhzT~%?+R5{1W63)ynpY@R@W5K zDhAIu@7^*-Z?}ob_<63}fB{2Er*<(KIO^1VME@@DlPfYNC1$}m=tF}(zo)jf1aPs; zcah?MkoSehi3`}QGL+^_E}tqZxp!)iEh2W#mq%|&^oY9c|A!-dcKLVj`J4;zA021) zDav7Oi6mZ%n2s2@EZmlSVZYO0{6Lxkh5=APEMPKKbHthBK>DlzG;cNWb8in{GDHuC z4>J}0=}4D#DP}MJdy0A=fETlaBj_#HbDXRYuAgE5>T%8&`OpayFd;QDwU<8jBPkR_ zTa9Q?%C>YoLR9N3-5jqY%>a(BIImcD(1cIjTQrwGLK6ca(V@=iVn{Ms)-6`p8!N3l zy*n`o^=I2Y?YBj~axIM!e|^;NyxV3A^%<@$@pRXY59-fiu~*Cc!f7hj0dzp(|}_l0iO9X%fY zXM^o=Px0>#hjL4`^{Nhj(V9h}#u~(*claLmHD$3845>dcRn}qyC-0>a``+ z{K}W&7UR&T3iA zOylWm&}p5NZ+P3zWd~mcV$hNoD_U}F$>~53Dl2e*POg=(i;jr$eAE+1X696oezlMhEU8a-m(?9`IXt4cn_u~ z+Wt}*qr^T6JkCzuw5l^B2gBu^p@ZRnjyNJq^Ln!bwaIR47Ta-POh2nFriiN!ohIAU zSxdm3&Uz2I)nZ2i$@}E)=GM6|=kev*RSYu^_*9wjN~#7~K{AFX)Gv6~;?WVVUDuFk zQO0PLhp%%PAPrS9t~mCp>_ra-%|mL%J=bzkvW$b_C|T=%H>GT|m_gEQEy6lmTPhkSE}2xQ+$%O*iNA=9h?Hy%&nN7Lgp7dN zO3(s~B*0xRnu1F*F!4lnhL1y47N11oBhVac5JI4m@|niqQ+cPx=6d{)c7OcMq0zv> z@TR=ZzM;VAbbBv5kCxC zi*1u>@_5QTQ?PPqyrnybzuY5dSUJ=K=XF=-s{8EJ$c`l??RGhA{)7Xh<|zY0EfAlz zgzhQHWu~fv+fO@3N5fGroilpIIpU1?S4Xqhfx>R{ZDAO$d1!2IM(=h}XXb@&$KTfF z?YLorybKyJiF~F8{%5UwYU)us=cA^?cPX12mn=dm?FaNr^An?InL$;M632_7Q9b>+ zxR7%n>S?B_s0B#C>H7*zIy%Id7z5m+$gAVs^SxijQb`nS;K}StL9k8J7kC|c_*Sg> zV}l#{R&g?(0jwnG;){-q3|5zNc4B`Gzc;fuY9^^Gsi4^L#LNpADpd-fF8No2tUnBV zlbdRgPk`=~NJ`x>SLoA31OC&rE%J0;Bt865eevVAdRxNyo;Nb>eec$^zGZ5EkoH+y z%m9NWVZ9p!&ZD0^tqmKJ!Co|d$g12R+9mK?y< zT$+SUojdYVdx6K0;J`_mpcdFeVxji6F4IW`pymsetXN!bAnoE=NIRfo?T~R1gigRt zdBxcmAoo&`fGlNoAY(utPZ~uu$seScvj4S z%ExPr=Ya4SU;vR~!0$b1Uy`LABWUE8O|cJkvxq`t*?>YiFW&pIuwoOtDoa+zr&T~P zHg!X$UDJ7A6@o);+UN2A~*lAyUI`cF-TsX3_| zOD}b&JYwmNXBSEj)1tttc$H$H!NR#C$j1k2*uL=fho+m+{^Pjd5ejJ9ffS6zAxn+jsyp)Q=Std_`Sw5LMxn+4K&D#na4GBMrET%*x-qPnBCQ6i zVH(BtNEIOCgOslE@U*DoWQUBYoA#aQK5IO!bIttQmnZ2HJk=Py;oMv~Cz)D`vi8jG z=7h#J0Y+&;rH6J^ZK=p~nC_jiY|Y+VM?1sN>8MW(q&p8gH&mLF18WIQi+;@iDLT@) 
z#Ghyx*R!$<9!d7f0~=^XPM6K_?a|JPM~;Ja&cHdm;v@#|E(P9C2H;Dtc%kxC;C0#I z#5CMTN}_yp;Bs@kuPT0A&l?DJ^5juC}w1N)tCCUE12=JI4q`Lqs~B? z%@9!aUf`M3jjpO8ubWSh&>qO2=XJ5ENoF*O05Fg3UNw4qwR7Y- zXC!F0F>n3-{HNIVF#Mc3@>y2o_lRzq^L{>UWsuU12V2=t=g=Yo&D3qimL$H@AhZ3vWF|Z$L24)l6>~h* zL%oOXyXC{uy5;p8_V1*95z2YL{XA$3nc(TP%fxGH-wloqvD#r9$9fn95YWA@!-4Nm zXNlrizL~j7%BbYo@r;w$)lrq`>Aqms(<|7P+q4ILf%y zUw_|^Awltic9q~|z-w*k77;S<3P=F1r~Rf<9FZv+Ut1#ZPAXp8;J2=v!J;Jg3&{O= ztk~3v5;jfD+xeFa$T$p?9HLi~(y~k{)Ve}t_D)A2N3@l0ndcIl(5dRvz{}YpCZW(T z+0Q0Ujz&;EC1Nusv@jAN;z^WpZD2kml?J=nsAZ|Gj-ug)*8(Dn4*+E?{LNg7O3rqh zL))Su#ZotJzM=@SM+fyj_>2g)t{Zb%S$YDm9#61dcjV3N;oZX%ZPt)Vy!a4QO*HZD!o@acp;IGLR&HW8d|*>-i7gj z?`Vs=Qsy#26PqZ`%Rqs7Ebt<~VZ-=j7BiJBNng3Url9l@_l3Chp9rWD^?%Xa*`>LY zqp}6|sn`i?=sY8IKJI-FzAkG|JG_)*slA6e`Jryt)?=4DB*A0m$J!LP+4#%}7 zyv3K{ie@zQyEtAUVv?np_a?9Mm8)-+!}bE!hi?T1+k)=Ab9|O^yfNqpMs@ znZ>Codh0B~@>^=!WN#dsd9K7o*#q$X4iw0|XJst? zU*DQK^*+b(@pESWK-W3tpDMPl{*!lq@~uY?H>F-UvgpqrzHqB+)X{o<*VDGakB(h` z>Dc@(lf0)wJ&;Ex-5l6;n+$|_ouok^7rLjZ7LOkAeZ~*!Lk%jmoO`wWgKfL887KRi zRU-{cj8{*Z^#Fr;k~fuXY@;c`s^x%vH%=zDe%Cod$v8z(MZ4g>Us3HTbSrD=qKYOEH*rFse3ge6Y20d@IlIs znE`5`hJ?m;))JpylGXhN=$r0Uw#jZOGqpdl+n4^JcHZ=nFw>Fp-||4DZ3!n}@%36U zo87|$PR$wlpx)W!&p6aKt!QIKnO1qi^w0x!#^rESn(KXZ35Q_!2#kao7$U!Ism(ge zgdT{?yB%UW%)JXNI{|_nn<+Ek=51FqC}lUI3_wYdS{(odC$%pYR887t^@JZ0Qar#`>*w#0QW8@P4jHK1EV$q)X6$XDGL_WG{-bIW-yk(=GZB)5@ zO=g%Ip?LVx%UnGP*=y>K<4!XaIb*v!Y)1cawRkKivbgFaPv|2~F0c0;nEGK`E>0rWUeegY|FtcQr0EL&N5wRFYe82>ou?As<+Vn(RpHPRHUt%Mn9b#C*uHSQ% zbnM3F4t`IVi!$l zOEWcsDKSxK?kU?lSEYAWmZQuPS;57)3F0a`tfk{$m7L%Hn(o?IzYeY@wH>0;Ut&`0DgWqfL{jm zPnZcNq}D-wf=~}vd6`Hq0CT0F6JCIOezJeUQYK3~k$GzL4Ba_{JUki=itQB>ebv_K z$P>j`8~U9|QY%_38Q99@B5q}xEQ&-UoCDx~aURaIZ5GN~iR>lWbA=laD+CWOLGEBM;Aj5J-oZT^@OP2zN zqm;nRZe3c)Sv+xVmm;WhcH%4gHM`)jS{T~~C)xo`QmtFI?7)*_+SLlW0(2IXg>r;c zl2`vCGhd)2lp!AFsTxVVA-{QSJ(j+o3DxOPkL-z|co*|9shKgA)oByuYDv)t`T86P zo=gpISRWNSVps1{UU7MuuuE#twuX-v59p#ho#ErgPcjbGp6PfVzW@%%%+5T@*d^)q zQ@;GP&Kk2ThYz*o*b}+)yy=~GDmN^bnH06}caWY|X5ml@%K}5eb09XVlkHgCzmC$V z$GXnJ6ezSHD4dydap|n{$U;`!P%iaMVPCBf$JRWUFUJPf$&x?xJF4evT-NPu9DyXuLnSzpn8;de|<&SbnUU*6Je%FbWn(hLHN z4%~Os`BAy;HP0`HZr=GQ3(1eM>4b~LIGFdNC1M;X#geyL(EpB2D@mW?MkrmV`T2OLV2Ua)yZQ80-FIGtalKxlKpQqpzig>NBs$iD; zoyy&_-j7RO2CHm$)o(_lA4l2N@$0BI0QWD<{6YIoOYw*Mj7v#)$HbM!!(Nv1=(~C0 zpXQ+{d6xl101EtdQ-Qc72T(TGp5SizcG5F}ZBk!@wQ%pD*~(^yQZ21>p$g-R4ecZp z((EZOE0EPw7Xe6xZzA^~gTRv2cY)fP%Rn*{lRdm7`uby;Mjh+qS~GiFs8nK@WvI}d zXQ#bL2KM|f(!u$z;-AP|6hwTC!LA6aG$C>*s3_GMm^eOX1*2J%Nc3#S1aGLh8h{}y z7y#WR4Pm|`)Ck5ShfvOq9P~Hig~x4O5L?(DKU~hRuONJ>xW%}47W-~=`i#ybGZQAM zI4B@NWf_WOWh#<0H;tKuRn{zst*^o`zaUjC&mPAk**dH4#HQ5V2hu3+5?C&r>tDo* zRZJ`h{MLpd?-j&&ME6H>7oD{%T%7l#vi-$BF_r4GLsPCt?y_XJJQ|6P%JGe26?GTy zdK-mGFC(K5S;`Xq7e1N+-H4NQ&N&qwlyyx7ybSc+JXBV-3o6spDj2+= z{c5j=2Y?Jh6qR8+MN5B;Ga>e@wTTM8#9u@PqM(*!Es`9mDb`g=P~@H`V9Cz-Ty=5l zU{EsS#hqPF%m;^3Dx_4AK7wm&%he_Uf`kJ2R{IES1jCs5?40nS@Ye9^;h&BiDPG6D z3_hCbO`L-f6ck=6S;0!cInNWMDKXKO&d;vXoL<_CrBd*s6gofE%)S| zzjl4j1#nd2evf~0GM9^Is64gQjEwO%=1kWq!3BiLR&pJ5#xR}@rM|>`yl8RePG8eK zvpH684%w(Tqi?o^f5zU;{}H)qjCvx&HqAErmcjOhG5nJM{h5chADR@bMC-#eqXgb6 z29`K4l3L<|N$r6XRk~{-wt(&9eFTf`Ck*bh^jy%^uWN|+rRQ=Bp;T1dtCT6d7vjbv zB`EJm=pBowgZviK+|3u;XUr*i*f{baU!~U1M!Yb`BN$peVU+Xbs5+AjUnuQxpMt;$ z7MqX}iQVzjRzc`+su0KRxy+2wQ%FRn{DFVS(M9Ye)2mdIFy7GxP{~s15mW${<=jED z+%Us8Yjx6^@b1tPv1~oT=GBcnZVdGsL!bOKbUbncN~7OhZlxcGykhq0vwE(iY*)=5 zfB&*9sD@2;$anP-W(rhC39N|X*HP}UKD}An5U6YGP!n#bWP-a4631Y&2uiGAn+`U* z1s7>BfnzYKDS^e zdtNRf+yg%Yb(ak>wdH+~ZN+xgjLwKKdKrOSs%*WWT;9vho1UDCzU;Ki9Y|+V=M#UdEXvrkV zNm&8EpGU9AN|cwPH0PMu$IH#il582<2xny@ql|ndBF4@h3|v8H2}n}lVn}P}V6w}n 
z(TmUQsrmKi5#OQ^mGyfMJ%lF3K? z17G3+_xYp*$Y)-6Wn$5!SL>30{KZwnhU3etR_`3S6%BeKA9%w1SDw}0G0AdUDt*{^ z+XVwEla^mxOUzf83DX0ifIti~PjR;d8D_d{kzKEpsP3(4y*YdmIy@FTiBceb`;($1 z?ZjKW-Je6<2W7L4x1U$tpP$S28PN{#>f`=P4>mD|F3L5525w&xe1lQGl%Tm}D$vp- z&=4gXCkSr^dX4Qwl}p?$F5oPvvoY3L*iTVF3?s4mn)yDD<~L{YlVmLft-WkA&^SXT z=Ddr-$7XUB&)`0A(s)x{u;1vy-?q7;e{U7uSjOO5BAaj)g~LNLKC{u3&TntZWcXQC zNKku-2xw`dgngCmYYfQbBO|=K-@P;svY{+)p>VG#dfu8dXkSu&HEsIKbPXrtg7+6~ z8rLJ3&`NE6_r|GcCW{_PU7Y=+#qZ-~!{z#Qzf+4mY#jYjV@vomw#V6q``Kvayz6V- zaiuA~Pu563DVWTSwP{k>vSx&j$f)=rrK{?m!Ely!d7BtJs$F(~j0B1Pl%#&ww5d-F z!V2Kga^vAR^oCBS#R_lu}66 z7S^-^RpDNq1N$@EAsvWaMrW*~i@Yq93QM8^oRNFyUhSKxy>fX~3zA_5#xCb8jIQVi z=r-h9x;Cg}w!VQ+GDmxKXVet^rOv4xk2ZKd%+mffJFsk`FTHpL`yq~#3~5jSzNjlN z6k&@3>4ITd1!tV-!8K6&R$evo64PaxbG|W^=5gVT@rA&NzAbRga&l>~wX-RU83O?< zFl9P%-bC4n4))ycdGIC*6xpcgXS(afqv6B0J37|G-SK=;fH*_f@}hm*sCwt44P`TA z%hJZv`EWDSanMQV?nt5zbLT*+^`8^u6=e*OV_$Mn-?b$&{GVv)na_0c5Vvmtx8j4P zB5mTnTN*13E3zw;cYQEZ0ZD>Ui$uiHAp#tMk+TQE3ngDj?y0sy1X7t0e2^GeX?t9# zyfSg8>{uwa%6YO#jIj%-zO>=Ot4&@MeVvp?(;yQK`d~@}9W6Z>4cZEtGAc9L69F)3 z=ztz|6bZ0r7~B(Hg-OJ_BjwPZ*mtG95fV&ji|vyuw$O1~?@;k?b)hPO(laktnqB*N zFF@0qPQl|B$cn5+Gk4kbKss$L-*OXZ8j}ZAoq)~Pu?V<5@R&3&EXv7RPA4RZE7^aQ z8<*W!|D(VP9s9W9*|;U^=qO(*3z)YH&35MdDQlim9r#!)XB!%;`lVL1fmI8{wLbsy zUPx)KrN_(Q`SALYI~xsnozcPO`}2#p$@&WiY{Q3Y+p6GaG3b|g)#0n2+bb3)G6rgU zU|ELTBKG*Eb-qo}shZ~bQWg*k4lqTR1Hnm3>O%p@d~_{LB{{=P0-oXskt^f@yPV$^ z5oD~_j=?Uitf8AAI~m($%b#!=w)*l`aYm?8YU>4T5(*e-mlXO%eMbYbg8Hk6_K-!4 zQq(9x$`>J;HStZ==vOH#AxkvzkwS;UCgZRf`2 zRWIN9d+(px=ic6&bo8G;d2q42=lhqJ{dVV{Jn{bAgx7X`{Ox_)UODq{PNNnK{;$;^ zznRxDqtTZ?>4Sd+^R?s7Ug&V$fnyfW20@uy0vE{Cy+ZpYH(uJEjPCLmOjU&?B{UEj z&>KKHlZD?~Rgk>sY+w4ioYC7qHAhCw(VN-M$h6|y_boS${!Y7DpNBg3J!ACdv^+zX zDF;&Fp@aOM8M&|NyQ~X)J9x{?7N4MZ^R{&+5asPIg&a*f>{`{%#k;oWuI;QZ^lnIJ?Rz#)TN`We6}Le!y)p&ceZ;s(=(e~x_7%ZhdHLG_%t zIXrD8bp)sET#+pVvEE8)YopS-%ih0g7;MHL3;mhp-!ZX*Bt1$77n|+Sf}s1`lnVWw zSS7CJ@mN;(@?Jl3 zY$`JhABsv0gM?o)i>rhih*?$#cPoxvdIjr&=sOpJ4N@w4cMh`Uw5a9;1!(YXe#l7|MIMCP{IX|}jxMALjW1M+rZ6wAkvU-o3j3IiCu zxXyFWC~v4z1g{LNO6T&$R$2mp@HZ#2^jR?`HmFg)&?jV=D?eQ=J5qb)h^dNNMgS7I zEtn1r3NE=?Ee8~;m^CUU+f0#?z$w-1b@+7NYSMvBH3$VOZ6+;EVz^W)q)|>?Ob0q9 zbJ;?XA)Ckfb5FF>^T~kK9#5>9C1}gub<476HMNyMbvIU0qZ%yAs8w~R2%XRrx%OEm z3Mq@%*0?{pC<*2b<1uOWyW-gSsy!2zxwrFvIKdU?XGe3rY+>uJjKI3tSV4pNO{XcVNGskOhcc6M&0^11jGHs|*2lD<8DAG&0oHh@J za>UTf+8_8ttjrcJ5j{%_93ToQmEm|J`HN=Am6SYED~+u^XoEQ~IXMPg!#fC}!`fli z6F$=+jHlAaWUdD(1g=cc3UD6PFrZ@q>xJJ#oP%G`$I{&HeU`+rn?owW{Eb!Z0=cYv zLNG>HdsXNC)#}i+$|2vyjt8#G822*Yj^UEKC-A1JvD}pq@Pw}beuaM(y)`h@sT~F> zyEEVk)A8*FTv;Efj{YY4JFu0oGyI;Rlf~1mk^bmA@Tza-JMiTM;PU40l<&uGHN-hW6T`Y>Z8%*nbX2rq|N zyF~*JcVF}hfEz+#r*s_*lu9djI`lRrj@m(jP&RN+fYu)uH^!~(NM&2V(JvRzl~bL| zPe=|>1ViSGWkFn5jssva+tSJPOT}wP)}tPz5x}qL2UPdmdG!qsf*Q$}0U!9eV|Ulm zEU!U^Zz4?J>5Jlv&v;n%(rSxkx~pkTXo(wi-7E84Immyavp#z^1eEj&fBL-qTs+-s zbye{$T2n~U1y$F~wmY{xA-~y4lCzDPBQ4HITQuG{+TG9+t_i&t#Vkj&eVP{`VBY?z zM>{gh-;~n#hE)&UOt?z$BCfG>nHuCjdxGRhUa1@F)P}2+=Y?0MJQC#LRrrDIWY1cG#0Rs=+&blBOH;i84X*7kj11zX+oaQ?gjf{)r#NA#i8izNG%`Rn zB&^1OPjT1Osw}3miwS)!JZD_Xxv~^mbbBVuHZpJqlPDr=(M!xPU_mjKq5>Yf#c6+d ze1_k_k*BH~`e~@l^WmN1mzbfHG|=}E>@ZfOOBW>^MtfRrs)9Y4@4^2TM@VL61e%%t z31btO_$Mm#dOzrt%6qBGS;k%47WyY4Wda*Ou=YJ|!TA0hWoA~-cHox!>u~^G1?{iD zJ)*mgI{mOG)Z+}d7PA$%ZR`$qj`)n>pzXFTO1atUIm4v5p^$W1*jxz1bdz05!vx)N zYr_yjm%27Ztz>kr*r)`EQ;e;=5K`t_5@A(=FqBs%EH_qKUr$R_OC0?-;N1a0TxbUz z%FgB1{;_~hsLe`;qqr1x%j6sufPU0VqlZ zRiOPF5G({clRJg+f?~jYMe;?imTJ|#LCR7AO);XNbpM=4LD4G2ed$GnE3Zn6*e>KR zJ|I)%HhPdF;ak@1XsVN?&RC%NTo>{msH+dGD%#UoQ^?z7XYE}DRi$hq0S|(Yq%Iop zxpkpgTcW>G$(9UcfF>LPN%1KqjdNP)4goc`VB=p2)1cAddzwUWc~YiBih8yl_paer 
z<4{4g)|5>r9u}SbNjPG>tNMn#u#qFR{d%8q24f?8Yw@R9(eAvEuO%EPZ1Okh@`Zcr zm|P$S%EiIvporCLT#7wpYZ?~Sf2%N(piVFnD zO;0SGHMF$ktU6^^R!{js$;qnh2`qi(?@~LtV6Mp$Lr7}940kx&iokEE%jxUJxbl*W zWgoUE%M>Sxk|w354U1i+^%Srb_X*jD4VC804|t<&{VTQ{Tej!=VQmxg=r%qKcr#g| zQfK*x_StADgbF~pyDU2yNLW&znZH6Um`3E31GTrsu{~6Z%~;nU6+mq{Y?*3BYU|rF z4i3F#9P{HxJ5fk7dW&nHeQ0jJD3G>50eVVoU@GFCo8CTK9ci)DInGeFZI~5>rC;39 zw-EbHvuW21&fNS%y({VZ`9*I$@2+Z#TW%|8cUMK@n&lm{Jv zXfSQ1M5be7Q=-SyVcXU#1A5#x#G(9g?#yQU&s zpOo1NVG#bf@?s&{>sc6ov-aBy`v^>svrMzQI+_ zKez8d-#`?}7!>8S-V6w|pnpw1Ypupp+dSRAOm}Z-T$peLNfyR0L=b_qSiIQPo1XeAUBOHcJ2iOl@0ZPB-B%PzodTY~2o|6y)wFU3 zY1rr?D55qDVo~IRKqBq0bPqETI**+beFDOwULClLRD=(4BSR-*qd}BP!N;gRMT6;v z>dwkvrWXL|q*uNo=5j2YK)s?!PE?3rp3~8Ph}EDs`#GkH#rXT8lTRVVTM3v|Qyd zWf75|={~!DphWUsTU<}=ncZcN)R~>5Fm?Ts_skF`3Lwb`u2F9Wop*bf6b+YHn!(9g zjo#0sAGln`z20j4k#o6Dn0QFkrA8DKMXE^Hp;eVUXrH_#z1D^LJbn%6aEF*(t4I&b zDq1f%2tE6J7wI^U(q8bW>7ua(pts5ushvz~9~G3XK&nW=tGHV`c&fC6WLVjI!FFZ6 z!t?3J!PR2fQz*Nof%(i_zhG5MS0CBpYk+}KgY+{u7B|PZzG8<>&J>-X=xk9k)RbH8 z&yk1o~q*TOu3X%$%rQ(f_%C_;+ifW6^mc~;F7P^e-MWvwH5S-i>egy73 z4_+XO*A(y5GuW%;ex0qJtE12yZVr6?&6?jkz`qNU1{sYB_X&|o>svNbCV-vqqV9#j zD{r)5CtwJStqvsY2BibnhKTv7dp=7hi&n_Yb0bH{)RgJANDuq=X=km#6rp=L4%#qA zrsig`U8If_p(T_icu2(?W4qkB09%x16ZH_<1z3LCaps(Vq_PCrrVHL2d4;x}Io6h3 zS)MG>)A(9Ut9MTFjj7Q)@pWGi3q5o zDP;*O_XOSqe)!_J+T2yZlq(?~(chE3r%Y|Gv>H&P7x}KZ%P|OP&XSl#v)7KwaLX~qTMSL$5$$l5TV8AI+BW-hE-*r01nV_RYnl8-GM|hMKW9{EKOykBxNU!pA}^xJX?B%qcYG? z@;~guWvrpvIQ0vgRKh>((t;wA3; z#t`dA0}p@xW_YR#@haV+y7a~ZFQ+OQwcq+EhGg3$V2UJHS`iJ|M)S5CDKz;qvMQMP?emCFvjbL?rc}tj$cedMkp=rj^YU6NxG%IvZ^qu_;6h8eW zC~@Lwwn&vp2K;IQmnf(M%XCs+0te#2(96zm-e|Udx5<rIc%`!MIE{l^eK3|4di{c@_ID0rV8AKvPU4*&ca`uI=kIf@)d< z>;thwPL=cl*U`asXFztML!_!wAq$B6fmv%FxfFSXsQq)#s-T|KB4&5YaAP9%h*Ol0kFp%t?Z(+TKa=)R-2;c-UM!;woT2p-Lc@rG?$`hE}d0jcVIBEo5 zgdHYw-K-ue*$GgFk{{GBe)*#z2)@BPk(~tb(<_;65fXln`X8LUKEobHlk1&~2i_V!TtO&ePc??fY_3XP69HDmOmEfW2#I{q20`zRxCXXG9@q*tP!?z07+^MnSR zAGJCEydYRxc_7~*24C+Y&DrnRlIv$@Bd*BQ>9maqwR|@oWQ>vG-2>-|IS?g_D9PtA z#(4NG&ue;Wb19K^j`{;T~^ zHIPfPwIfnZN(&FMpX;4#2QK04%<2G0>d+Bo-JV!tI0QVR-?8mvfJn@hT@Z}FIaS3^ zq$EF7WAHvnMInG)jMqrCz@S+WbU4TI;UIDG;qm$s>Ot~a=R2*p21Y*AE)~D%xeFGA z8xO)V%z30an-0R`W2gbZgSo(jfhn&SJx_@dC#u}$@&wd{O_e%^rH)FDTJOO$ z0h*Jk4fqs2DJMnO*3{;GSisgYSi%mr&J)*3lD{;iv#^ryXEFyQ0On=STm|IC?R9lN(xEe8t>#!#X$Np{7E%A_P`2l>*)-6qLs4ACg`M zWIhg;2bo0u@ev-9xY-6pf<#hNkjma_(V=HCH=)=Af0ky=x6ASn!4ZMNl#>=PY@ee!6SzdfVgC*E?U$^Yv?!8$WMN zzB3+rYNaZF-1S7p-rDm}tsv0tcmhnLAUROemyS`V^uN0>yq9tMtiUon%y?zN4&)!1 z=I8nsW~LIc#FXJh>wI1EOuydTKA~_OP9DrW;u3WmT(I1pcNWOdDqNd(Ob^Y zZ9~YL@%`e!Z$7zb)ZR7XB_oIT*2v@+E)XXT;vatiMMt-&J;hR^o_;jQPw4NUn-Lw$ z9%)|&13#pphgK>g8tFxA=9ImFPz%;DNRIeqV#hYLFt3@VWOaqJhpeS7%5{=R%g}y6pIIC*1)}Ch7E_8f{g@1ua43%JdrZf)>s6SVlfsY9a7B~rd zBt)(zCVnm6JJP7wHVeK-2KSr*s{za4vGcD~fC&lCGwDln1y==i` zr5pcj|CvcU#`tA{^IgwP^UZ;gX>$FXk5u@RdZ#>^U%c)KN7X6osSeo(U*x+7q|bX# zO<~~vy>{5nHUyI;Td`f!b1u(*JSq_Oy=d#D2-U^&;h0Z z;HXcYIj^OrwU7M)Ju7wCT!LY}FeOkLn#D@JIw6=)9dNhYIthd7mht;l-@8Z!G?px@ z;#8<7Ywb5c;h{ZAs^YwNONB@kVSS2z7RqhAe~-umzau9S5A>qOVR!yO9EeK+tAz!O zSf%v9piD67L|KDWkL)p+ynIkMI1ZVe8E9;}8jvaQ?ip1;RY zW$+3Zs0Kj50y?0^eYoa`KS>p}3q11!2|C7R@Mxw2C|3@#fr7Ui>924{Ae}&1ls}a* z$>#id3E6&?S+GLW0h_uG7fsU6SH2B!w5hC@T^se$yNsb&s9pMbE%VzCZ6jT_+dA9r zVBW@eP0cJ-e?2Y-WZk#Z^3XhT04)iQZrbA@=YJ4v{el3&V3GG!oYVFB++Zq>%bzE?2r?ZLsDog zG)o!^s2ZeeB&4s1|NI<lB>dvq(*pVHEHh?j=WE|{oQ*p;!vmo-`l&7YA^pV$qjqmY zOX&Xyy>r#}=@;i#;OJxI{8&X2>_kSsldB2+NM2GC@ikfuq=^8O5fo&Pi9Yu}?Yr>K zc(NEaNOOONI0J<5NE0=ee`iB?xw{|AtmP@hm5ez-H7U*kR1m2F%;XO;|7IyHQLsV8 zf|Maj%0_h}`CGK1%9*khL$5;b7gb``n6^_26oJj>cw*AdV?^2vCI{^08xwa@l9JUD 
zEe~mra#YToQW$bNX0qNVq*H^lkcfTXQi*@F2=M?{&x#MlvRIa;Inbt%JOW41FDI9kNZ@4co)~*e ze5x**bODvpobE{~#qnx#`^0I~Vb`UnrA}&pd5QPos>~HLY{L*_x}xD0Tks=8cr2}D zkj>(w?-@hyU9~NJp?y_XO6hL@gkUL6Ub%vK-Tie}aYRTo^Z`U6Il7R~?VyG50^%hx zbK`_Nl4SA&k5fzve*v4C%3x{BVr;V6swAZvtSA#n>!r)9n4&W}P_rmOR+UhYr)v@U zGnG5mYQos&iik}R(F$0{q zpwr4?bAwLq80G>=dEJLe0&U%Z|1K<|&dR$TS88p5U4yT&&o#hfIDMWNKl*6knLSfWgr&=ImS;vF9R3F~kRe z;ro4^P7sX4)J|;(kLq;juW0ZoJ+QwQzaS$FgbbT)`+xwwRs=5bst7`QDM@)hg}cpSJjIg zp*^e2x}A+(tu+?&!Bpw85fGE1Tgq(d1hAW~4PPS`9ItYJb-U#M#jOr5Qm zi{%XYv1HqluJ+xJV{cf|p1IjMD(87)Iqw%)#ozMJH~yLJvtV9`O65vRxZALU*m(i! z&zZ}0VkFvN5Dp#hO=0G;LtaQg?srUVRFMU$zPWNN6|o~=sJT($$T9WQUhf8}g11#@ z$AoZ$jN$?R29nY;Bs}2B7od4aQHgI{Qokrl_1sI_=8Djt$+N00jJyOlg%uHL5}$_Z zlu$EOq@Rj}F(%pMY0H2PltZaU2}#Px0?u%b)KD?{FpjkCV$>p$eN18oq$V{aDgp6d zW_5TFUO3ah?y8~26Cu6Db%&-#hLXisT5n|!8vEYzE%Eeuk6AG89`f`5sS6_*qL+?lb?P&4s_|Pd^Na84# zph4%TF8X0~!%R58z9bQ;^O96twmy~i|KI}RG#qxzthvUA-=NhUM_F21MC(PXdT?8dT=&yKMl<`6$-G&Yq`nz%m=tf(p{5zy; zR18%O+Z5QFC_*XPWB)f!V0TY7rz)_C5-WyYf{dZ4Xu|7|0VJ&zFcL)+`7_>LZoNu{ z1@=~S()n{%_Fa=Q^C6Zb&(Xm>^FA!f{d0_9XyQ*==Sw+|Itf)O_Ylu%6TSY!`d$VG zhgAchPLI;3Q9Q0Vy$Ao@>9Gl*M4z3eMw62d!fwj33 z`}BDSZv2=sMvhhnjlJ6~B701jH2ysk+?FC7SYL^2Zj*fAFYJMKfo%m11dc$lwcYQV zw8c`C5Ib{ld zcz4Ar!xVV?r>%mW{gLvF??rX%I~a-n{@4Hd^Oe7SW+?kyG&7-c>h7!mXUV$qg=x1?)xRv|NF$B%wC#2-t>*%4gARu{`@C~KbijWiodSd{PJH9>}iQ>{@d{19SdI{ z939QgZeM;T^W5Kvv!Rl*v3QQCU3iS%Nm^bul(Mepw6z8J=L_|r@KxP+WcDd=U%9Y_ z#G1-XR5dQ>Z~3fia zhWudqy42n&`;%v8?hIzqoyvyS-de!61zqE+TX98S%e^D^5sxvOMOFNHK^u^u-afTV zpNT|24v9#?v*VJwraosKJ>?1iS{EHz?Fo%UNAsdBHrAF66$efPs_(EHsZjNtjQ7rOqBFq^RoUYRVHV!Wy6IE#)I)TFa|e7w z8JKtU^|P3nMTtBw64W{2=*#-ezsOErn>uOhU;vlKr+J{I9S5sZ?x!jgkp`ij@*UZz zTsjG?@*fB7DF7EI9Luv}h5!;S1to@4L#qW$wJ4ieVW(y zh*X+`=VWOYz43-p>Wc|8z_j?BjDYpQ^Av82GOh&hl5I#t6*vdd(i0UEn79!D)OI0; z!1|@x)vcxP$GuR5Tgj^(D>o){aQg1pIkLd)L_jb5U{0=KnP2i+h95X5K0mwXK;%V_ zHFzuf!B8~IsrjQM*VlkiKl(6=Kxeo=dh5!Kn9KMB7xb*md|ik~OIpC7JE)DJX|@+7W$;)8 zJtfj2nSsxPuV`VX*dcH$2?}7acn)rD_ZwWS_YstGXsfKs6mM)rcGPxB^*APW6S*1F zBeGI0SsLtJbd>q{;t*~DU;j{<*s6eaP*tI;)2{QX5arozVQpx|7DwU=d}-^qU|&1Xcl6Oyzf=4?6>YqEg{Ji9avlH zo5)LJc>WI#Y*o=)B<-=z0@k>!(re|^7|EKdJ&FNbpMODYi^rI=!1D6bq zqGWBRE0pCM2=Dk=3i4WrBH6uC&p3+-Wg88$+}HNK_Rx7&5LsN=QzHW-UDn`1Y`X5= zS2c%LG?A&Iuc1KhXa_Ik6z+zi`Nz{{q$aU0rsXqme3@O+nDAIoU%jBQ`);4v#+q**#Ny_>(e9)BivtC~ zYsJF|;983#&*jbA2wxT!Ry#Am$@j@LTo|XQo?tZ5O18n|K5xw@U6hN#1AUHr8tvxH z{F|3Q=K{=0Zpxxxu&~n=2a{ufwUj&^VwuNi_-$2{C(i>U#q5Gu7F{`=KnPB|B zD8S_IT4G`8E5Cd>8pKbD%V$ zm?PHW#RUclw~4n$(8@Te=Rw`5GZf)g6}HmEQwP%$^>)%joSw}tbC(t;WemoTQbMel#giaG9FJX`#i;t{oub&_$U95_6&`DfT17TIVnQ{$US0(3id*p#}SuDG{6v>wyC>ihIormprZq{ zyuT*de~^gNF%T)x+B_PzIjJW zkF2q)8=FHX4C0fAfR0Aje{u6}3HaF#@GNOb9V}W<69%R+vH=7NV{abaDQE;EIEtE=VEU28H9#ag~^yzG|eeGyXN4P_szwlw~(Fwag@B47an$w zyk{s*?!Imcz6jq-OM@)Rt{A4_rBn>Hg$Xiuu&$eW2#Y>~-^zIO@$<<96?0f0RrI$#66WaF`313+kV1swLBYGRstWmqvP40%Bpv9Pb{)f0WeJS( z50&Mm?sw$ArAFZj+s+#C;Ec<32sdXfMUB`rt9z8SJl=kumjxLjRS@Qcy|vyZ<_?si zR`No>bzM>4n&ZjnWtfO>h0yv&oHpfp%G^nCN@%dt1_Lj?!lI=6(T7fG#GkQfQv2)o zCYL2t6--WSEc|FS0t`<$ED_T=TpfQcuX;2>MVg`&fT< zzUV+&MX#=hc`@VX_Eox9 zEsckuk+is_Cd)sueXYV|GAuM|%M(N*gEl&_4`j=d=rLr#0uX{~Z*3N7LQv-l)0}uA z!2lq|jAbb*Me_!m7qXB%?hc85g0$dY4F)1ob2>v#k_arwI0U#hEmaZ4UFl#mpA)%- zzoFPO8Kf4b6w2pe##E*~I8@OdiG%~$pGoa_i4~Oo0kf^ec%yC2pr}ptt1=-hwg+n3 z?GgzY#&fO2Rx0o!#T;8yr9156SjjmI*O75Z1bAj-vJPL8mzwCW#7C(YQP(m9X(V4S zGVeqp4paijWtG9g@L`BJ*0+}FdW9?fLbz!?C zNJueCUoNx3g?tBvi}K1~t%de#sGdTZ#dM62ziXKhz7J1B+ko+xL&19@znsKFT-CiY z0j^p740}iE`Ld45;x-gFIg!2Z>2|8raw=2ko&s@fU|Qls;?Y>I3}rT?_GD9$R4dbT zxMw)+O~CDcQSa7N_@fpV6qNCgIW8CVPI-Cvy!~gx{x&B&!o#*mv$^=ASpE2+C;Z;k 
z=qOUP->ORT6DN$Qr=d}3`wv)}3I`c^lvQ&hML?N52*kk0Ur@Q|+vuPV(U zf}XJ0)ymX5p*!V31!a01YZ549Ox0O|x}kbX03h6XUidprlsWt8@hq->3@ZQ1VLca` z%h3wXMEuhh%NR-7FYrPH?OB0R#YdAv0Z1VKiTYFO^g<}I%mrbu0*Wt+c5r1-&=~~y zY`}Ybd+MvK!IGvBkSlnFI3}Y5vH79Bd%sMiNPGv0wj4Fsc^lkHu|?jYtrd^`)uqHMgr`HY^4Pe9+ux8bk};@yXz zaBw%51nq6=ANOhNd1jnmdPRz0c3+&;w4&Hqu@U!?vIUbG^=tAst*f&IQK5mg45Qp# z?Tnal-_lMQ!`HKl6CkEtbw-V&c_AHO%>WlGJNjny-)x@^g+`-4TzXdfx#^+I=MK_; z#wGSXEB8m-T_#XE) z?xFBL5JW?XO^&}%EuxS`fKb>fthDKS6l?4;!YVOF%JN`MIbktSr`Ap>HyLmT`8Qa< zg8TuRs=`2p+l&=&?x2EVWW^c0ZPA<2M2#vL8W@>mBzr zafk#87f&a}85WOpT!~_4LgnH|BiwMyvqlQYBjGD;;Z)kT6qKMRxGdGXse0BjDrZ7U z;iYHc2fhw^DmQA6T(LGdXt%m5EBk6&6S@hdlOL!vQ1|0&2(1rc0K|@`?7Qb=PX)57 zZUw2+mb}7EO=v7%++H^2`WnZ^N0K~|ciH%O1=|wNus=E+^)=gqQKs(QWVT=sAXLki z%LkU$6S)iTv$CvP*J-!zzZ9AyLvUoQL z9ax_kH!6Sk^scr^+Vr5mf?sY0RT8`WCDPm_#hfkUNY>H zJ1|ZV-8BbAs68F`&{}v-h@q^lR<<*V+sPh?qO_5N>flj>yA21bS|R6C~B3`sasx#7DiW>ezGK}~v~ECnXj zA06ExpR5^f#F!l8Sbe7d3UNy|td_29pS^WUkMq2(&GzYlxi}mh8I6YGRZgpdLzrS5 z#mnhT)EoUIm>2p*OXQUl2xg%DN-1?9^N9+-%*u++BA8$kkCgdx9_bJ(mM$}6OtN6xC5F-20{kT}`WeMc zyAnlnX(%5TP|G8m5dUiOd~AT18!IFk9f%}asj-2KmB2HmP7{)hJbZjD#o)Ms^A*lI zH&B*EhzbT&gHn`n0phYh6rj=c33Y})suhb`Wvn1$-Ou-A@n=-3z+ckyiJ2x9AOXh4 zfSOl{y-Y&%-E>(nP5hbDi@Tx*+sk<4U}+~5`o3Crr#%L=QYp3u?kh`<%cMv0w}doK znL)*iGAAUC;(1l-4?s)3rrO*-#S=z6Ch%$03ThJUdCB zzidtSvdAN&5?l?Dh&l49syNlXylB(TL_cYbUX+^U)@#&l%%dLM5 zc1B-Yc;Da59ZnH)LeaL!7B(f^I;tJjR>|WQ!xY|#x!Gb=L&44R3|_MASq#bBI#jWb zga)k(vaB{4cFEZtVj+A+XLXkFIV?~?$2V>~wjT?t8cGcl+Nb3z;2%U3VMd%S)V94S9 zATQnZ{zdV=9}3KZa{LM6EGjWTWBw)NxT!L9g1xO;3^K3$--C`oqtG~bxkL%v$Y9KI z98%a^*?V8oXb@5(TtLdG&DwK;?-EJ!uno3uD_$d^8!K+?#h{&#S$JVi%7YWmQU`8I zpl@T+(=WAfT{6s&<&gZxO6-E|E#Gk)mDD~mi1m!oryGvH86Ww5OW<6icc#5#6MRND z{lrlr!1vKa%3PS$#C=;ktFI;VRz6XkHbZt-1`B78&LlE@BUC_@Mab zNjriSX~TJJhZt_|DPuq3mgBrS5ADZmu8FNMvX(+w(q~B>avwXfAl+*l1?dn1P7-le zf+`X19qyGrqj#lLm?zua8QS;R5-B^k4)EuPWMP)9%Urc_L40rR5M2esd)V;qeH@=y zb*EOxL~X*DI+i8}BR;Wmt&wb>jI|}yKrHbC%w#1?dSY zE8EjzCv}KA2^r&(?gt>(JK|WDHZS{^Z||tsg(wcoXZn+04LNULY;lEte=3L{{;)Z6 zd#H2dEkkxgVQ`B7xc`QAA5Qp!)1V3E4c2}E=nuS-A&Fn6i3^nYm8Q^BUA`ncwHj4; zV6!0d5)3W4$TIIDR4+$~ASCrH-wEqH%=F-a^{}g0t{^?=Q93P+84rcJFdP-lhQVSC z+2(0~MYpKzzIp9gbLKwUbi#@TV7pYL-XGH9^P<8gLD(&emO-%lbbMseo-CPzb|V)| z(iBfk0`>x(_7^%-HOC<92)3puHfGgrWB?6!K;PD0;Feu|0th;N~8PD}Te$pZ_r z#RC$f66RY5x(AW?UG+XqZKCf@sXeNMq+oYB@CczgfRDIwt{{3b7m!)LWHrNB1uH?a zp7Ira+SsN+@$iOTMQ--HV8+TzxKg;Db+9@o2&a@oQI!3G5BLe>*jB}-3(wca zB)eZ0wa=7RCURj^VOSo5fO z9h_k9)2#P_FZ^!`aCC{2mgT_Mh95|I8iPdHvumpV@VMC-{h2TPuhISmwiVT;-Inr* zn4NqdON`6$sG-8NbVFtnJQV+7!(K{V)Nq2eC2}YDbGs*9iwfilm@PI2fhn)I%*)4- zNFJQ>SU%jhn!NMuhd2_prY7eh1WvE)$(~!Xwn4kirg*WbxjV?<_Cnj(a#EQpuuZTt zbH4+|CK<_B0df@G*x7LB)vuSaxmZS401v%Hf58iRGYUDA@-Of*I1?FQQe=3WaZ_Xv zrM4*?04kN6GvGg6M&peCi)Ear9F;wh8P6`_sbR;aGO;t+%jqjAMzK|vu{j-1J&1B) zT~eFOto;tg?o}!qlu_LTndVEVJku*xX!u>vk!Vy^RIG%(v|;rpgpWDnLc^PqkEkBS zn>=gYKAr^O!z4vXP!MZ1cq@}_Shk)PMTsTl6u&O4)-9nJb6=bPaZk@xEjfZU#ZsEj zF?c8y+anyeNN?eSBN}uCx>Q~|3tn?f>Y%#=elN&9k8YMixR=LSt@V_MM89OduBQ~p zam&#Uk&+W4CbJrtaKiS~PF*baqmg%6mOv!Mm8MHb`Mtp$gxq)ue8oO*d>g@W5O#}}Y%)a;Y# z%S20*NL4|qmC82Mav&Hc2Xbv3Tuv-|_c4r<@YEDV?jOvDySwI}7W~q`aSp9ajCU8Y zqu77LsuWE@e$VrB{8N&3f?ZepH%OuuE<=E*GU7$oLw5pr=2s&`X?s_RQK9fu*jqU9-! z!vSO6Dn@d>InYfG@k02Qwk>Do!X0^`RviY;#}g2u-?&jXw{=p$DkICOlCO;qWj)BzwXbeOi?aVnca=rIDL~ z>l`qUA?#I|+)I?UB#es@E+@CT+Mr2RFw)s_^QK&;Ym+j`>O2Gy?tbK9Bv9cAej3$! 
z;66}a{h_%JdvB#E^kdBeu9a1FT>ZO*%tp`8m%0K1*QKPY|5YimvEGHaks9{J z?xfiU85fxhord^=FOhMuX~DSf7Qg@vT4K8?CYC``9A1KpE<3Lp9*${ApNKu?uCW8= zEB2t&sG~0QheR>=4un1;XltTw%*XZ&bcV0x+efyTLwN})-hI7SDYEA9W(cX9lYU4n z+Nc1KyH-;OVWu%&`E;<#3Z9uK<#AY7mTW2OK}129hCPcqG>-#|dp2`oOM<>4V`&~^ zKgbJQ?Qr~Ndrz8b@;^v;;A<{G6)gjkiFuw`{wo}pB@NMsn>Z%fDbzIMd3f}DlE6%g zIxDKiUv)@u1qQ2ZxD9%YtH|HbNWf`EiPI<}kO@RTK#-9T1f z3)UEJC8?1(!s!`>K;?2}4Ew3M zGbx4VJGaub6q@d;P>R1!ihJl_q{@u%D?M;HwYlofsOp!*LvcX(+~24Tlk<;q!gBYj z&eG$GQK}odq^Q;!DJ8j+VZwn-vBlLSjDMpT1!;pf^|j&g-R&fkMj+*@PAJ-*MPKRcvsvdvG4ZCehOsKjQIr0Q{AQHe%_rm0z|F_1Q>&yV({*X9BG7$1a8&!D%VgO(A9Of|?-jzS8~8Jc+W_>@ch2r6ybQr>bR zX`TX(+Qh`N_X35jyXTgF9$Q)>BMC4Nt|9H6DVcK){XC>K*?E*&rgKyfkkhde7Z^6f z(>^mknzfFW1J{mupB$dbLkukq{a3Zp3!xc^;XnN5Ze&g56NT^6#6Jc+Pj#iq@781z zMG74!XPo`$|G5`C0Mt9Qr2H`B0FZhf_mtW^ZJ$Nn5w^x6m$L~zzQ_3}kpA~jAkhcG zh^eg^nCzhNUyU0o&!Z-RQ%DAoo3F=Bf0vO-a~@Xh4dpFe^9Ske5a%yZ>p1vE78-^* zhNY8oyOZ{tVsOEYX{pMZS$zDFZHMvWxFb9bR`JIDgVe0TXs*!5`Q||Hwd;M6*Si5x z3vDT&n@mCJAc}g|4ACC3N(|X)h5Sp!Ic0;q?o!Ost#`se+*>~&^oy%f=~51I){n1S zwKicdwGCVcu^6ltum$?36c6)pWhN56imb;(b)v?WPjP}7!~XNXynpre~J z$NlrLWJm$h-f+}^Wm01#rxTYJ>`vhVP%Z^u;|(LN!NNaen+0q55Bb9)eUscDnLer& z)EfRV+wO=@%ve10qX{a9J~dB1ioWCT*l$9K`C7Qz-Pa`SZ(fGS$1a;T*#~I&uXVZ~ z&tEomZnSWV7jyI=Kf0pbZ=LJq697r_afwCc9UDn!NQ$Yg9UM z}feG62;Sx(d0h&TH$;V023zFS)B#NI6nvu9s3EbjG{4gbfB7xs)}HAN+TIn%J+ zF+BnAYNlMeTgyJcUK`-NNn4@p6vh*PnD`EG5mOF|Vce#4d~0L$n=B;Hh)8!mJ}dyb zFzVYJy->SIQ*iRTyewlgXBW?Lt1LdFf`W-N;|5LZGkex_Yj3WH@UjMfW#DXPTAGD9 zUwa)$XD4d>8uj9nsW8lYD-giTTy9iLl&~?N0Ln&VOXs*Eg9&nDBmq#jn0qpX7iSX_ zbL0DUZY?b9%$Y11?T%hlVw)hHI3(x z?7Oi=E{<~w<5h;q+0F|{)YRB|(-V4V0URf)SGS;QQ+L;y>JAZA--G)=J%CaU)FN@! zkYHb)s-+x@oInj|1;)a_5fz>nC2>}sv**dN@byb=hj)DS*8>ATwXM7Q?P>p8{+dav zzVp}Lox0&Kzgv>=mDVe_q1w-}F6wy6M+j|4R4jV<%E>=T3S5`+xt_ z>~s6S+P3A^ui7^4F$c$fzeh9tvk%`MJaO{MoG)%C=T^XjwceQ9mN`bc`n^M%SME*I zC|4kWXv0IA(B&}7)vJ2j*sCyAg9D*Y!Xug;2m6;quVd2%tm1dl?tclE>;&si2t_=G z+T~P|{t@RzVW_F~CKMzPVxfoSSHsr@Ndut0|DGKE<9l*d-V?2HkvRR8`;+njK)U+g zBGH?t(%O+l%s|_N%42yEf!(d>2c(tYUS(XOoQ!f@Y$*j+(*`X5H)ql)ppN&kx)tah zH8N?-QohfI%)uy+H|s{6qse&{z?~PBK!P6x*L#k*IOvV_ye2bZqBjri3kVnreT&y* zhyyEki1xxLh%U;aZ}D2F64eg*C5GGFd0Rq%Bek*&QlC88)oK(;N%}J5rr@quoNd?6 z`ty~?myo>iY<27;5as)CNrJLa&kq6BgRed*f+I>`1?MK)l)9OatSgKu`Q0!dmxAb1 z^=`CucX8p3Cw0*296gNr@ASi`taU!mR$Eh=XY0{t>UPFW?7g;=fT4Tl(5G!>~%vw-#vLfFyp~Ji?)^8GRJn2EF;NYiu~|XbZpWT3d2Zd7kkXhXtrj1)vz$ zwfyB2gN+@HMKFt_JgfQ`G_DCy`@GrZZud8kB|hu$p)6ZdQ@Zn_=SFN{Iu9c=zV8Du zo$%AEtn@Ue$*mp!m-j0pk$wncr5OR9yaqBHmbfYLLC2^!ir>ok6tuBt59Y&= zMto>4rE>;*kop<8S55O19?+mnQ@Q84Dfs_jR2z7abz=GW|BIKe)}-6q{mvUMW}}_t3V{;qaJ! 
zBr-g~9{lRlywElKJ3T!O!Z!ii!cXDqIFKAPQ(MiR_ zO?L{Q)|hgCb?Igr*v#y$?hH?g^4R5NWaa=0g2SUu&uA=iuS;L1YvS2)owDyIRPYYd zLdek|Evf_+dGvyd!u%em`p`t2l3z-p3sj+Qs{44lEG*SDoge>JZW%%1|EQJB+{CA) zziSkzUa*XwR2jQalt~1LEc#~KON>(4uH+gsG~eD@=8hb=cQyD?>DYJ7@9Q0plI4htV}bzAl%?w|+^z+NSbcJABG;Yg1bn@OXpb0M zV{6N`K>Hjgvs#*TNPX~9_?us{z!{SFJJGYQ68Tt7+Vl_I{m#~&Etg7n-aQn$q8V8b z{xXfz*fC%Z*+6Uf6K6OkhS>=-pAV?+s{QCjBLM&n6x|Qx1l4u-%^J7){It-AUDP6wxx~Sk`4N z%cEVJ+*nGvybBss&nAI_`_=6Quu`R}aI}S#wG~>dj<~}~bD*nWzY@Sx;(0YgwRuRD zVwY_N>MSK6bu>?$%C-P@;;gzzk@2Z8ZKn*=RfNi-wwFLO(JXVG8uhr#7Ec|H_|9T% zVhHpxB(g-hRLK!zMoOWmRZhw=yoeS}8mFJC?@jP2yvB}F)1ex;4tz-G3xQ#KS-Z}B zt4V1rt9R9%T?JTEQ~a8?b&6Bl4um~auB|W4t@I!z=7-&EiQ$!rbWAFlFS##(mdvRG zaqbeMV|ur?Wy*(970alH`AeB&R(9oXh2Kxx+JhOu{z*#=E3B1`o16DP>y8eN*~6~U zs_?I`cC>}zl35 z=;<`IR*pcrX~s0($rGj;s#z=fL;a2K6dI3b9HiQUEO%O_Rmc2sOiZEBg4NNwvByjMQc&tFrp|B`{p!0 zIkA7I#oa@2D<>0317#m97HNTU>(T<>GCHnuEp|1?+zOIL`jHNH3Bwpw{d`ebn@KCU zFfZ^e%pvH^P!mq4e&r18xx_T=r4sT6MQONph)_dt)3kPN!eC8t2>=iwsd8tD$8m=W z@Q6_E>^Quhz@JtAWm2O{(o&R%aCURF=c#taF*_J=2 zvg^lDvlm@vRBLqPC|NzJ{3%ok8@#iad_+M@XZ7Q9ZB0c=fxB5dO(}RBVS3!?OKoKe z%&*q)Z#(kLzqwx$Ng*_99?7u}e-XX#MpTh44QK3D;&UT7*k17Va32|;^(WJ8;Z9Mw zO7can$K?+7MYFNBNqB{J254voW`!Yg@hNYGT-ZzfZBY1z$tgW1&0w$QS-3n#~w}`guHA2DyWp zYrL~|RUZ`L05vvS>9V13f^pM$-PeoDDX>GN(n{w@kw_~F4t{`yEE(Or89b<>z_mAc9g0P6K#J(Q~EQ!d&Skb%mujrq{@LT zZRo$qrgHK_A-e+(@F(Y>nZ|lut!}7kFUkw#ZBF9?k^e%jQ$L*b%pQg|Tx|X=xB|}~ zBywR@Gw&INEp>7FGGBaODWp&;%c-C2@1ATb+P;BAQTA6hYu~hE)u3k!q z+K!pREx5XtaN==_kHv1E!G(6;}%-6ZZz~%(=;q8*DiL z0Sq8I@-%;P|7n8l(q2yQzmV=df(661RV4?L8ISFobC?jR@r8@@JV5`pET>TUzEDEF zG4@SE9(*^|HjVyMN)OM+iyJzixq=V**EcJVMXQtTdboAesD&hzSMy4y*UifdSY;x@ zH(Zq0-&o|G&$(4p!3@>gj5nk18?9>GAlGf+&GB;PO$6|~C>=C~>iNf0H@J7u2SGH! ze2V=&*mFyoAO5xp>F};tO54fqzKhiPZ&G1W?-~xq@+GhHp?dF{#zVXfF+P=4ofw$v zTtd;K9-}r(@C)7~(0ghr;3n!v4(Allk<#h!Dq874HdyX7DVy#0eu>$DZJz&Urn8ug zQRpJiT{RV`k8?Y%o`}?zG`??_8+7!WWn)O&1urhIcesu;u?q3|mC_~(+x|#8P$YB& zb)-PZb66>+>hEvew)`;_WOTxV73K63E$<`Yq8B{5XRWAd)c!` z4|hO&O-&(nLima)g8fxS_4C%{fh9$|Ur6`2ux8i2ud18E$2>vYUxsjnUXwWu9R6A2 zoAJq}U}ZZNTUa?Mf`kJY?M{K6&NF!-a}%@xL4Ra7G?r45nwGL^JX6#@(o9TpaN@iG z=+Cpm-bm&%KtneGf+z;Lam9BGl(rqJ*eU5D*mJUgiGhuC)Efsg8kv)d@bLtjRZMlo zI!C(Qk5!4*xP}g2Xio}=Kq(lY|66QnSac=qQvwZ`$v!Qk1<9S{bW!g<-kzes)5;m# zKJ2$QvGG3XTrvsw=W3xqHl!&F^9$xu?8>o}J(U;Bl$bx%cQLjv)(8}oEC*8#RH&|2 zMAYshb);P4yW{|aCU&5?Dr56*Pyis%f`Yk3Ilg8}G&C;Wc<`eRUC3b%y&rA9_f-iF zR)9K-&EZyaFwz`uE0Z1~CwwaWxn`s-*zw}tYOD7M@ZvfDOUm(c%`>O6h(Ev~hf!fv z3Mot=lpH)jFuk=RA18|RL0Kt*UYZH?TcStdKflc}xH$i)0ox0UXsG}&#e*5JkE*IJ z(l361w%vcaf|353L*8Q2Jb2h}m{`{sV@rF+KK~Pcx_)tb+&gGP(f~o2XqRqgW2l~t z{3To|o?QjBbE*VV(0#Ghfn3K`I?lxsk#W*Qw)cJ_THWSL(WH{ld`s&{AwZZl{@YDr*@2EQ_IJf%D>-k4dUZ48aNxaOC9{ zVIUqA{*g%37q*cdz^ls4I)(zY=hkY5Ge-sfbFbQZ$-kK_%7n!9!|GjUS-5zkSsGth#vx|46+mB^;OkfJ8f@(C>OC!RQ@MKCD?A&xt@tBzLS&% z)LfDs!F{w40#-i;x~YT};W%NsHj(}VV{G)Y2PfjvDdy3LXo@rWX*y4I-9-4voU|v^ z|C4SC5gsoy%xb@do9oIqx^~_YL;62A`7GY-pc~ z=*wb*MyRixdH|P#E^Q4o-RZb_k+35U783s?T>Ty za-hB-eq@|rJY{^ABwGVfKUcl{#W1g1(qhoXL~!mxyE4z8T*!L-WySYnD>4(@--Kr zM)-Dx1&E^eZf-7~9q+pv-VIQy`N#Jn9{UHwBR}}>lXec&HeYtce%D~U{L7(tZx0T)cyiJ)@) z9@uB`FOULC?SeJfz$SQLAV~_EaOfGpfgfk|l?F5IXY}^Kc7$bLU5Lr2WX!GW08*;a zW~))lxd_70l6jDoO+>fUJt-zOu#M1K8o;Ce6|6o7f8AN8kRuq!~)JUFl*X?^b43B zcsWp5=RRPn#h#Y&Y1F(}^xZo%6Y_@yaT&soV?BLZqT*Clt!JW?Hz=~A<1n^{JH!i5 zoWS8z+D(^A9Y^kh1t^@tx5*R4t93I9a~8LkP7Kp5cUdVfRgJ;8EEDImg!{707~!PgPK2^Ac)SZ-Cz^*&nv4MS zKmx8ZsQHs|X=#f>z(Bk%3A_51vz!43V> zcgN?XoWLTv3Z6Aj9rG(|WerY|r@AINvTYAU1OV3Y)$_dUUm@Bs#Kx6i*p5kPEL10V zxH`iP^K{V7r_kZlF`9a0Cezf^ou6JjeCLpD?CzuC-$Ng{sW*SJ!2D34-rk@uyBo0I 
zd(=GEKM+bU?8ac%wz1h5QTUv=Kx349)SJMuwnvzfa&1#hFtZ3eGi2Z#+!jN!l_VTK zu8I>iPh5UL$l0RPD-6ub>W+Fl>KMKaF@wRU3d90rkSYmo0EhrJ1vXh7@Vx>t?tK7? z`oPk0Z3jjZUg-~VVE>6H&AD8^%H3%&V?eKlIacrjpc7xB7r~$8$ThZ$%1fk1;xadc zt3$+iUat014~{s0wqkNB<2c8i&I`7fsz?=7-(RuVHd<53t_DEoG)xSdAYeDWfzgK3#BZ2AIF5IwcPkhUzOm)F{Qz9SsP7pdCLjZ_#g$Ajncv*RyPpg!~HcE#0Ht``3 zG!%QDhj~}X+yz#`;~JW8!S3A{ZZ{?fcQ8F()VNM~Z`A9pn+;XHxu^u)J% zda@a060sB$orNQaBX(~7^21nMHOW4_$3Fa2-G>@`_=+Z^3+9DClbu)=(oyGPQ-+CT zYf_=5zbfW(@TtPG9|=Vdr?^Ptj6w+S2n*&2^@V<*e}nB2Zf;M#)ME6NQP|aF^u6V- z-!SBF=b8lQ7FO4KX4PsDLw(r@WoK`_4>7G0d)XwaV@aX|@;PA2Q*8aUs^gfvutbg{~67VSfJ%sc*aBdmJAK6 z)MP;Q>65%g0i#B?tN%s~Csny?!q?>eRZ~F3XN1kjbH6DpJIsZ+tHf1PAPB_3VuYlg z140gyugFt@xUO<>2er8)+e2~oGcM1t@)81EF=O}FpkG-tWY|38ISZuSl7}nHg=zxw zXk0}Dm+9tefcHWX__pDGO5I#NAU;e-?)K-gS$fy;XvK`VI*FU}uYh;-fWx13db#Db zs9s;l8>S~64!7ZtMPlns9e#rLyY-s46-S#Z{JDA59@d50LWxq|gzKqp#u7ufTx*+? z5J!RC3n6={be|l-bZ*M&S$H^e1B$Gsra1XM)LqCxaHkULYNqN@k7KvEHz2g7yWmla zoeTcHoRPe=)FNA9Py@2e8^qTEM-qrXzU|elE}m{}ebb*q>l;Z3xP%x-aw2yd-Z7HA zF2l48(Nog$xG=mEM~Zz=0XTz)sX#+1jYXLW=GC%DT4gocUanDTb-}qH2PUeP%PR2~ zL8d6SjSfnmwIxRjs|9(DUeE3IPYPO9QWeFcrSZR8*tA7|q3|uFXAb;}} zY`s2A>bw`rE+dnL1z<7NCi++Oq~(Xr@zmWXMbJb33=I`5PdbJ)*QpS%B^B7MNXJ9Wf_OePC05>gEi7oK!V{kIT6*nfREAhs!>jhVf2#4 zkvu;_EX9NwnLM@w&)V1n>E1I?1v!ALf=@Lkh_y@bDRDJf*&0c0^Z#TGwSiauPnIzE z&zCUPygD7awAK?X+-j`zG(X`Q@2ff6(5cu^fx{k<8T+WXu2Dzq%?iC6`bk-<>zeP2 zC#NPy%KF(vaJ91H!!E#{>RE9{2z~50?$OL2s60bRe{*#Yws7Z=(2DXLcjXmE$03S; zPA|M0Ym4J_826NWoDU^0r!$O#2-MJ*z0=iVUK;+b>_pwzXJH9uwU_z#EYm1~Q7;Au zEZ^8EJ?+Kl@1IN^EH~aH^35BAkI3?77+KxKJVX6v?%7%{oifzvKt zpo0HOtSEJ{(0OQf&Zkf0hXpJYy240%sK#|o?|-ta&8Ll+nbmDtMHcIs_0^uYO%I`D zw(j02KYzAuRX1&pq_GTRXSBj%!j{T$)j%^GyEtF`t}r4REbm$uzgTYAM<4nq>!F#u zBdl*!E{lxVw*T3ip#v=A0qj`A7x~458YQI_iODN4qr+Fjw<(GS$}GO5L3g1Iq-}f^ zEzf%s*8G=NF?%5k#j7mTC-Qz+iFIyHG~p1uDN&UZU}a?4>gXAkXaHLvtGeHw`TxN+ z92FgcsI#MqJM$o9w0@J2-B&{&(L@UrDeffm#q@i+Ip19G@?Q+bxN50b$PtI%eDsYe z$?lgW=mQ#3oj3_*7sQ`VUx|Sqk%1IM9w&0|?Rb3gq$W*2Tor+C-3Hx|(<#<5MHDLg zyUPup8SUDK>Mkg6^sH`JMjO%;Mj}KD`KK#bj|7Y+QlmT3Plh?xQbiUt)=|cfKKNwT zGl-_}VggxqglrBL2V+$cOP>f#%Q%?c^E{9R!9@9dV*f|w;M|_h(hJUrJ*81dnDdG%J&WZuCs)3co}yNpW>u|?;D9`LPsmxSemt!bVYCqD#i~G{S`a#V z(o50Gx9q1}ZVjbQdjjad^f%m>`e-Ou38!|E=M3I4RbA!M(vOQal{W-sX!PZFuQg20 zZB7@ZtC;+7W#;$9_SO1w>#!i&R!uRT*>}Z21&mw7+!w5dT@TfMi7s04neUv%)Q%j9 zfxTt^Tf^Ac-3q&Pp*y4w-a7RHn)W03@Xs(z5+_G6ugW4NbETl!_bu zbch*V`pk(l>vkTU>TigwrR1wCPR}9)l52^Dqt>S5&#jnASqBg=adZMR@JJE{omWgf z+xcyb1E&Jg`DRTuOiDj2zFaFTlOEb;WBo#7_OUcq+31@4r|MWfQArwb_d@~_Tw~PF z;H%SmatV|xXlDhY#gx!r`T1n$NnJBJyAC`Ke2Zovnvk^@dH4c|s<&I5S}JpqRQGQ$ z5Q;xSW%?EGB@+{TvFXGIE1lXwU=z@dMznv_e`^?8&F{;X&O)5I80=SM@M{{l@#{@s zQ`FFThJ6M2L#GgS)^AT|&JFdFGgsY1u!0es5eB;i!qIR6-5C~#J3!C7N+TA=hS9CT0IKWUrg z^NZo{e>a$a<8Si*FnaoSg5yAWYWw@^|Egg6{MY0E{(r3b=zG=|FIW9H>$krBm;cz9 zGVjUnKk%2gry)>zVqqn zv%PHlw>4p3=%ce`N{MFVXn&+FFZ^M2_)FJlPWX2@8s$@az&vs&G-Mz4o&AM3rZ07{ zt8YvHTgo%wBj;q6AS^&D9A^wk7XfFdPRbRh8ybR!BHKLJp_R2h&;n*@>r#MzP_(6fdTdB158N>yWCc?(%|yN=54EzD$(Ad4#CKh`F=3;Fe1 z>^JFO8=Ah6dv1mn0tEnApr)fsxTh;b?WvMYoR0$kj1!tkC(4JK<%2gM+yX8HT0sHI zNs;7ckfEB2>-N?_+`Zx5;n^YY;t>?RqM0$`{wA=*)9Gw)rZ|5TqB7T$X^FfxREfOO z{&$XbLvGeB%k%882yq$}%SdtACUfK2&u-tvIZ@k42A<>UUXcj43fEOz~9u z8;PjT6>gkNlR4}r8K))Re{C#ymS~|B%hX1C4G_KlO-ARnD9?^s-px6yo9E>_-s+jM z&33QfJ|fcXX!||SfH}}=R_+Yi|M04>Kltm>uV{V_h23qz`sgutM`$Ga-bKyPUv#Z) zu1Zds2XhGTo)cZ=9M+Ke0BBb%i$;P|(!XALL+sSjolOlzo+ATWN`6qTl3lA!&utYg z2@d8P0I%4Fr%ZFX@yCS_l%)HG+F)XE>^_oa^?oe#)QPQ4GzgMBMiMSsZw9Dv5|%4`MJ7mW~r#aNaz(Tqk0q!3(Bn)e2f* z56R_Y(kmlRLRRiez>O_b%u2ldm-%=8q3l+FO@j5Ap5>ezwfbFgj#d~cjWNcVby3PQ zoRbOgZes+h5+mrEv>3Tq<4`|gY#;lRa+CEi!isttU$i`Qp291p 
z`1sB*^rN5of@gHWdmQUcVZR;6nUFQwr1ws6b%a04BUH5CObqbv7HD9@D>q%Ce=x6U zZi)!3-@V(tl~E0pX~@OhD1NOHVc>lg9Dymf4$o&n^Xm+!wE>Apy~246#*jBDfw@0L zb0z@XLFsxkp=6ohD#-bo`4IhU?IAzBl>h{y?bVzPg@H4sxEv;}MS*Yg12LzDmU0!i zJbHYay-=UV6JQ+bUev}@kR^zf^=Y;GnLRf8n3Ql;fF+hRXnv%Ygx-aan^<8SXkfy~ zw3KCdf#+}O0w!BE)XykZw)An6^k9gAg}p)=1usfEe_ZS71+nI^@$U~ZWZf65=eGr3 z#>8@4fO3iYQReAf9m1Zl@P~Nub{!9<+8w%U`O0@(HR8QBJvnDTuW=W7uc$(!va^P; z1@>frAUA2c#=|Z3Df}P<(R4m~ek+!tJtG=t6k;t-p_Ir8ZKy=tW zP&Rret!!|H_l0f~JO@k@#APVan8ra2u~v!+$@mpfH(c_g5}Cn!m08>5Pj;g6_GY2pcLE2y3mmF~{Fsg&1!okkTOp`unVsb4~{+2cj~g*rnYx|C~)=1^EMhh63`4}^ble=!f4{Z}n09%Sto%~OVrxoxTf^)_#Zj5OF%7RDhHi{uXZDW`8M4}JlUdJq@?PeU8 z?ti+DfA!xFn!n}({Fm2sUaEP=_!d3$N`1qr@ssLLM(F5v*b(lbyLl)JL3-YR5_?L3(F=-x7V+oZUSa7();AQ|qu7s@1uT zsE5!eL0km5FukOl?{Y8lorl`9d*Nj_&9$K~$8*5DiOx9ZDA(PaqUO0Unen8M(ulS; zl8V3`c+8|ix|C-41HguVOveE#%UXv;-_tbE1uDHHXNVRzeq0yG+w(4WCZt<3->AW5 zWjR}Y9NSS9U43hmY8jpMYPs#QZ7N^B#50ZSl>fTFOE~eIt$8YGjaJ#SYX}Hh207I# zO&|Ag&H;)xC@$?T|CX^`Hs@gQZyhzJC-CyVSd)azK6bDsVWlOu&;2@G#!y2VM>w%S z?Ol#Fw#EcV)S1(b50q4>`|kxv!!@XdBc%!5FWpip5x&`ztr$w@KU_KT3u@g!IHVhU z`_C8fV)#qV*q3>sy%&xY2lgj1-dy6E<^SM=_36tN`(EFFP{f^R0*h@>6{XLKX;T5; zYGPr=Pf$*sl@&0WVqurfpECB@d4q(6F|e z$uGjJNBcXnAc1Byc8EC)K1cg-Z11}$ZztDeVk;YPy{(91wy_v@KmR-HAQ!HnU?I?C zKi-LGf1|i{Pl~KF&06^o%lWY|rq>s4A5y8eDmN`pLY(AtpiVvPPfqkaNs&S;&o_)z z-jKpvDxY2Mi(gmA)$1~uT^|&F(f!41b+7wo@u2a;fGS^aI%9Y0Jf}?kX%0iffZN%L zir2dkWCHh6F5@~(qioZsa?jphybQeCGVAp$USmQz0Xz8?wA8${C0ovu0;uoKXq5l7 z2@{N}S#z;8(az6ya~Z-9I~Dd1+Jj#gJ5srt{pNd`@W=x5yNtZ^XlyA}j(fLb8%G6M zJ+Xw8VkllcspkpDgS$w1i_SIpc=U?Oj??Mr)wvs_ssVZABiq` z7f-527=NUuaF^MLfNv1C6>ixU0_#lMtd-N5VgX!dUXPmYZC}Y?oeh9Y-z1d{&sHjq zYT^ZI6XlouU34Al&S+=bNWIYrA(kazk|G)>{t+^#+MOb?w@$oSlP6X{)%%8E?@7PI z@8~L>(a8+IwIbjY@Q(um0cYrcr^Vu|=V~3J+7-tvh`IaWf=EeAhkL(Crb3dlu?j#< zsw6obFR;NIBmK0#LOGSPujp!8u-d%C++zNr%l~BdREbfFKW(i`@G$|bNyFA-DE3H4 zM(o<2T4F7FvE7v#S|1k+5slhdz~=&Q&lZGOC5`eR5n#@NAfICO@1~51EiXS4b%Ecb z8hb|<{>`57nEl=!=~IHgemO-mQWm^I4=U=wdso^*UquI_!|h>5`1h_hctg_)nTTP| z(wAC-)CGV&Nk0qfO;V(FW#v+7lhjp~c=!rWufGoILXwX*p`<~OeNY0Hz@H`{u&g2f zKz+E1qz8)oQ_$w&7qoa#GI1$#obC< zJAwWbCLp>`AN|dJnQq=6Wx7%qudfyb1U-x;19pzM$kX$3fJ>TY4J|S^iR6A!3d{>$ zD66+S?T$fH_9|?n*slq-`uC;pL|y>>IHG=_p*`K%niQjdPSUg3^a0hEatBUIKm0D| zd(p=eKnIz!^}4C1ipYpY5vjo&c^#x;JKUE>vFZ20nWZ37#nt?+OkgJlM#W77sL?9- zXd2y|dZE%G3uMnY)eLzqj@qpSq)~*Oz+MuBbCJnQfjlWK0r#G1d|5_datxPv(9+2% z;A~gcc16 z99WE^qD{!poYuXTW8E;{VVM3%;TwlvdD}edd*zOOq_phI+jJ{!;Q)Mxk!Wc{BWXhq zg+0=Sj!7FjLK}L(9{j={N?TWQuz$idj7E`(7@LYvF#?b{Hn9*BdN;spjYiqW4-WC5 zX9=VZQ3=+hv3c|O?ip&4%T~IDpb8>+qGuO|02|m`cy9Etq!M?{xic^JkBf=d#nW@T zMLx^<`h0b5!eRE55T-?CVoUJJW}-g&2%fn%Mta->L?y9T)16?~VjNECMTjZe zfX1-QPCo?o0MakzoDUal7)y-rCP#o<^?{&lOeYjG=kq#a-PBSH$YBzeGvjU`TqK|5 zku?|3iveOw6Il>#qtN1&>Zr#l>O?vwE)bp)>MFtO7Gyz^AtYb`?e5ACMzrO>ziSn6V#eR9j9Dv~vaV~hC)6Lnp(+Ukh+IJdic z87>l5A2;stV42*6n~pNfInQ~?H)p|FI_W<#-7xgulDiWjjU2cDxU!x3E84qy5kCwF ziJF(0YuM9!KUz_4-D1l~rR!5^Lp`Oz@1fwFF6byG;FYlfI!|Y-G?kz}sr3mF*#<~y z1|FWTt~5p+mqW|7N&*oRR#=Yw^%X5$x#zja>fJq!1IsyCQn%T*tZqK-cKUP7xAftg z=HWwSV+Y#CSl#d!iJ{-vzkJpAa{$0E+uMR)<%NE0zSpi9OL=qIFXJh&qD|tZK$9TIz39$q@!5Q0e4Ao)Y$&uGEN5qa+_wgC z5@iLs>l98o4`$m$pF+L)yv6iRZ82;vW%)vBQ>)fJkaB7Yvo{Dx-V#Q8yH{m`APh_0 zWy2gE6-M1W^V=Cr6Hu&?l`&5eYUMgqD`fR%lcRTXV`2;BA=zpoR%gfv&eYJf4(~ob zsmv~@&Z0byN@3cJi*`9*t9fJLl32YZDn8MOxd#Ompe27Vf5irrca-$1OM{0bh-3a; zU?Ur<`AXd8EU}3e6!1?uV*|~havO@xcNebO$|iGe2%GQz;qPwEy7SM0Qy<#vzL#_6 zpML!213!M^p%Z(O{@bronrfV%Oif<#*qv{GP&?x%?LU|?_xPG$|Lw0Ineo`3UrdRe zvgk^x3T+3ci35Fb?EAY)$CD$hk;9{!ntU0&A8rS=%XW_cI+M;cv^&xkYedMFCFEMfXTn0Q#BZiB&)kbLbn2Y;|h! 
z8!~@l_1=9$F8^+L@ao`UwKM$o2K}zq>o0tMkBqC%Kh?fy!HJiK?S5{Z__d!}<5FRMn@8*1c6&UJR%g5Yc7nan{-P=A|55^*1G>v@zIVa5&GzHP2o?h+ zRP@-!>+ScW?AH%@dph9CC2l&m@3{t;iGedOZ8 zKTSC8`qGmG<64e*wULLjKsp^G$Lj!*M&>m{i4>X%DTJGuxik$6LHl_~j$HpTd!#`epd)30WD)aMTm{$1jqW{p_7`VZ6M5;MpFf0$OXW=P^IZ(g1m zlDK-rMAs?f$4*I`zz_B8#OnM}Bi(kL;6Qy2eY^WZC)Xi{kxx&o&KT>%H@}LzKXyvu z>OWM!y!zYSgr=dHE|+7r?Vfb1eODx(%t~VTp4#IRX6g%TV2moKFZ=8Ys!+kZuPd=>((VybakI^ZO@M@^w+Wo;6bwKY2W1T zvuQi@Jr6CuzqVotJ-Pbj|9$71HMV4Emg`XGU-$Bg+mr(3htJ>E7`M9?6}?ziX|LAz zHN2?ne{JY8$7h#K+%+<2W9{)VOZQ|BdFOYRM`gS_;^%AUPcPp++rKu0P&Ib(Fvs?# z9Lw+O_c{i}KD9e?T1aH(Dzed3Uh5x;HCaK&_fK$~GB37z_dYp)mtPL|dTvOf*B>XG z9@2hlLgKO~rjFmXfxNcMul=6_XdsNx=8jtv1aO0zrC6K5i%}W_z7=EN6$nz{+=(gmd)BQ zHfU(ZABWz~ZF~EHgZ^LAqAF}+Y;|#-E=hTBN8o3=qPRkBhvCYt!`N1~V!rxd-ueML zZ>7hk>u-8D&3Tkt;QvG2yT>(oXY1mUh5?acf=PjqN)n(TP$<|r;1(f3h*erH%5|ir zjkuXP?$I5(dq6=FZx8~Oo8WXx1w@M$3LAQQ45N}dCK|AX>1~`&5p0CwWeNmDu1S8+ zda--AozD6E{ym>h&)FTUK;G|nt@W&DJh;xSMs=GewVpC;tkPa5bn47DsQvja zxZ?=u-8z-4{9Ts57GR@B<=u>Mj#mrqm{6By8n&g@^I~@MI37ODcoMvvz6?1xE>NaB z)|TF-jw#PJWFT)V9q7A`oR>-JRrSdM8J#0%j9ZbUN7^)%H4YrqqC+a~%@auh88-UQ zQTH6H_?(ik9a8z(uo@&f`kCuimEGl@G2OmJhpRm>Ig~J8Zy39RS?u6{S6}Vq76nV_ zHy{+AQwrIlknCiKbGjD}iNydEQN^8XVO^%~E%$7LUooX?p}zpBM!Z|E5_ZYfC;0vN zkdaHy>yF!3uI&;98jCqDEP0Yx&f(bGqgop4zCYt@D$Q%8gdg>nWKy7v8xQwHj`) z@x^B&+KrMCFBF?%XO54)@(8XfE^pw`6&dpFjAc- z_ojM7w=dFT6YoNXq{6FEcU3X>b=LoiNWc5N7QlKd3#`c}6-6_%73-F&2VRU=J)oH0 z6lD2o_4uWQg=wwTZs796i=5LU*Tw)h2NAxPGB?igiDdEK(ha+Pn>F5bN|CN9oz;gb z6I~o|EUeYlFOC6MrGr2j8+E8e&-goa;x>4!Y1KUAAMIw_gb%fMeO-CFT%~ylIGP`m zp?^UNK#|~AY$`n*)*8mm;0n0F&q={Z%e#39)kMdQIMcXP;HMbWn|i%pW0(Q|Ud;7O z&BT9zhbgm_@06&@fteW?*9`dff}}jmaGc|Cj`KK(t#eM={KcHeg`ZU8k3S#mUYxqs zF;p;$mEPC(3@c0zhApEVis>s?SMNgQtC97pV{H5q9f6qZkS^+VO^BJgdZaMb2P!Tg z3j0J-IT{)Sn~ED_i{n#LT;(P<0;uuh+<<;}>z%1`I)ykehf)q~xhEKnQE_f+CP84f zD*QZd4?nJ7CZXi$z$;15J&U_xlc5*ZF}TYQSUk~m2B-)9Qnub1KPxWPoh_1v*HZb% z(fgyjNpjrHLm>rpZ%)@IN`c>+S;>3Zde3~x>Y|xXJBC1ozOA(=ELX28^53RI54uFJ zttr!9n!ca+E88cM*-xDCRw-5J#_UGDif#PtHsZKT69=8+#NEDi$}W`Hu%wZd{FvP< zf&ZGrs@CF0m*9!?XX}NjKw`UO{F~W4L91)ut|Z6NPG!1tg&vo-6^&F=o6L)#wh#a6 zoFu9~rC!x$@2rY(>^|ojtZt(AN(uN(IFrk*&~xr^I60lj{vk3wJzD=j<_Wt*aneK| zW$Szc3B;GV@^gX#=5$eeb!s)|m8HW$mq%{oSxx?0?Bv;J9eF==Xl{PzKGVk=1`7Xh z5@uX%YMT5kG^-$gcF*bt>t|OhA{Oo@F^{$jmC1n@lM3v}u@U*m>8!kc_>CdP00tS^ z?B?NVQjwDB5)a|{8u77<-FfF+fl{|iZS;xM^q@sLQVw`b{cL*Y$gcs-)VTLIagBhv z68umC^wl4r#Ig8|ESzrUdO<`y0109kO+u*yftK*~b%T|-`5B!I+UMKh<0p9Is#7zE z8n&Ati(j2MV57k|l{s}C63$v1ll{7=zSma|9h}A+xDuO~ZzeUxWED%s0ESDybO`zR|};3R*>1)nOjCP|3jT97NcpjU0E z;(NC1-^xUzK}Ut(AH6Ww1wYtblq{`J8MN|S&UJ|yhlr7<iA&R0%!}dEA8%;* z_UGzymi1diG@`8t9jv+n-BSPZAu-?@-6@)%5?s!v@~|?Bce&8Nu3nD}dk%OQ>_&2< zQIb0NOmqIfpk?NJtg)7=zi(gTB2rN6(F6Xt_J7lCzYb9AR>R|7IC6AD&A_#juRNNa zxHTV?W&M-FJQjq_0>5#Laa5wzrqAA{R4lZO-dMHP#?P~#_oI_ciSo;_RSk-A zf{jlBa_qqg^W(&UR3JM?iqS#EQr1{e%gq5U%3K2iA}3en&lfyy4Oqqisa~}wIJ>qS zg%!^v)*T(C?#b@wq`=ux?v%Y#ksVbjjgJ1(Or6TR1b1Dqx2jQTREybcgDOQFuxn*h zCv%R;O6D97H7b?QAzHI6UDikAkwv3r6Ft^P2}1|&9BvEt323)j_=2v>u8 z7&X_p3qJ?617VnUFic>SRI4aqBJ@1_DVUR~n&t#k+atF1!?Wg}3#;N^==D7nPyNhw zo#M3d9t9@KOLYQ22S@fS0uc>SqBOHK1{a(GR7}Kgm!dp3x*`wbZg|(B`vp3U#gS1y zLpZjS;k)2(AfY$Qwx+Wy3g^nS=1;7Pqye~BGGR40+f7Wn8FHov6{dGKVfFbd`7<+{ zvKEhw^($YJgj=s)>FDkrNV9ms)7!=VQzEjai36 zxzS&|o0n)D#($(fkWVdpr{I;>8qXoEFo#jW+N}?*E&1^edi95ax z-LzOKKkIr;wW7+NFdJ}fvHk^-(3iD;)PLaMbj8GE*#*U;W{;lPZ!0FCT(p~=Kv)he zjcFd!RecwF7LSAMG~U!6wBFRt-~OgVg6^)(c!9!FBxQkuz|*BBHQ_x9Kg8T)DC^-# zY4URr8ovUfMTw(Ys7^>-&WUOyQKU$}Q`=JUXXyQeVm2b0q7bJlije{VGuNVqE=0c( z-S)2zsatBOZu-boXe3a(8rpEmre0LkIy}7=G}9Vz3?skAU%}T{FCO0V@V0r2*0P8R 
z#lxG{S(u}PEOi6oh_}bA4-2y9CM;9OKx^~_tc>E?9AZB{)Ib2JcmK(m>V-B{*TIk> zBe4J%0)KF(-P{U(0zTmfVKpgoQHXJ+&Ci{xOHOp>;V=UtSMMkG7Bsqwe9#iPBBkoo zDnOIxr0z09Y^Q^x{UNcNtuDl=OSG%d+lhlr=aeXApz&ns z>=(S)Gsd!_`Qh-{sgp%BUk>k|HDCB4Rh45o`y;OER>$T-OPqCyxGJy_8Tx%|(AvkA z10hj1Br@q#GUCVd9}Em8@D?EjYoTu89#ZciQiCF)|e6k9i?Ix>5k^fCXmg3DVtYW33 z+bZZCXpj4$1mW25eixC_x zm5Wd$Gdd3|8Tt-4ycj<(N~seHKpoqFZg$|fnbIXk?Iq&ia&8>k?<0Y%v>aT$vQ4H3 z$QFYuWCJ5>Onqsmta*whmK86p`k9_K>;69}p48Olpl%S`9WbAb@S^gT*zMemY#Whc zi|*6W7OfEQ5>rEQSgSxsIpwB`3lF2xp|f={$#KR)Kq0?Cu#2{;@R#(}!=iq_VsPno z;V;=T9X$A8oI0Liw-6Yty_X28F5~|I4}@=pb7fGG@eFzdq{k$jDOCZDcRK)itt;+B zTgXRt6PAZ**2S#N`Li!YJTZTAYWUL9VCXn8$u8Dbk4uvi7WR(k*x80ZudwA-t?kP| zPR-{1`E|#YiEi?DargMUb=4`38x4MS%E9eQLu}iM+8PHC7#ScPag$32o&D?V^@vzp zwEQ;LXk9fxS#tSEz}}S{z=N0&H7MxB)N-)Op!6|DNH-l-+E-Qq;JF*ysKdQxPsEg( zTM;X6`@x|aMQ(C?dz)R!#6z(p((tF~uA^bqIhU$9s&r8^NLY{^^3-7GO1@7gOC*t9 znzPj|efGNbQJi+6&&6`*r?7Y3&%`JMm3N!B$iei2h(WzZbcCR`;~D-={(n8tIqO$P z8X8_+y(oTgbF|_8rxOM=AAaWC^y_m;U;pR1J(QiFojjj6E ztH1ec=f8gQZ7eDULti#_+h`*dl5;}^pJss4{Y{O{vQFz=P2C zU0dF{7tlY}>^_n#Uhnn@o#j^O$#`r0xf->C5gbr8Q`=ar2<+uyo&;j zf9mk>{&k3#!OF5QUfzFSoUPql4v0!X`+#6+CNHL4?T!COw=6@BVaS`f#hy}KHMiA% z>PX~#PUQTUb^7XrcIKN3%S6Snbw^R))bxR=3t9Ug?bDhbwsu6CLKU~Nmb`7F>KvcU z5DEPJ#kQh4RcAK;v_Q8NtT~z=VVyEWo#1^zR|!Ohv4Fk7`A%JWhyw>dvZl?*5n$-z z=2;gkk~#um)+V7%FrrDyjmzYxg{XH7^7CA6_^NvKjyA-Ov|jyMxzP*3!}rMm@UKU- z>tp31sr4W_WNw~Ldes(|7r`7U%Yn=7!cRam<=Sa{in>qqN}x|y-4S4xqgN-W!gRf^ zde11Xp-el4S2$1q)-w0qg!$^egVy)=MdpvaGrKPQcIWkVi>;Fnr)N(o?v1*@S)8^1 zySE3H{&sWS(tN^D!JIG=6o7v_SULtJVE{qdjha^^f%m}6A+t3CaG;{5bs4t>`i-mn zL1o1`q3u=sb5$`mcv?REb92e24g5TQ%tk%>A8L)Zrn=L&k$~5w<_L}RiYlSMRHr)9 zrm03t#6PF0PO3k_MUnDt4l5M{Zu*b=)`8YaY(vTsf*D5oF}_ZH7B5Z=1W;Fw{wx-) zi%NBSSHavJ%aZlr+;|=g7MANJk%ezJlr3F4wE*FSUOQcopk4Tv3B|`-?CrbQRh(?! znBBll$CUr{Y>|)3T}4;J3}7{6>pkP;)hX_D72S>5pmUnOw%o|e z(?2c6eQ7cB{4qN6TvrMvQV!zg=g(H2RG#l87SX`MtC7PMwubd${Dv9f-l`;VutbW2 z`JU!ECl=TALYt;JUC#GUZ9DL-oAzPoDb4I)Mb?2ww>Cmw^^fn61wAEkXa zbfe?a((%w&=I-zD2)B-|K2`@3`7uxObW3P_~7k*fQ|Bn3bD?NgyF( zG~1Zo`e(?!UHEyr+7Jf-e(Hd0U0rOEHS5tR?a&V-52iSdG9MkhhL14t9}iV`oY)gN zs4KuNxp#BedN~pua2O4!vh`2vtC@#_(J8SRc+iFM47Oj1b4Re^sj9pMY5b_ZH3{LS zk*j&b3!f{NW``D=6}o}gqE`~kJ3G)TS%$;S_isMCZMVT$vdfT|ArwVB?v`Ypt9Pu# zz~LkqOZC|{4e7PZ^ZFLH!VGMo6iJ%Yv)m$JV zU5XrE1rcXTkdDAAm5{QLyT{N;Xu^6@?CI}>a)~;fs;PXN|u(ZYeFzvq1G3) zJ~CSqS`K_^;Jj-t<^k0DqH#Hto|`fVa+B{ON_01FlU7dZu41dy9mUk_uwwS&Qszim zfpOnp!GeEKWw4X`3D|mW6*l_;c!t*6x)j>5l3Ne;-IVcw|U}(408r| zR1ef{q<^4oAu5hr8aPfHk!N5v9}yd`Kelv2LqpnFTTqxQ4aZ&OY4nS=IdHnqg?Tr& z$#lN)m=54Z+u3Z?ozqmtICE_f*Wg{6HJ;Qq*gWGeb*yCu3tMFR^R5nDjx+Td(sH98 zfP-i8puKNuxss#$TAkwPooa8BBv#sEqza&irU}J|*UU6r!R0AzXz?QFzZ^cG_UJKR z#xg{_I{DF&yrKOQSDsi`@b^wd)u|%O&@^NYMi~74u~w{M{>zTdbJGo%W+$&|ydz>v zJC?g(j*ZAg?%k^KYsXXzq%V#nt{q4SOsOXb2+E=8mj*d19C;xrafpxcrl5aXpM#zn z46_)cC%=doIid}>hPe;%@-2hgGtncUO;U+DawpDi#OeOwhL9@vB1-l>%go z_ZrS?DiZ)&md{jdt5OcCxOw^~_n)CD&&X-x31yd>+l;EX0uaERjgT}(3&bLk7=sGr zSY95`Pl(?~Kq^hn+|a`sHNOS=fM73Bbt>KQe>~Fwx8S2P9`G3p^BY04GEArD$E+v0 z!>i>oSN6=u9~9j?<}?AAqXqudq%tr@;?QTn1&2&zrc~YbReu?XO;a%-Fb}Z zG4AzlN$)hayAoZw$>;|}EbWqQZ4;~b4V+fY4psIU`mF->j$jFzN5{jWKf@Q^3j@QB zCDYf#izUMgmYu^zll_|}Hs^mt;3(+E=L5q9ma`|bmfr7BjDEN;!n`MI@$*P)Z^7!w zfOxI|g&TB6;U6{4vfdP=NPJpIrofB$VRqbi9*|UDiBG_^AExuhH!-nkG}b`;1Td7z z_W{1i06IcQorj7l8JWu6c%~c`bQRi+EW@O;Tc&rA!Jm@Qg7%$_}hc~Q|fLzfFM&;4Se z$6|prqi}BDl2cLJH!GsW*}ebl2-&=&m?6d6o*PU880X^ouF5z@5lV?grcWGq5Z1}# zsJR7&oTyT2n%Xq#ix{PXoZl^F1>< zy;8%S^Ynhw0B=!rs%d4XQA#HmWsd;dKmj6cH7Jc<{E^EOg2Ghn9O7s&gMlTiy zi>`@-%q*rwy$Sjgkc$|$?ZA8@r5>cxQRUoQkj-u6#}(rj?pDR%i-SpYXJed@h|>ra 
z6R$O{@(u<1ciJWyB*?%*0&T|+!FO>yro!K65M&IRoX2+|7gemqbw>BP;*p&eYwD>` z_DuJ0T8(Dx^_n1KTGsy2+R&l=`(N%`nrWwY{@$zGAcA<_)Fr93eyT3023(+k@6N(3m z`wlK%*)(x!_O=aSpsbP>yeM!fE=W7KGt1hyXSi?y+g4i?k7^nUCxu*mwAwQ2b)jgY zLue$F!sYrhhd}3mHpy`mNVL%y4OtXECyt_~E&Z{s5#1%x0`3&11V<5@)ypu)nfR@Q z9jLpgSbUn)wNo>vqk#}3TLl7Yq`*uBwTcqTDC;N*2oUAy8hJ6^BGyLCa@zR8faNUxZ(TpT_){YmJ=uy$bVnJit$b1S{$*lc65N~I(zO%sY@Q`-s~VX-?pXebNTaX}H*k=6wYTiown zRMm}q7Zykd6vWDuV4h`mG5vcjz;o^DRZOZ;T_g^IQBR2X)+7yb(@ZOExS;ohVgOga z2sBR+r;JD?I?rI9u?lyfJVq=Ds8WI0sEmOqj$7gNqdS16MwNa|Ri?7X#o>hFX1FmO z{W;BlT>(ccMH? z&=my-db?UKB(jjnF84*tV9Cfq{b^jLH=vUwLK5HeqtE)ixo9V?0I7cb#aC2$fCsp zt@V8{Zr|Bqj5~D1F8tA7cPVcD2pxAEoW7T}__rgGi@gONg)h{r2ew03MoL7rcOEZ| zMPV8TR-B>zL=DMu4D7x?8%aIByaQi>JQF)X0y|8#L&OUkRMckSUeQn!ZvcB@7dH@%~1&Tz7kKN?&t=Q;` z(fyY}W@C+YAt=&}fHq&Xj!u{-3#@A!-Y(=On8zPz=NimIJG6^$u%kG(N$a@Ty@|!w zs%tS@fFNR&gM{;EjgX)*zRE3GQA_DtjRr>JEd|$wUed-71Yfq{cfm#{lC;$-?e)(| zff&FmL2WG^gk%vSF<0TM@+9=BX$U*<6K#;y(pUvYxSoX;z}DcG3f(I!8#fm$VXDnG z(+h(s0-MxKSLCc}l@g5tWO}?r5KeLHO8K1<7WjJ>zj;M1XgF@GtK82i7kw;OxLrrB ztjqVD6W*BeC~sUl|1AmCiAzlD{Ye-{=1+Z(i+v;F0AA{FbbfV%h)W5>p5Rz9>9bW8 zZm0ediF%Zgqs#H-c=}2uz!&rH%nUH0bjNZOgCG7Kmn7+NshuMrBWYqM5$s)$R*0y? zGK8MGEL}uH#2l6Os8~OMga`*iuNqS7?$$X+Dk12e;t;wm4$y1i2OXNyEdo*z9pTZl z_;N((yy(=zt(%H(BF!BQ2WH-QGAf_NuR06e%oJt7F5Pw~XHHAlRY{274?A4|nelk?FC`Qe~Y-i%(LkufBa=o~7O z22}|!1p0mCst57#8i>g_w^;Vk!D=z$9USFHuC}jTp7T#Anh9NH5k+ek_O#J?~DWw9;y+{FLA+zzd%&R30Mq~%)n)c z)q5~|h4o^|MA6BCmm)m0|1r^+Q&*sa9>XiEV#cl*<_@c0eczMY%C`A;xWc%e)(rL>hB3_`}c-q!n=ss695_ zhA)FQK@25SUbt%%Cq(BZC3ggW18WL-zp05fx!NXZtr!a_Fk!={}}*A zhs%i*K~Mt^fnRaqc8&y(Q>SXxsFUim`4u_`>It0Pz*BXyMHu$_#>;DIIH*2SZv2xg z*Ap{cweW&hqPt9oU+6j2?YD9Yzlj?v7{3mjJYF$m={Mu$v!szha<#tq#~&jm_{glE>jKe>IAhnDsvaXN9bO_U(7jz9y=*! z8~pGMg9(p8Yk)r@=*kdWog+I4n~4&s{Q^k{N~V}t8`p&^tF2!pSSA$*=Eiq=nCGnX zb7shsD+k8T59g15rLZ1vDVlx5`eD}Cv8+GwP9;Vahup`DmWuvYUeMp4&}xUT+d04Z z=lWNLo4UT(So?1kqo0V+j8{gzJTvgCFyDLc<^B1cAi-N@AHCf#nTq}E_T&HB6(#y@ zj#tn*S?0ykY}VE%uYYv0VPH1z)Y$wxB~RpMTpXKWCDl0|_imIYijSs;fV&Q3pR0et z;0dCxe$PF}C+-hrZK4cGanM7UPNsW*?s$3yw!|a4)Ib>KV+-9T4%t*v& zmcw+h9J}2q$d3>cH-(jL*hNCYr&`0RQXF@&bunOQNMBb20Z6Y3I@n(PMPi*lV7=IA zJd;i~7Oxy6{dHXat1NC4bkQNkk8yR}Vk?!COKIbUcXsBPwZjYd@V4&h(4vRqUt`JS z_{DIOZjCi-=8F$zPaXWmGL5(S+|05J?Z{f}zi6F$uzFo#GKOjx*j7OT`z)w?lDB!* zp*s!C)*nL(7effnFG1&uW(pb|jWcL}p?{+W9}2#tZMawAub_wC2s(>V@?esU{6@pb zj7)MP;CVsO5TjCw(PFJ4YzKlKh#Z&}PXcOFhRC)t%TDgN&u(#g{wKQ0lJc_0(6gT!o6>ZAPIxzEAq}6ikM!1;N=S=1I#X{MKkSE6laCzj(^d@dbtph6 zx=_8jcJeAya7KYhzibf@YZwp$Jb3ju%UbA6Rsl)#x#z;x7(Z7xaZGOb`Oqh$XOh2< zI&j?&?Ee{7a=&q#EtD74dvJx_qLVRC_YEZe5{UT)-ElZnP|=AUk)#&3yFz_a&(Au? zh%-p^3`?N0ZSI7uHwSY;^1Vb&U)dD7-wJ=&)x4rVJwCV5)mGzTp7kN$aR+yIZ*_@8 zyw~>y??zUxYEA8$l`psjk399n32JHqRH4yG*H<j3!GH0`nLd#OIB3jgy}wV(BUFE zkwO3tL&yQ;q!Szx! zI=^XnE;P?Px--&@cgR|1v*y4a<@nRPB1P8y{q+K|frI@_! 
zX*>-s7!cS6atNHI_6pPQBnXYh7YT+i?Jj+rTujX&yh5{Xu5emlr_Z4#= z1RZ=Z?C^WFZ(Ln~TVISwP^)w>2;&ipg5A2xosJ+Okqk+1UCD0^OR5t31k-(0uc=L< zG>O@{<0t~Bj2bZ1lGng;>tgiB*t*0fneMGNG6ire>{V-RKvGKC*FH2tYx7idSWOH( zQ-To0BPx#EQ}qGtt7T;XzoCx4QU{<(08@k!Nc@jrjEB(;l5iB^v-A)XgP~c3gb(yTW^>!6 zrgH>=YQX5kz!ZfMiFyxyP1RYPEilz?Mxv^6Bn1^ehl(<5`79FQjFt5|_!jrUW(vxXjH!_uCH#Wb~2*tfknAR{C>RT*G= zp2Mw5vGEi5x37;$J@i7o;dg*iHt>Ymo`&K-*BSGk{t>9Iz|Uh=lR;qj*U(4u1@I%r z?A58bf@^P4VQ_YlKPwIP5{*a=92R&BJT*#GzRRQ{6mdCCtZ9zNQp3@yya}RA3OM)f+ctGyP0H;n&^Fq)UI|XB~z`M~=INBfD z^Ty4ro|(@*EVsw@E#19!Vc2r}R^EPNsZY_(@R`3~&dJhPmo8`*XHIHozqr{m-8!o< zUpZ0%1{}GS5rX3eK{^b|00)6Pv^;l*8#(+GQ$jJV1;)dXLq4|=2446(q9yFU400La z$n!6yL%^eK)0|C*xrpJZ7-63V)TazF9dQ2 z7Jl1MFy4PTXGrnQy@t!Pf2kNQG|yyRntui=1KmVd!EDwg>-{|g2k(s94HV8?c_+yF zZ8Kg1>qX*~CRB{I)Ur6i8$zo8ILd(}V@-rr)o5a>snqdT((Wo#pF*Ryq; zP#JV0fKw4$kRXJ`&gh;%xf3x#?ji3BsykOU?YlJVVV$CCI#wN&J9{?Ga(h6l!2f-I zTq*Pu3F6pZb$yjh&~UT*VpyO!f`JN16(cf| zvV3qOm1jAjG>JW~vL6*lU=C|jR&LR6WgB93PjdbG;r(%fQ$Ahaj0zI~-bVkHers?t zOT>y!4Y-58)!^3;0&S%$yGsVT6v9++%(ZB>4g;7x<=@3 zixd#>8LPQ=ZuDm&s5uzTB#D_-d%#7*gNj5-u^F7rOT&X8Up_Jd{?2t2D_DQS(dfsM%ZB`fRz1$KhhTfUwpG z<9;lJMI8(kiB6!`3}Cd@drPS#`p4s!hJ1B|lLP&K_%?{o@l;alGvHhVZA4SP4}ij9 z20_=ca#yb!!?SWOrUZOAMZ7R=fakpW>%)M87}m+l#J13{Lbtq9B9Q;9yGYIVx8DA( z5K3!T2O)U4|rIQUDD*scOzW< z1DA9s>dqONZ^h%K7v4UY*)dh<&STCPdS&tMkuBD_Jgp^2VZQxNL*c`rVa3!pktVm1 z4-Vej8DzO)o%`D@mn_pB>mMQ)(o}~bPLsI#aYCcnY_~2iL0#XIRjZDv@cWBLlYYSx zuGUws1-6cZj0ZkV2GxqQfliPief(_XPsQLSW*${KZiJ!~%@oo}Ih`TwFDt50dwhWd z(SzbkaxN5;AaL<#06NenhVvk}Cca1>huH!+LtdOLHkc=~MT}z5p~7Fpe|HVvAsq$* z77jb04rKQ^&9kZD$&h(_|HhpKiQ+YNZPt9>JNpVB#95}^Z?H_-MJn!WndmV;vYY6+ zL2|A|=fI`G1&mYzrNWL>sWv#Dz`M)%W_2~re!_9r)Lw#!8X3K&-2 zo8%Ax1ktHo&W3w(&& z3hfNsvx!Wge(#>WL%^sOA`jz>@NnbG5@m_>@DwK6=aF@pF%Xh{DCI6ZLsaULb(aPr zwbS2hikv@*9iNeNCEC#qhMbFwEgcclkq52gaUIXwNAYZ;>U?8tjZ$SECQDnPzY)gl z<$B6W6P10M)G4Suv($I)VUtThloH-XND z2mx?}4CKM(CXK1a*#*bwtl}clx@3ARLGfPapi)6>|7y9$I}j>9Jh&*#G;|KxdgN5F z4)9>Q1eYsu^YxyhIx*5PnH7J*yI?CPCMf&2k599u;Zb+wA77t%Yc?x?VPg0vjpyvA zKWIFU zZ5nQ@#9P4Dc*Brr6%FEzlv+t#ic7?NB}DvW^)J z+_(sUAgGWGL{5U(#tw>DT3bO4s8fj7bJ!RH6*1aGj50$o+ho) zAp%NH5P{PurwxCvpB9$*FDsbvEU4(0-570YSCKhg7C|bDcv9xsDZaW7@_+yLF$N4ZSf)TRe6zuDXYgX)gvGp=msL1w zUBD8L2I~XGgQqLAeI2^8edC32u!aJK6DiW;Y;y`Vm%|B z9DUnC%tFMG@cy!(p1Hs6>9}?9-j~+Nvsf-XF?gfW+xfJJwcR&PC6SznaeP98IPh(( z?`<(aOTMv&@44F*reAR?q?RBb*Hj4^S3Y<*Has6)gb;ODa)F&j;&3T_dJ5>>ysNI; zI^iv=^}6E@&mAJKlIjD- z2TeG{KdLPs1n72bmco?o2D!kVvdiH5tG(Hhqe=sGalz`6!-5lI4ch5X_6^OP+-H7R z5;-@&HuAvicNZe_XI}>QS2~=Ru(@F3E0^&|>vt1H=Fy4GGha?D-EUxeP``?&7dR_c z2v%&BL?!>6c@kLz1&x+-$wmHq@k}CO8(GvZ4sSQGD>z?GufXv>;}P0 zCkF#FgK;Us$ZkQa*+{1k=c@plDoz9UxapgzVJ#!?2F#nk4sfFxI`)DAM{ldwA7^N* z0Zf=VHhAB)2^X&cuejK>uW-q-3~<*^g=Nj}d&T@0>yr6m(B&yb(OhrjLHEPw3^9id z+bdVuxP^F$-MFa5_V%Zf18gOj;t2&8^v+crqq<9mVn_}k^dU(!6B9f2k{2PYhL12o z>y0Z3D^ODSHrQf77E#BE63OiAYIDV5pyU~`SqONq*O%O&5bwso!X-thf@h(%5{Bc* z(_DdnKOzwXt2L-#^7E<1*Vf%N|M7(9&i@l2x?%sRNyaWcVtTjkUf{K<|MT6u`#$>B z7hgSmseIMT{AYrqhd!yiWPdUFaJFZ?(~iHflbu)EhTJ(a`sSJD$h3*Y#k;fS&10Wd zefL`3{s@niDh>SYI*deDn%<(J0CcDffmt5LZgS=UwuKzUz`ZysIb`YDs)U0NcCKE! zuATo{yY$s2><;^8=e~oJlfv2YTZ>~O_i`E(kMt+A<{qu?nKyfEHh+dH{r2pzb#m9a z$Ksd<#z{?x&97fw~P-l0=^|sH@{@=mXVi6-8BKs+T{r&w1iB9ivxfH z9VLajYq%<~FsEW@@!@4f{=-LWt@o@8p^(~Id&dkl;rXK_cvWlU!O6>McXrrEC1>|K z+i)ufAPgDfpOWld4*$r)S85a7LPAh@y<^1UqcX#GF@|8^ z29s4Rv{( zII)*8Q5=Q+ki0axAGAR{)9>viAPwUnd{5=v4(&00aiM3%u4ge6TNfh>#*Q~c7Cv}? 
z^}2`=$Zf5eo6Ucp9_X2yS~YCB5~{T}V;k658rGa%X|5>U0j*{`19!8*8sJ;g<8vSJ zYs~LT@52lMewqR3*&XM9r^dJ#T#ygc3_Eq}A?t!umIgck$i&sA5<9UGXvkfCk&(FU z=0Wxr-*=w;acK15kXujwxNblEm&@k(aRlEcnIxc_a25nAD(GcK{2)zfOP_*B1vTWD z>Fzr>hZkET6%YH@7QPuizWP*wKs#y%~dhGda;-hzq9_Bdp7Lh6VmTK#5U2I0wm8P zV_BB*8<9(<*RhCob;JI9pYF&q-)&Ban19|0a@lUwHsmsH96Dz#1EuCI%|>6N-_Al? z)^~o+9ka2@;dk*ETO2t1tJU7x*m@Cc3QVjp%mUC7zCju3C}f}+DC5!6zOr)TnR2AyYb5w>o>!S zCF{~?bKcP0?QeE^C?+d+j>rcJ7cUN2S4S32_N^YaUehkkwJPj`d22Y?D`m9V2{Kf6 zRC$v?pIFSONUMb88>t*vl?WV`<{-xmu(4PizHpYogc5 zT%Nu*;L$Vl7rWISlr#$_AAT?pxoEr*RJdq2ykx0Zcj+O>qgjg`d{a!#;-}>%$+u*# zZw9JCTL}#UHcN%Khn*Me*Pu`(Bp1d9Aia};28LUten1$~ltFAklTh>sV4gs>Z$r98 zT7=kfJ4Ba+g|M6pQKj*0hge;{g>o0A(WO%)jBmt9F-{7jF9wvEDWpF&kgR}mI4u`} z-kMn*BrDBY7>PXiU9)wPq7WMdhc7KYbHQ+@XKHw9@|bnLW%VojXRHNZY-za0;e6`a z#SX3$fH$XObd-qIP1tuElL4tyDnxC<2A9SzL>BiDIV46l1doWMUUBd%2w`_Bj@H#8 z@4>qdTn@p>cBx}2ZgpkKVC8jGXec$F!~6n!F*6~XrCoD_l@vlPFwLN+#P9EJ6o8P3`y4ogbFI$ii=#()a0>hQg_PErj0u+u{CsQ-j4zo#gF+0Tr}e zex@8;fDbCrsdIDnFglXoq}MM#8hdH*fk5sf=|VRMN(ZbLWjf)xkB_uiC;hYb z1HSCDPFNpwSckLzy)1k@Zn$9PtH{NBN79NWO)Z7nd=ImJ8B=_CkW+cxur=1HbNl{! zcebz?76kOT#k%TM-Q26`xwVenl`-Y2#CDmg-l1xM(Pl&1?O$9N51Auv01fLE#vcpc ziXKz|xr3B!^EZeWfILf-$PLZrx2}Mgs}1%)ut()1aHF`(lEeD}zUXK((f($Vc8!#K zN?61b?cry`Lk)op9`-7oO6c#Eor3MD0tv@R@wapMz(it@k1Z2qaeZukMJz5IX*sp<*^x*Fdzy}0zcVZ5 zA~mt=W~u(o_hLt4Un@6;%iCV(XxrU12zA3o-L_GAF6XFg?#VV4oUqPufwZx<7h1$) zc+N^lKyge{8_A@!tym1e@uP!7x-DfCkI06BYGIxmHjk3jp0Xr(k49%qS2D4-B?Mv- zG~6-e>g#V|?f_ATdQ}zSn0lEv$3_%oBUcGSy)f9sqLWsjB?yRuvyWl_7WOoqQ3WY_ z$9f)paZ@`rRF+^hcPQq|un6tDTM!{2YtHukbvj5fJL+MX>&9C%O9n1IynDW*=pkk) zg}){PsV%^ugyF`4^vO$##N|S3$`DUTdk`P{#iJZC0k+96zE2f{>?kGvl(d|fN&mN< z)D7(od=H@JV4s0_Y&R+*V@Z_a6CjPwGImm|iB2ll;DDenoO@?k1zJnsC-eJXBe9;@ z{NTh$geTX-3^X%qxjztDxcJ82s_UEQhFY^My$K$drk>bqsNStIaG(#ultP{87`4~e z9~7ZsTXAOZv-qu-2b*=yBBVqrYR3$iro2henU(8FwpoY-&_Tk*fhLj>NC0pXfI%@x zUzPxY=OD;-ryV?BoznIwa3hRTBshTBczrPz*QCn%K3MI6oQo%=<|CbuA7vYK=QJb4 zvdq@>XfYcr;9h70TU}c(qiR#m_t%>Kv_moVzMbVxko8ee(WCygk&3y$?DPmPG~nFE z4|cXR9JGApl2!E0*O7DIHdyXrb@cp~1rvBf@^$%6GAk|D7~FDd^RR>xApJT9A*kee za6)N3olMyXH>5~FMOxeANhzI58wNnI0>v>RQ1Vf5Aw35&@qmD6$Nc|S}!Sd{XR-J zoei2Fym@M2VZ+4bxrf8r*&C1%{B6%f&-^n0nippm_gh9MdX}zwM9ywOXa3g7bztQn zz@6%C0Re#L1ZEpvdQ#;86=E$c8sv&Hkejrm0Zy|mfXd5#Iczmu>fvYbOQVp3U;_k4 zo~bi#6EOT402z2nT=7icHaNn{=U{Fnbq~5Y^a#wh{)wkrK)Er7Mr=jO#P=}P5x!;i zZIBt9F45z>8h{pQ=2MUk-*#A+KK;7mb(FO${vAd014RqSkV1i*!|BYGr->QHp4;aI zbpgAR9o@Pl*}nR%FxS9Z5QnYT;u^XKM7R4H)y+zTW7hPzjD7KIjR3qJ> z(vX5S4I(^|IMBEu7;qXK;USfMwrifAHn6v#hSBD4tV7hLlpC=y40X=63y_NepqYG6 zH@SnoO4H?e@bz=aHx zS}ul7sN;Ez0UJh2IwX`cC_7Y)`H&n4@RH&NIQSM_NQVxUy!E&bKIjR{i2Xvip)CNq zg2Vt-9StI|8FRWQkpL-b^h(GpEUqx(z}WHOr5SAznz})eOT_Gw$5L8QWd7)vcGg?g zIlJM7K(?1k>Dy4{-4|txo7JTJg|7N+xmp-gXW$$S+ZtelLcA9{k_Zd-Jr{`oKZFQy zFiMqYow>pbz3NrF^~j{V3D{@S7J|n|hf=n;l#gF7Uhm9-j<;BzmPum|sJLmFH&HQ} zdf#?56w0|5Gxaa)Rjrzrd?VIyY(k7Y4pd#rAFoUHeJ-yKG0>?Zg>){Wdii0{;@><(Dy1NT+{5>dZSl~rd3nT=u3G2snmGKU&Cs5X&cTPSdBJ&wrJF? 
zWUq*&QuP>P0FfAzsmaU5{Yw-d8+67Pg9CLMdMAyB3b?3y$7Xjc;OC6lpET`8))Sge$>iY=YJS-SxfgSP8HqTq&44%%O8{*g>N_ef zzH8tzvP9KdHg+NpGM1Iv?^h6EegQ)B9U4Xb$k?~66=7V5%&d}lQF8e~1=A`+iGU)( zcm`$v(Wf1e^Pfb96JoJqs&Ti$^0Zxd{z5?{#xmxIluoC9+u(c{x>v{Dy<4F`=f-Wc z_l3+{=n{>Q7YMIBfEPfCnqKvDRXDT@u#)4N5#^m}c(Pl)1GcX`bh@NUDt30}T+sEV zM{{nd_#h9cP^f@U8cJI=?-Qp)mLd`cqXMEgK)FPE+0l6-MSKh|zF3RH1)&KMSI%KP zCJ3fIg>2)7Z7`o;A#*j~(1j?LP#5|8tcB`wpcD_S&X0De5GLalhAH- zbeF9m1KOoqClzydI?SIkbmnUF{pMvhGv)u)qij5EXwG}Z`bm~~zWGAd7-F+$_Wp** z`FlquF8^%>|B&lpLsXq0R~RZ8!E;w09#pL;^Nz`02|=rFEBr)Qw+liCh7a3c*sD;H z;15H*(PS05l;*%|dTYtD2kdP)k7Q2LUC-jK?=cDzJONYs?@6jdeK!smWS$b+D=^!~ z_7!EKSe6Xl0YgP-dK@T>F@V~E#Vo9iUf8+(oAMDUl|m5Prr#^=Ill3^ssXIXV0l$y zkHr@kEO!rKrzQSDr)T+5r>A)9hse_f6|p62p z!4=^H=8+%M4HA4!%d+ztZ;-i8o_IODfV4Y@*5yA-NCvS&-uTB=)djjppcg*4lwRWn zU$lpn{4}y)4!?_qMZ{Q>axbntofbJWMkeMLaFeTnPXm^rDt?s)dH8{`85F|?vr{$Ecl?nzLHkVxXqEBa%7D%1g_Fh~X5Wn0_8 zUUHjhdw2}?r?oh$cVcm4Yqt)qnEThC#d{^zB+-WFs^q=V(_;FRmE;OQ&vCy%)--S_W#^x5@& zgYz@jwM+A6?Ay9)Ik@-MAK!Q|+>l1=wKqi`82w}-^1!{d6{i;ang_IV#!P+Pl7ZB7(~cCKN{l%^aNbbQ_!p+$zXq{uKIz9aLXPr1E5-jAjsO+L$*bRgb!$% z`8NGKCVV$_4KR>khjQSlCP=%;zj;2tj`Spz{F`WW;H-GT#u6N|^*?P>=>4-DNSzqEz1r~MqqXX-!t zqTFhg4s`ACORhm?m-%x1t00E#hJ%UPtHh7e^>4L>mDa4N@;~Ibu?lS-=>&`%JEYh> z#&^TOv8@P}9>Sm(E}|C&?PyXk*YtIQwYt$}YKx5?D>8LZM#Fi*_LJ$~plxv_VH*Tq z5DLrBg4@AjZJ3D4RaNLDA;`+)dAixEvI@-I+JzezE#IEhE?jH~&xgk?a%uAOlGWXB zJowA|3I*u2FHa6@W*6|D=}%xrp5Irvc)>$4)tjThj2f*GCl1W)TMvu8Ry@`M;A&zq z`t$nAHEwu!gb-)2br6f8a8j7m%X+b<2O3Le1tSd>d=nvMm2VFZtTRCE2}*$GPgpa~ z8$iMeMd|BOA8}yP5Jvx+XHc|bK%QZ3RUObU$V$vX5ay)jsMnEe3?>C2G7^EU=oaPh zLv_{oH8QC8059Ej;Nk5f!)D|4igkq_mDb-fkAEFx`Q&2W@PSRN!Z*)4-VwwwW`fLS zSPDzgi3k!}+}Y4QYyYdOgpgiiV>fzhkDH&zUu^H*|eTEfPp4pbTM`%Y(u$q#b<~m=GZ1fzb&;~FPgy03QV~YcHpWdg}>|$qM}~+l0K1? zjd<-sT{z~+|7E}=I)k4h!kZ}W$VwrHhlq-_#O@%=Ok}{M*Hg5Hp@js&2|h=)2Mk&m z@zi`cS1})hgBL@7oF`~F{Z_m~iYROl2g|wDKQ~p5)h1O*^J4AL1C8fr!^}E`S7Y=qTve*f^R6MuVzAe8U)yAL&5Dgl zWL1JGt{PiD0WWiMMRpt4)65wp1wJ%11-Z^~*e`;2e?T>qZNQFWG*T?T*zYuV58UVSk&R^) z&5VYQU$(sOQ8--?S!h{1Z29gw(bgWuGn%c4YFod(XzX!NV)+sKWQf9_1dWNp4(k#W zn?&Gh$WGm&uAwct=iD(VLE397*E|CjGzaGgA_isvwt)|14ihAvM>lcSaDmi8yRngH zJxxL?7rC5iU-smld;!|Ed!>#d5Or2(tIV^WpIStJlSuRK?2^9zwE4ZRbeQz`q6JM_{< zFL68%(0fDz^Zj|5Z`x7yhK)7;x!W;P<}p1)9cp5|uKG2a6x86sig(+(db&XYpB)?i zawZ7+Ya!CMkpR(%y4+Yyui}Cn8BYprqL;;2_Ex3~*YtZD3L#VALE+5;vHC}KD zLUM;}(&pNWYKqtPu}Y6*f*f_$z2}lO*_w55{KJ!$`Qw|ckM1buepI=#hFfQQUY~f! zfaL#FCDr~UsrFOKP`%-7Vy^I?pSZV;jV{CO2e%vM;h zPg}|AdQ{f%iow3i1i@$ zo#jnRWg-#))+NTXh4G|uGaU_EgL1}fDLCfqW0MP+S9Wg=WEoPMT&zbi{L1tn@|Sck z(PDHZO4%<^(6AL0^7oNHXi%`6i1U0gHJHuvA^|ia4|^h3bf1U&7{Qbtg<20-)9d^k z>zBB?D3Zq0Q2hNWSY4~+aSl0lZ!g!V{y*a01ghyfZ5x&pNSHSnP6tpSP+3C2vZ__7fM~VVN?kfY6;d@(pi`*&w7gaY4NzLG zVi09dzU%&j^K?3$=RNN^-}%0CoUtl||Npn#%XMG(bw_gM*JeNba!_;MeQ4sEyX5JO ze|2S9$Rh5413k#;^}b`_^3luZd&i#nYHm2yWIroxI?hP?R8ucp6PYi8YcA#M;905N zP^5GQ;y#qf6Wq8}gRhT0A8c}P?BCDN&zPBf4 zYqT^rZF+JjYvfv1Znx%Ex~5pGzVm8t)@1K=@AQ+Q$h{swKX;w!msJZabX>r{*eIwy z393m2P&I*VF2aQt+&2J5#OR_54O+i>n@z|`h&u7< zOVL+l;wfH; zfFnZD0nKr*!MWYWyH_CO3rATRpqvHWO^oAVIx875pqcySW$Il_Zem7}7JWH|O%d43 z7^?B2EwoIYp&8c>A8o*vUF_91Xm6xy8fxZV80p3E9PQ&r5bT=PW?vta1Pev>LVKQ7 z2Lh)KsYBR+8{IfvDLVc5e1)Gkh#6^k%NGHXK-?w37@Sh&{EKDa9<|y7*mQ@m)_R4YS1SBY+6&Z@M zAa&H#0+)fO#Bv5=sLKA6RJ--4z-TP+%hOEiEm?jE`! zQ;#o)+pOsS;qn#$>U?)kKiJIE>qp1ze2F?sFLDNAI#4slTG)4p@ zP8cpELo@wB>@Ui>)Mk#)M0f%3#UvGA!Qv3W0yFN8*)05Y#E=MN@Y%TG(eB`UNYVz! 
zDIWP&tMS!7>eZyO(zOrLH%txnwmkXa*3O#|1%~zktcp4WQgz6dqdWP+0hth8Q5XkP zYFg&n0bCsY9)4iwATKF<0RLTGWNldHi|~TyhsO6zTWQ?bX zb{i-qBxd%n=s@Z4Pka#9-Uq5EPOn}`q*z+qB*lIoQ$tPqD@)7ld^z3Q1@6?nAdcS|su4qI?_SB|p)~&S+|1@gTGPb*GY9=51 z#@(+LS(R#42`a57++2X483l3vQ_d8Ito5N-*yr_GGACqp03iUlz@eL&r$S*qwgH`& zOlOtX+FT@a2t5+;T#Tm{Axsz3C()+PbRiz916qi{lx71}nIa9tw4VYKjF_dvjivbS z?XpB2QP1T)x?)2TwH&{13%071#s}ry#-uU|KgMP^3K) zW0dJnF2G7pmj$kK&~1wTC`H#hE)wPBCz#eFIcO%;Zc|qdpC28BiAET=Nrt}~L^5mY zu`DXiScT8x9F9Yqxo#fHNO2hE0D%Awdt^()csEGe^|)ild@~fj`>sbdIf0CKzJGKv@H)0{=k;s-{ ztC%SzMB20u00tzSDIg(+gw+r#`V={aFeJzu;OI~&gGqx?glQy611h-)4t<6UiCDuK z{?>GWk>|{K%i*O=)Q{`TB+lVZg#>2eo_k?BOM*wgMz*ro5`u+7)M|5-6__f@d)HkzIL}{NuwI;=!hZU_Q z?`mB^K=e#~mbngeGsQ_lJq8WYM@WplsrPC#&y_8J0YuJ81s1b#G{`gd(#O`A`Jhgw z#2(_7d4TARa4(JkJZQE}08Jyr4X;Ky{gh=|<;2eyC*_mMQiP1cyPfEZRpV++-f6Tg z;9CZG-|%XSusUP(N)SK5yE0OzBdT*L3>61m_Y!-Se}XCckL+>0Oc9XAyKuvdx#ZXI zhzpW1Ob9q6;;c@(7{s8lrRvt{WJ@p#k6Ri9Lm7h!NxT#*V7`@h@81 zmDg26+G>+?%FAdu!L&$)U#35T21qriR^b>We~Q&UkYq{xK*3fihU?MPIDPM8DG~Od zDo4cvCGs#t$Vv7*kQSYoOba=1z#3Vdt}>YJo_V2RM3I*>DmaD6E7yPqg)xi=laxgi zK~!4-o>LyTY_I#r&RoJkU4?mNr2B=tv5C)~?pu-F*V*8!DXi(047HR(3XTaSrY>wB zdUdpC{F-ZOw)R?a@7UiFQH|i@WMigyQ3>Xwph(gbQi3j$(1f-?wHwY-pd0W8x6B04 z!T}72AmH}>oL&@<4_5*KREXd5)}Ysq@>+D^dnxiw0N)6aRfi*Y!#H1SGyWk?I0eA) zZeeb=r=qQ^5VRu$RiFtU)xVXYEoy1Po#U)W8 z09Mdea-FcxJicnxDKsQ;T^E|_px8^!%+hYJP^8ONcw&GqvU@kCXbbY4&XtaamY)*k zncbz?<0on~{TC;FcZ}UDtXZ+M4T0X)>2c5!{wdNskFmC?+FvTeS4<+#LID~nZxFL9 zP=gU1azJnZ?j$J(`9OA(ho|-`0$4%qU`=R_TZ4ca`66|dlSA^I<*1{b496!5<>Lx- z{n>?Zp;k&Z> z{X3J{N}t!yg@IIJ$SE@CzH#C42Z>oXyEjaYJ@|tdZ}8#wJ9?W(J4-iYKAEg(dAxgN_P|-f z+!5n}7|9sqv!E-&x!1wEae2c{WWPc5hL?b!2Xu*sQZz!UWDt1*zV@4jJdG}8!q8oY za{}Xw>5Z@krvPUFwgdx%o_AUTzNg%bJcG|EM03CG$XmjdZkg%GOJRDFa4ML71vld8 zV~A38H52M-HCM4?@JH)YWC2I#KT$4Bt zlSm@=gj=AxMJ!^rJ!L~Bm|zKjyjo$xFEf7`rdEL|(!)oP;UYOC8VwhPvJ|Bs&Lnd% z>tQWGP-SMlL8VNZ<5iAc>Jp^B5;G25gJ3J-A`nA1DO_-xE0)A74ORcE&%P6X+ayJ| zE%XZ9z&3%7f-7`Zm7LaRKW>vu&w_&^PV;)SHv!5_job@hN+eykp18{=h8K)((EfnN zoEDp&>4`?kWH`XVn>MgD=6JLiwRWus(tDIQux$YA%)>31)>e#GQADu#$SK7E7{Lyw z3K;r%P;R&8d3W7Qgb8QvRecf%k*8Q}l=ci=G>Gv5#z7kdsCzR`2{mAhOW2IKj>}`I zwWsxcUp;8kpZm$stxfwJI#!izey3=4thKY>-gR&OXNcXefi?c_++u@c*Q&oB<5jPA zFIe$nzuT{?%Vb})8Md3$e-P-jwn#^N`Suf^q1d4zly#HNIPB~O$piFUT-ca|@?v_9 zA_}wjW>5UwlBk`&RrBZViFZ49El-@1XFs0WwcJh^xR+<~Vq1|R^n5(QFEK>k59YL+ zH&`c1S-CU^Rioi|pke`Af?xC{5cxGkQ5uI2;M2zTB2Y>B1yikY9U(q6dJ6{_p$T+R z+HiRCyeM>}GYq2y;vS2e;)gJRnOfPcFa=u{2KW<848=nw=QN#L6lho%sJ;ixVK{~*7&m?V5C~{46*qgxBK|^x;{shwi)N_Dgv-)K@@JFry zQxwZywlk*MCQU};crOWhQHp;tr?jaPP?R$SnhH#S&-fvSI@uJ=Bj^`a9(R)Kx);=+ zKwUocX;F$4`EX(WJJAlU>!K=BaV*sDs71QEkZyvb6ShMDJ1#H>3L*-_rzaBC?un9~ z#Hq_&*<+2EH8QH!x7j&0=E3B}SNrmJV6yj*)3L4Uj~lezLo2lR7ifFsJ4X(dO6Hm) z%{PqRnd_xkh9s6+J(#771N?$~h!|Q(xHw3C#J1rSY#`Dg$cSJJs-_UAF&`XB^-}Kf zu|}r*4LoF2mBudPks0w1sGuNrhZ;SF|5QU^!brgH#AD%5nT$%L)SN)=I|O3bE@tXH z2#2c+gD~OU*54NNC$0X{A?rt(RG0lQgB+>2Mr2=ah48G6dr}936O4 zPM^|ZYB;PbYd|TzLDYS)E>!$9q!Z0M&IViADyxh zc;L1?>2(HN6`|Hsak>?LNOq~ElTWI-G$53ohe>!DqvrED#Rwb}$l%XNV{&#G;~@#5 zya8*}8v_(}jD-pr#FOR~qZUnBF5UAa8%ty z_6B?q2sv$XXI{7sOJzwc%mHAl4Y|K>YDY3y6s*vAQEnDe+6hVMXZq+ zu`SePv&E6@<3@E}1`*J`fc61Y9GFi7`y++vZ`%xk0^sh5f`F4ND*~9z17H%fHB=m+ z`on&dk3B+#wtRmkCP9#nAfuAze-j8;oKseu1h;Y;E!lmaG@F+Ghgv4bcFkLnJ@wJ-=3HvQOx}kY*WR(tZeV9M z+8?2a;ck;PqE1}~j1C1H0Qwfrt+35LVaf`3P9cioC*p?P;AI#m#3WHCP~8i%2AY8! 
z4-+!Y^=b44zmOVe5YJoEDx<=3J7Q-!_7&mm8QAvQ99(`97yxxZ?|3z+BF$2i$7BxS z3{sDZvw7w^8YS5x~cR`5<1Kpa>};_!=8@n5KxG3Ko`l znVn#lN2U)=NG^=rj(wCpc7L0%_Oa8@ik*WWCq9qU)jP&D%b}UxvTuX-+12#KNrijX zvzvo8(2%dDpp=88&OK@;jBe&$nL{Fv#$TW;7muuy)(wz44q}W>+Tju}@b>_oAgy;b zMm@M6g&c;+Fudrv_ncWSK#+$5WTNNW;!jjz0_?Xky;0EG&TBda!A~#(@IJKD$4I=H zaRfe}Qz}JriT#JYq}4koex6R$jy=SEaf0r~L6q%jTTGLV=kQKPvpM|Es-w^;mm;R( zu@L=)dAIG!IDk48FD;Oj`C>C7b09|DW`OU@-gLusy{$+UOjBTEO3J)w-XX%4x0@e#gs5s7tP1AU68v1E6N_D2RRmKZzKDaLRKC4 zHw+G+zM|;j|Cw^ouie&Wh$LZPQk8Njg@Nd2u zm1knUK+DKPqi-wYQ*NNX3!fr>-7Q|dN;yqNPc7%mwB0(SKjx|g;J3ZSuoKdXmO)rW~fpCpvc3Y*G`Wi$&H za%r$(Gu?ih%@awf3BnBcnI@pkEJ73|wN$TC9g}AabxX9*j+ORUG(EZdt%Y2>Yf#$- zPqK+UoM&By%+#~W6l^1WJYxBKZ-7;w1BN{;;$lb&fkvP#GRl_%kVEW1&ML&ZUt}O% zQl0~7%C=j%znZPC@+#tX(YVn-`0hFmYL`6TUd6H~@azMjT?@;m;|-ygYDFNj(Pk5_ zT4ZF)W5u75?MV>nDon+%EG7&y#Su2A-KpFUf=RUGj-5zSG~J^I>W|+CvW!Sq$X@dt8Q>P;ga*Fl|Kf9?Y0vOuC2^e?o9D zZUO)tm>8Nw2fa*igWyg&8KEBpw;`1JtU=c=wRj@TOhEhz$2;XW!+**531cWmWWN)z zceq%P?==rtgXjk(dx{^xJ_2KS&Hw{B-~iN2t6CDL#~yYqY;k8SSGHEa%Qxd}PILh; zye%L7J$sD?M2i`%VbbKR{q4r|%b?jKTT%|VJ%X5MxQCTqZAB&=3qvywFr2TvDpQKi zV*hcO0070RZhqiZRF0r9$hyBMhXtCL^WL0ls4%Zl73Dy}GNA~F-aq!l%;Jx?Br-u) zr)BJe#Oz<1|D7AT`t*6k)uMS=I@J0M?Wey|f^q@I0T|g#VRo^q03|rGL1@F{fb>Dq zME{>|1r2ykW8!NMm7*(K-P54aJjCy)Ht46ej!oO#&U`!{Q@7hp(@c0_1ALLlMTZ+v zYZ3u?xd2Zr+5|-(rz=uhSNv5RBt(uf-KyL&Q`=KaKmr#;3iC}$ zz_>|oyA|a`ND%=e2IK;n_byr>;`4)e@Iwr02lq}^38g#UDMGo-m*ajAEsrGeP;f@9 zU)U6UN~8zL-7~swJjgd;ZibHS&Zn+j)8E(hs{h!TXqz@*o>fc39m&9g-tmdX*2$?8 zsZ%$Qs$9D&&j*kWgl9Qoc#4e{Cz~hQ1B=70M`Di_An5|t0TuL_DgD;vgN2eLD5N}C zhQtjSSU0FR321x3fLuhFl}zU7r3eAKhgvH1vyc~q`L=~hdQ6Ruxor_fn8rX7ACx^B zMnD+?834fZ8RaK`)^%7d?NiK%{FY->LTV@b-iIW~+6Qmp8O(zs0nwp`$1(sPA;I zP>0oIPmB-BC;uZ>RR9!|saGRnhktZ8d5&B{wGR9s@oF)4A2}pSsi1RVX~h7Ml;-5j zWm^!l%#co~iGl2xB+eS<5V3EaC3wv+Q9uor&15J{dII_@5&IA*V7w|NI|3($=7-L8 z2JhZ!tMf~keEh0*M6T^_Db-v9J7}eE)}t=X;9T^RcTMl_*pID&ZCN$I7Y;xHi5G^N zQQMsIUC0`7I(E6`Bj#!&5hH;9W>k_lM%LPf(t<7az#H5VFiEJsjhsHiKgI{L=w&pA z$*_zL8+_uaA|$=wqjLlBYB?uWZ-A2qDgu~>6u~HErdE*AtX4oxo@A65?^n9A^K z&!KjidST+AR+9)?TCAiGLv!kDR*a@-pX6(&ZVYNx4XpAz1Q}h3toFMxyW-~e-@Brc zfl!Qe=SDJ{81h8XIt=CvGA}UaP%HJQ0$NFYW9I&&;*?TcOx_*#p^eTNszBZFZkRmY zB(Vo{SmCDT#?nXuqR)iPoEzC zb7J%4y)Bxl%QY7!e;aol8yuMIy@08w8@5m1vbmG>(6n{rCTgUb@gZygiX(o~v;Vvs z9R60|l+SMk(^mzc+(1+UBL!1Mu2{91t_wUMwhD+Q?m|}-Zb(-&yf*;*7 zm97sXWr%nfK?R!Yw_?&MkQ9jgRf5M`wgmz4DFccVKr>VXdQh%9Xy(Kin;Cu%=#Iq22aPQRUG@y`jjlAj38aF;JfdebF;E47n*b7FLue+k z4xnNO(~im*=;82C#FT^wLHrDy1|F8a9se{_T;Kpf1NMS4UnB%1iSP@W!6!t?+lQ$g ziIctB2R$pbg6C0K`oat5I;=CIn6wD++iH!>KgFP{q8y9_ut?*BBn*yfyrEj;wh`b! 
z^qM>%p@_g76j7+kZbeCetk$UFEUpm)7$LHu>)_V+32~l@yYFQ{S*Ii3PJGKMQm35Y21+ngacq$6$t~u1cKNsAvO^JN1en2k6xGW?23J)c|bR%@Sii3 zV-J6OQ+k=3(mr!jKsk&T6JAJyFR1xP+@`#a;t6n!7zXx?zBtufsG13GG0HOo5pFY0 z4h*<&jw>?TxQ5&X1Ojs>ToZg52xy+*cu?QXEFo)#k;5S5fHmvT@sG8FO=Bu?ENZqx z)JNn=Vu~{uSQm+QjsFU)Wdc~sES^7v&={Mdr_&LU%f_$onm7?LwgR3-2U0EQ`YSLg zPW9q$E@s7YK_5WO%L|hdRSFfG{TMG|rCdVNRRPGN;K1o@fs&}JI^tD{Ja7TvEL;4y z0_KUJ{Fhi<;P#AecZq5mCFY)pLvF=8sWIwXTLh-lS`--ov5`5D{i5gPMtM)c&MAszF-Lxh%};E*%m za0tCXs)@yqMl?xu5GUYW<&g;%;kY%0fQl*bZSv#s{gINAK0<>Ak7nl^g5duqIJUT@ZKK;f_6fA+f+#UOEF zghn{UEc5+Ux5c+~O;}RGpFVB-oPFV!mjk{~cLc{49%C9@x$+PphR(lOxFJ2dojR`1%C4*RXPzxDo#8mI7@71K-X zR6tn}9@4Ud@f%O*o*VvuF)Fvzhtvd(XzCUf;SGqIh2VlFEx00baNWw0l08SVm(I8% z7$PV|Vjc%d;+SkiD};|l9gjpI6i|%>3WVE~ww6R#XluJ?wiYsmB*cxaZ3Jgn5mnH|Wj?nvvhJ+SI+td;iT7V3;42gMCv=kUO zap4Hj=7*u3iO!@b9zfq#>D*?iZ7;_HCK;y})C2WSF_!?f{!D7DFK(Beh18Z%{8kBa z3Z#R$_)4%x?7`IQgDY};Vx$FDlM_nemueXrjzH4^JPp={9Jmrvfbu|bqzA^FTw8PTbN zhB(PdfU6D*4V`Z%fW}nV!^-H%Uh}EJpePapCWsL;n8yghBw_)8`4kike4uxM1c$gd zxTGm)5WrXl48wtb7lARjt^tg5qfdnkBD=W-n!sTsB!t#B?5DTr251)ZvK6(7v5oVF*Vwlx2R zJPeHT(goTAzCbsyaTpcsO$iiy(P(ZIY6mZ)dG9jXR+xPlY}Vc#oYyn)?Wknx8}|*{ABMG#4#YAS;s2FA zD29a`kIr|B#$9od;M%2Rf+DC4Mx$rQsbHjE|AFdg43C4}%`Kc$vp4&ojO&oj#RN8#JmGuywkuuZWQA1)3QqoxjBh@oUpz-O&JHf1ErK+B zyKwtU`y+4PJ$X(#UIuau0Q@6`;IuE@}0q`E{s=8^u0wj*=Hg>-GtiFq5we}Mo` zabjZ2#97JID22|L+9G8pL*$$!`y?HdAp(O)TEE-YnK=jXBt(V-OvBY^43BQxGIYhv zj7m!|MfDEFpRxHKNPo;Fk+>)o#iW^sJ4CqmadrkwLTp@hPto6e~{Wb^6XWZr=vQjtDq;Fe+Aqn>U`#e zVSalWETlkJ)Q?2C$TS`{kB@djz~QJGLzElkB1FT8*GP;VYNJxJnE+yt-K<{YvIf;L z;H4ZGAQHc}NQbzm$o8fJW%ppSL}v~k!4U;xQy|hqdhjzZ-iY+e>CMV3IX(s>*eDbH0CM88nW6) zTc_^-a^skUq*kgKA%|YjwtQj{#Mc6w>s0N9V?aHkxDtFP939jf_XGb%ui9z`q(mb$ zOgnT;P!ehlM71kLnTX;Gpl52=!vrJGJZUO|GF3C3HQWbV)XFOj5egw{GDfn2LrRrr zDoGNSZ`&D-RxiIGH5ff#HZ05x_1s&Q6H$G< z+Q46je;K_a)Jp-lGDcUR-wcySjP_lyvpD>bLA0Q>1tyS@BbNpP+|~#cMN_0D#dRl} z7Ezhf_7sh9p()EQFkG0L3(bi8fc@Vk*#S|>ypu{WR^%Lc_(ZxUQAMZiE z?C1maP}pc1K|lL`ZmC9-F3%derauJ7(C5C;_ovzQ|D9`S*f4$5S2IcEiEu6q9XT`9 z*6}6~@80Bkyiwuh5VA;@0{<-*3?nlD{RL zdApMT1p1GlgFQ!$1deHO44rAs}dKYXTEu{ClT= zwociMb&}07|CFP{%`I`L^}vX-C4K^7vSpai7$z1MvP}j#Nei8{MQD&9RU@pv6AL%l zQ)YYF5+Zk4Hhc}ZT*Hf0RyxcL$2h`9`iV&Y;G6?WngVga3oI#CQGPkL4qLVLQ=y8# z7`?^_zaWqKuDT=+1P1bh=rd75dVtRb81Rkz_^ut$MLE`>o(#8X)joz?BGiOu%#p8h z8WTDm|B%|M?Muv7U*D2A+~O|ZJ~8r34T4f8S7c_rGIQsoFg`Ph$QroUcKANT7+5X( zgP;@1T(cvZlT?l<(>VdXgpJx-ydGFxu51Wb37I0Qq|lE(X$gD@Rv$1w6=sAa)=fI8 zKpttnG4t7i&MHh_ob9MG!p30`6lo2hQOhjl41DNMv2fG>xDv~IX0OBvR+R(v+2(ID zujM0+Li(BJc)}!Nd>wrLaVUgP-@Ot#8<^n1D{KNs8SOj3>m#et!Bf0s6UE4I$w*3Y zib4yRGCcl(v1k3>jPymM`0|6YR0K|&lyGfLPmj<%sgd7CMhUY;gq5g=|G8BG38n?QZ%FyAGwBM>rjDBW6O_XBrpUB zJHNizjVZaHs&UFdiALI;h`D|`-he}@FyRdavl5jkSs7mqJ_zzgL_j<=DZ0dXmQa}y ze<1DA+Yu;W0o|fXG0p;X*AXbG*6HT$41m!E&!eL(02F6@<7b$4gW)kOl5BM*|t?8O=cQ7SYKXv-2Nng#tpnK~x zhu+EiCys^JX!e6Tpxn>F`DWDYQKdrE8J!cFK!l;0tN~sK9uDm{=<#$}gGkT~YvqH% z1!Uc6hL;Io;~^6O>r7eqyaL>4R6v=fwL@nib>3t5W}E};GDs3LK)r~XdS={Ar|LY> zph=3q`_RtednuUZ(O)@F4_@KPKx1$7_UUi?w@@+w8ic4ojnn@UVYG&AhRHbXScCvu zK4RG;)Oxn`y(p78vs?heZ#?=xfxE`+Q=U%UP6!+U&+7jZ#Ba*aHhm8MHGW5!8UR1n z3;r|4B`7M^LpBCObs%5B2C;QUXc_1uLi`xenTb-wWhQ+xdI>aT+>Sl41tcUFjL0e^ zN?s!7*r*Ffr`va(uvj=fJ(NA!Q$MdqGNrzn=#%|-^4Pb>yPm3ACPz19J?;pXpv`7# z;{NwF)9M-&dyAb&oDOrH(0bv6TaV}>p>4)up=6DjnN%(nD$nAafaD89bU>&J_YJ_L zIfSMiOa;Q>5Yh;*!vJBpAz<+TV>DNQTjn9gQP(#G7D*#eo%aLOKjTaQz~stMTx`WB zVXMz0*oCP=+s#D(HU0m-7t^!%VtJ)Q0P+Gis~4r5v|^~;0i^}|87-flNZXSHBGK_2 z&m<6^V8Ebjd{BkCoWi{vCrAJRC4tmdWrH5(B+hT7*TnCIAmyM(cR><7Gzb#S&OB0~ zg`g9Kl00J~90Lzil8FiwLeLIo7}G^?#FDT`Ld=5V#|RRObDA7=?cogz;a0}hd8h{V zOG^Gol3-n$cH0;i^w-L1dwNc?l=#4*H9 
z4CaJ*6DbW`ESOf<68Kvh+~h=0j9*=#CStt+tZYGaj&>UC2lcZAsnNU1F!*BuSAfdK zqEQh{FMApvj27}7NLq=gru|6kbna=^14$Gx>a1-c_;7MGz zc)*yazHfo6C+9;N>0fuC%>+^Yt^RdJ11B-nwVF;452-eRm58vK9nbJvLr@!_)J-fv znURJb*3)cK#HbN5APrDn^BIt3sH6s0WBi>-H#8ulz}D8Sr;J*%`SC5;@`V1Z#fNu1W!iOzkk2{%PtM|@e#S_}lu#DFcmh1i=0ToNG}x!90N zm~;zn58_>15N~Iwy5%+5o(il+fnI8rhMxbYa!VH;DcD@xGy<8^B$G;@8&51z2l+#AXsy*0Wp>#+uN ziZm+q##!TY&SL0}eDppfmM2cwOsl$M8#KR`YDdNg6MdYwVnzd&82Doq*8)+Uc^FlP zDcK~N379Y-mWpW<5yM18J@zGdB08cB(+8we_+4Nb>zM`>rZPce1Az~qj1J`vW^giC z!I&?F32nF=NX60Tz$^_QzTP#u(3mDdDx&r#vLbE>l{{3pyk{Kr^etp@GuzOh`IeIC z@pC$9TsZRIAYQYKeRp_7gcH$}3EZ1wNVdH|rn< zNt2Pu?&rqMN3=~DZ~@YSVq0A@^qAR|u^22XDNUgUdJGAz+S@*{s>{$;daBhz5!BXov$^0{2s z*7+E9WhmVQ7aU+9orClMYmb@nhNoFLcD5EM2SAG(u?Tqb$nTJ(*Lp3Ho|U(1FNJG< zA~$gU58XiG^*_oMBok9ly0m6o%>PF4NI{|na0k+?fT>4DO_3;u!BY!9JaVG~h-&{W zl=x~;0EeSc@1s6KpfRsfj7tax{SDpxvea0ZHvGb>hJWVcY4$wxJ;f zKXLglbKNhz_Y^^??=+h6V@oxux$=qju0%~u_O_>2)=RPdFizNkhZ)a8#0G$@Aj<1U(symkbeiX*eFJcX)H0XdCR>t;@L_@KZc|AweLU>N=f1Bj!+ z7JnlGFp$vJc!Du(1a^RZ0Gfe5s(NI(o{xH7Pw+LljDwGQ&rckTO`Jd(LEEKO%cmqz zsCT$K?-52_Tm+1*`CdD=Ks#9z%QAK;;H9lDILWF?(M zafg6 zj=k&sRmQ`o?o*oe7LvZ*!3Cj8j%YFH6*>Q;Wj5~-SDLHs*nZrKZFA%Zp4mpDeK4pS`#XBH4I$Qc9L>?uiSm5MMF#WgP{xWtAAWz626uzt&el5DZ0Z6+DFZc@U7nc^T(~(=Pz+5O~5pX*cC_&piEo!E# z7+8$T{#21ex4C>3o}7fr&{YhkF8&!{A=1ONXbdZIGsf6GPPHpK6G_-Xxq$cpS#mMdH>Gxo(t38r>@wk*|YM(<94U;Lq6kGyJQYsB{=U-PGz_U2l^iQlC^*K_ zMkD~`SP2xlQ&mer*A1kD&}n=vCjq3Y1SiC^Dm0&(JMmX?Nv3uXEs+ZM`qEy>V5xlc zON^n&f)4>AN$n7|HlMOg6>R{rse|UByd}r?GYq{Z?br=HG+L(#Keh}6YY~jQ|i&y zCu6(Pz02y0yEkYr^=8rHk4ML_1<_KOH7L7QFShwrx0i)^R8_L7UO?Dp4^p}LH*sn9 z`bZ5+yiB~Eh1iXfq_$V$;S7mMDiQ)O)&0nbKtE{OBAVJA#$CyZG|F6uaYdHqwQ$-5 zyfPh|DF6jL*I;Ob)gLUrA_Z>sbRNWQ;{e2Q(deZ-I;L7Ad7>xZ`QX?eS@pC1op)k~ zTGc;2Y{-6c?@5CXN~SeaQ|*=E=pUn+%|KLw%<^N#Dwx$M3w4A5mCW^`cnlq7|Fr0Pq+uBJu{Qz3 zsNtRxEW(WKdCwPp3{mSCMA<9n^$Z9I#ZqwaD7CYkqhU z(SsnRu80>qx(WqB;t*riHkLtI7&cX!(}eAVMTawj)+nF1xEeEy-S~t`IX(iVO#ICZkrr9@1Ff|ed73)M)wVx zi9+qvy{;A6ZjhBw0LHAebfMT@Us+KPFHV(ZgdaFOZr-1HA=;<|vLK34Z*eJrmB1!5``FC#qmwRjpTKP{LZWE&%UU3X0YC6j6D= z3*(kCd3R!A?TT#e9p8zb^cCZG|HHWd=W+S?75Q+D{ORp24U&mGnNYXiWs`H)VIBN{ z4#fEJs9$3Dk;^17CpG#*mk&iellLC)N1^77DwwbcbRve4n*}$t&7G5ifD}%!!c}#X zFqV^Sj+HnDPs)+eN~^tdb=!`zKq3|M|7F=R#|3Ja7*mYN*Lvb8u4}D(_C@!WyZchJ zM*hf}pZsdu==bMSFJw-h=-oLsW+pOp99delwvOS23S|(r~kuRtD=-;FW|J)%j^*UkZ=dlFB2G2 zrzOr8?iS+Ocx6LC%j0Gk^C#D|8VHDKjrG(1@}O7Jw_H=)NN0VpX5^P+!A9PQ^$V;8 zI2Eoa8n+iEyh)YIm1j_-fL8&fz75qBfvd6qVmvptSaH4z#=3& z-u8_g^EVJDBt=`pkh0zHsJFh&43Dxjn7?*TDDHbzyILb$G6;j zv(#I9Cpwn5W~qm}LGr&`^XHwbz|bRUNN3a+0MjCwq#`t9)nS^D?>K>CMGeW677j@TvJ z*-GgiGgXwKu1;)f%9zre_Z{!m94ys7wh{AIc4;v-bFAhVrar^L-iAirD-dz@c^|UBXxyk7OxgYIA*Y2onk7rJy(%37M3nw)e68SV)j-Tuv3) z?$hOe-L&|W)PbjuGT~w>l*hrd?zL8|wf0mGaS}9J^aKs>nK`-ryGZQ&yV&^jy98tSs+<>`*#6M%j2Ib5y9;2IEqk1 zVGc{d2)aOI4s6hhN)?2;SruF!t`odNywcsFeoQIz+73B#_VI&^$h@TmR=2sVa1z>64r0zr zQF5#xqskOAp`*O)3d0^n5D6d}C3;sMl|FN-kcz0^zg^{Q8B*TwW=Gm-8V}c#R{0>wgoFIouk;x#c zQ)xU#zyRS(Esdw?R#w1XoCTcbvld}gAOIg_!+eOopw$;@9u$ZdkoVe!D}lIzVXGi$ zF+wYmc?5Y(#LdQ7{L?s&${RA<6tN>}x=2kcSByW_PW5Xwg%mrgSNcwjT+?37^?9iJ za(V0H$yCJY+w}yonA>&8(UmWZknNhYrxH@NR;T+vL&ygV)(-98NZ2W9ECaHHXaq5B z*)C)e<*yOV!34|-hZ;K$kmRy3tZ%FW_Hr9bgoaaumStVKid+`0x5APe0J`1FeWVxw zXkaCas}IbLZi54~KUbq2h>h6ZQmtH{{mb7V+{y$jt=ebPJ4Um2YFEreid_d^O;2o@ zymGKnKJjf~mwfnmZ^p>a?Uhb79|#rYpqLFr2_YRmgEFhV@x0bR)K-J=gs_oXBHlmU zFNA4Cu0tISE18al6V%ZVMH6QbnqV9O#4_PHaYeC9M5r(Eq5(g!{tuGcb73=cQSyD> zVG@t}DOH!dkNh?`Hy=KS7)Ab+S>UfI)r>H*0wiNZ;3Phb!?E3KU6f;XBF-fM838!Q zcEtL$3JWsI{1d3+!>g3#=m>(?QABl{xfsM0TmqyC`wfr>z{p>r$^xZ9c zTpU$m_zECugnlT|P%{wRx*I6500%|2ai^=|Y(4S>fT69e?6&{XcxbYvx4CKTvdz@v 
z^M4Shr2b2sGT{p8l|I<+1f&|z$?%)ZIOwj@aY3{~)CD*c6nUvVZwr;6n1B(-mfXKI z5u}@>ws@EbSQ3z#1obS@hm_{;wT9wYG%9-pui)S8sML+)PKTjGz(ZRd1uqC5aafp> z1F)1h@fe&59xB>{bgKw#d7c;je(7^C>{=GhH;T`MD?BS93`L9@; z@Y$S=AWQhsE3HBG;Xlz_UTmJ?fy=}NqU8`1Y;CTq2>!6d{kCd(;$!8YWa8HJ&!zuz zw$DC4+h(Wd@aN2O0iK_8i-mul81Jgt>4KmH)E-9dP6zF6=Cz3O&@{;Slo?77c=9yr zk&-0AjM*6bo`SGyf(&W4BPb^z9}AeAB2ssz02zWXGoUDpi>t}|2Oo?1e2QIgv%EPB zo;%woLfSTv)m1yy`cSl}VWP#?Pn8{mMY&>;t)&>f6mEvgMf2f_3f%Lh*hd7h6)$*~ zyMmgZ2F`6HNzIBeA|4DOmI=JJ9maB~f;Xb7iRZTlN^}~IoWu>`jVcg8Qqcu%O*w7L zP`*Hq)qZOXpOix0j8D;D3r3eDYr1msxamNA