Skip to content

Commit

Permalink
Merge branch 'master' into CVS-152290
Browse files Browse the repository at this point in the history
  • Loading branch information
CuriousPanCake authored Oct 18, 2024
2 parents 38e3eee + 03c9ae3 commit 458c6ac
Show file tree
Hide file tree
Showing 64 changed files with 6,578 additions and 54 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/job_pytorch_models_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,17 @@ jobs:
TEST_DEVICE: CPU
USE_SYSTEM_CACHE: False

- name: TorchFX GPTQ Pattern Test
if: ${{ inputs.model_scope == 'precommit' }}
# install torch 2.3.1 as newer is not yet supported by openvino backend
run: |
export PYTHONPATH=${MODEL_HUB_TESTS_INSTALL_DIR}:$PYTHONPATH
python3 -m pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --upgrade --index-url https://download.pytorch.org/whl/cpu
python3 -m pytest ${MODEL_HUB_TESTS_INSTALL_DIR}/transformation_tests/test_gptq_torchfx_transformations.py -m precommit --html=${INSTALL_TEST_DIR}/TEST-torch_gptqpattern_tests.html --self-contained-html -v --tb=short
env:
TEST_DEVICE: CPU
USE_SYSTEM_CACHE: False

- name: Reformat unsupported ops file
if: ${{ inputs.model_scope != 'precommit' && !cancelled()}}
run: |
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@
[submodule "src/plugins/intel_npu/thirdparty/level-zero-ext"]
path = src/plugins/intel_npu/thirdparty/level-zero-ext
url = https://github.com/intel/level-zero-npu-extensions.git
[submodule "src/plugins/intel_npu/thirdparty/yaml-cpp"]
path = src/plugins/intel_npu/thirdparty/yaml-cpp
url = https://github.com/jbeder/yaml-cpp.git
[submodule "thirdparty/telemetry"]
path = thirdparty/telemetry
url = https://github.com/openvinotoolkit/telemetry.git
Expand Down
1 change: 1 addition & 0 deletions scripts/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ set(shellcheck_skip_list
"${OpenVINO_SOURCE_DIR}/thirdparty"
"${OpenVINO_SOURCE_DIR}/src/plugins/intel_cpu/thirdparty"
"${OpenVINO_SOURCE_DIR}/src/plugins/intel_gpu/thirdparty"
"${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/thirdparty"
"${OpenVINO_SOURCE_DIR}/src/bindings/python/thirdparty/pybind11"
"${TEMP}")

Expand Down
188 changes: 136 additions & 52 deletions src/frontends/pytorch/src/transforms/torchfx_gptq_pattern_replacer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,6 @@ uint32_t read_u4_data(const void* array, size_t index) {
return val;
};

// Stores one 4-bit value into a packed u4 buffer that is addressed as
// 32-bit words: eight nibbles fit into each uint32_t, `index` selects the
// nibble. `data` is assumed to already fit in 4 bits (it is not masked
// here), matching the original contract.
void write_u4_data(void* array, size_t index, uint32_t data) {
    uint32_t* words = reinterpret_cast<uint32_t*>(array);
    const size_t word_idx = index / 8;
    const uint32_t shift = static_cast<uint32_t>((index % 8) * 4);
    // Clear the target nibble, then OR the new value into place.
    const uint32_t keep_mask = ~(uint32_t{15} << shift);
    words[word_idx] = (words[word_idx] & keep_mask) | (data << shift);
};

GPTQDecompressionReplacer::GPTQDecompressionReplacer() {
const auto& const_1 = wrap_type<v0::Constant>();
const auto& const_2 = wrap_type<v0::Constant>();
Expand All @@ -73,61 +61,157 @@ GPTQDecompressionReplacer::GPTQDecompressionReplacer() {
const auto& convert_2 = wrap_type<v0::Convert>({const_6});
const auto& bitwise_and = wrap_type<ov::op::v13::BitwiseAnd>({add_or_convert, convert_2});

ov::matcher_pass_callback callback = [unsqueeze_1](Matcher& m) {
ov::matcher_pass_callback callback = [=](Matcher& m) {
auto bitwise_and = m.get_match_root();
if (!bitwise_and) {
return false;
}
const auto& pattern_map = m.get_pattern_value_map();
const auto& input_node = pattern_map.at(unsqueeze_1).get_node_shared_ptr();
auto weights_u32 = std::dynamic_pointer_cast<v0::Constant>(input_node->get_input_node_shared_ptr(0));
auto axis = std::dynamic_pointer_cast<v0::Constant>(input_node->get_input_node_shared_ptr(1));
auto axis_data = axis->get_data_ptr<uint32_t>();

auto u8_shape = weights_u32->get_shape();
auto src = weights_u32->get_data_ptr<uint32_t>();

ov::Shape u4_shape;
bool dim_added = false;
size_t stride = 1;
size_t size_y = 1;
for (size_t i = 0; i < u8_shape.size(); i++) {
if (axis_data[0] == i) {
u4_shape.push_back(8);
dim_added = true;
}
if (axis_data[0] <= i) {
stride *= u8_shape[i];
} else {
size_y *= u8_shape[i];
}
u4_shape.push_back(u8_shape[i]);
auto unsqueeze_1_node = pattern_map.at(unsqueeze_1).get_node_shared_ptr();
auto unsqueeze_1_in0_const =
std::dynamic_pointer_cast<v0::Constant>(unsqueeze_1_node->get_input_node_shared_ptr(0));
auto unsqueeze_1_in1_const =
std::dynamic_pointer_cast<v0::Constant>(unsqueeze_1_node->get_input_node_shared_ptr(1));
auto abs_node = pattern_map.at(abs).get_node_shared_ptr();
auto abs_in_const = std::dynamic_pointer_cast<v0::Constant>(abs_node->get_input_node_shared_ptr(0));
auto broadcast_node = pattern_map.at(broadcast).get_node_shared_ptr();
auto unsqueeze_2_node = pattern_map.at(unsqueeze_2).get_node_shared_ptr();
auto unsqueeze_2_in0_const =
std::dynamic_pointer_cast<v0::Constant>(unsqueeze_2_node->get_input_node_shared_ptr(0));
auto unsqueeze_2_in1_const =
std::dynamic_pointer_cast<v0::Constant>(unsqueeze_2_node->get_input_node_shared_ptr(1));

OutputVector outputs_1(unsqueeze_1_node->get_output_size());
OutputVector unsqueeze_1_inputs(2);
unsqueeze_1_inputs[0] = unsqueeze_1_in0_const->outputs()[0];
unsqueeze_1_inputs[1] = unsqueeze_1_in1_const->outputs()[0];
if (!unsqueeze_1_node->constant_fold(outputs_1, unsqueeze_1_inputs)) {
return false;
}
if (!dim_added) {
u4_shape.push_back(8);

OutputVector outputs_2(abs_node->get_output_size());
if (!abs_node->constant_fold(outputs_2, abs_in_const->outputs())) {
return false;
}

auto new_const = std::make_shared<v0::Constant>(element::u4, u4_shape);
auto dst = const_cast<uint32_t*>(reinterpret_cast<const uint32_t*>(new_const->get_data_ptr()));
OutputVector outputs_3(broadcast_node->get_output_size());
OutputVector broadcast_inputs(2);
broadcast_inputs[0] = outputs_1[0];
broadcast_inputs[1] = outputs_2[0];
if (!broadcast_node->constant_fold(outputs_3, broadcast_inputs)) {
return false;
}

OutputVector outputs_4(unsqueeze_2_node->get_output_size());
OutputVector unsqueeze_2_inputs(2);
unsqueeze_2_inputs[0] = unsqueeze_2_in0_const->outputs()[0];
unsqueeze_2_inputs[1] = unsqueeze_2_in1_const->outputs()[0];
if (!unsqueeze_2_node->constant_fold(outputs_4, unsqueeze_2_inputs)) {
return false;
}
const int32_t* rs_in0 =
std::dynamic_pointer_cast<v0::Constant>(outputs_3[0].get_node_shared_ptr())->get_data_ptr<int32_t>();
const int32_t* rs_in1 =
std::dynamic_pointer_cast<v0::Constant>(outputs_4[0].get_node_shared_ptr())->get_data_ptr<int32_t>();
auto shifted_const = std::make_shared<v0::Constant>(element::i32, outputs_3[0].get_shape());
auto dst = const_cast<int32_t*>(reinterpret_cast<const int32_t*>(shifted_const->get_data_ptr()));
if (!dst)
return false;

size_t in_idx = 0;
for (size_t y = 0; y < size_y; y++) {
size_t offset = y * stride * 8;
for (size_t x = 0; x < stride; x++) {
for (size_t z = 0; z < 8; z++) {
uint32_t val = read_u4_data(src, in_idx);
write_u4_data(dst, (offset + x + stride * z), val);
in_idx++;
}
// TODO: Bitwise right shift operation below might need to be
// optimized to reduce FIL.
size_t rs_in0_shape_size = shape_size(outputs_3[0].get_shape());
const auto& rs_in0_shape = outputs_3[0].get_shape();
const auto& rs_in1_shape = outputs_4[0].get_shape();
int shift_dim = -1;
size_t shift_offset = 1;
for (size_t i = 0; i < rs_in1_shape.size(); ++i) {
size_t dim = rs_in1_shape[i];
if (dim != 1 && dim != rs_in0_shape[i]) {
return false;
}
if (shift_dim != -1) {
shift_offset *= rs_in0_shape[i];
}
if (dim == rs_in0_shape[i]) {
shift_dim = static_cast<int>(i);
}
}
if (shift_dim == -1)
return false;
for (size_t k = 0; k < rs_in0_shape_size; ++k) {
size_t shift_idx = (k / shift_offset) % rs_in1_shape[shift_dim];
int32_t shift_val = rs_in1[shift_idx];
dst[k] = (rs_in0[k] >> shift_val);
}

std::shared_ptr<ov::Node> convert_1_node = nullptr;
OutputVector outputs_7;
if (pattern_map.find(convert_1) != pattern_map.end()) {
convert_1_node = pattern_map.at(convert_1).get_node_shared_ptr();
outputs_7.resize(convert_1_node->get_output_size());
if (!convert_1_node->constant_fold(outputs_7, shifted_const->outputs())) {
return false;
}
} else {
auto convert_3_node = pattern_map.at(convert_3).get_node_shared_ptr();
auto convert_4_node = pattern_map.at(convert_4).get_node_shared_ptr();
auto convert_4_in_const =
std::dynamic_pointer_cast<v0::Constant>(convert_4_node->get_input_node_shared_ptr(0));
auto add_node = pattern_map.at(add).get_node_shared_ptr();
OutputVector outputs_5(convert_3_node->get_output_size());
if (!convert_3_node->constant_fold(outputs_5, shifted_const->outputs())) {
return false;
}
OutputVector outputs_6(convert_4_node->get_output_size());
if (!convert_4_node->constant_fold(outputs_6, convert_4_in_const->outputs())) {
return false;
}
outputs_7.resize(add_node->get_output_size());
OutputVector add_inputs(2);
add_inputs[0] = outputs_5[0];
add_inputs[1] = outputs_6[0];
if (!add_node->constant_fold(outputs_7, add_inputs)) {
return false;
}
}

copy_runtime_info_and_name(weights_u32, {new_const}, {weights_u32, bitwise_and});
auto convert_2_node = pattern_map.at(convert_2).get_node_shared_ptr();
auto convert_2_in_const = std::dynamic_pointer_cast<v0::Constant>(convert_2_node->get_input_node_shared_ptr(0));

OutputVector outputs_8(convert_2_node->get_output_size());
if (!convert_2_node->constant_fold(outputs_8, convert_2_in_const->outputs())) {
return false;
}

OutputVector outputs_9(bitwise_and->get_output_size());

const int8_t* and_in0 =
std::dynamic_pointer_cast<v0::Constant>(outputs_7[0].get_node_shared_ptr())->get_data_ptr<int8_t>();
const int8_t* and_in1 =
std::dynamic_pointer_cast<v0::Constant>(outputs_8[0].get_node_shared_ptr())->get_data_ptr<int8_t>();
auto masked_const = std::make_shared<v0::Constant>(element::i8, outputs_7[0].get_shape());
auto masked_dst = const_cast<int8_t*>(reinterpret_cast<const int8_t*>(masked_const->get_data_ptr()));
if (!masked_dst)
return false;

size_t and_in0_shape_size = shape_size(outputs_7[0].get_shape());
// TODO: Bitwise and operation below might need to be
// optimized to reduce FIL.
int8_t mask = and_in1[0];
for (size_t k = 0; k < and_in0_shape_size; ++k) {
masked_dst[k] = (and_in0[k] & mask);
}

auto convert_to_u4 = std::make_shared<v0::Convert>(masked_const, element::u4);
OutputVector outputs_10(convert_to_u4->get_output_size());
if (!convert_to_u4->constant_fold(outputs_10, masked_const->outputs())) {
return false;
}

auto new_convert = std::make_shared<v0::Convert>(new_const, bitwise_and->get_output_element_type(0));
copy_runtime_info_and_name(bitwise_and, {new_convert}, {input_node});
auto new_convert =
std::make_shared<v0::Convert>(outputs_10[0].get_node_shared_ptr(), bitwise_and->get_output_element_type(0));
copy_runtime_info_and_name(bitwise_and, {new_convert}, {unsqueeze_1_node});
replace_node(bitwise_and, new_convert);
return true;
};
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/debug_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ NodeDebugHelper::NodeDebugHelper(const primitive_inst& inst)
debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) {
std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + layer_name + ":";
for (size_t i = 0; i < m_inst.dependencies().size(); i++) {
std::string name = get_file_prefix() + layer_name + "_src" + std::to_string(i);
std::string name = get_file_prefix() + "_src" + std::to_string(i);
auto input_mem = m_inst.dep_memory_ptr(i);
if (input_mem == nullptr) {
GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl;
Expand Down
2 changes: 2 additions & 0 deletions src/plugins/intel_npu/cmake/features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ if(NOT BUILD_SHARED_LIBS AND NOT ENABLE_MLIR_COMPILER AND NOT ENABLE_DRIVER_COMP
endif()

# InferenceManagerDemo-based NPU abstraction-layer backend; forced OFF
# on Windows and when cross-compiling (dependency list below), OFF otherwise
# by default.
ov_dependent_option(ENABLE_IMD_BACKEND "Enable InferenceManagerDemo based NPU AL backend" OFF "NOT WIN32;NOT CMAKE_CROSSCOMPILING" OFF)

# Protopipe tool: defaults to ON, but only when ENABLE_INTEL_NPU_INTERNAL
# is set; forced OFF otherwise. Gates the yaml-cpp submodule build and the
# tools/protopipe subdirectory.
ov_dependent_option(ENABLE_INTEL_NPU_PROTOPIPE "Enable Intel NPU Protopipe tool" ON "ENABLE_INTEL_NPU_INTERNAL" OFF)
12 changes: 12 additions & 0 deletions src/plugins/intel_npu/thirdparty/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,15 @@ if(ENABLE_ZEROAPI_BACKEND)
add_library(LevelZero::NPUExt ALIAS level-zero-ext)
install(TARGETS level-zero-ext EXPORT "${PROJECT_NAME}Targets")
endif()

#
# yaml-cpp — third-party YAML parser, linked by the Protopipe tool
# (see tools/protopipe/CMakeLists.txt LINK_LIBRARIES)
#

if(ENABLE_INTEL_NPU_PROTOPIPE)
# EXCLUDE_FROM_ALL: yaml-cpp targets are built only when something
# (protopipe) actually links against them.
add_subdirectory(yaml-cpp EXCLUDE_FROM_ALL)
# NB: Suppress warnings in yaml-cpp
if(SUGGEST_OVERRIDE_SUPPORTED)
target_compile_options(yaml-cpp PRIVATE -Wno-suggest-override)
endif()
endif()
1 change: 1 addition & 0 deletions src/plugins/intel_npu/thirdparty/yaml-cpp
Submodule yaml-cpp added at da82fd
4 changes: 4 additions & 0 deletions src/plugins/intel_npu/tools/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,7 @@
# NPU tool subdirectories; "common" presumably holds shared helpers for the
# tools below — TODO confirm.
add_subdirectory(common)
add_subdirectory(compile_tool)
add_subdirectory(single-image-test)

# Protopipe is optional: gated by ENABLE_INTEL_NPU_PROTOPIPE
# (an ov_dependent_option declared in cmake/features.cmake).
if (ENABLE_INTEL_NPU_PROTOPIPE)
add_subdirectory(protopipe)
endif()
72 changes: 72 additions & 0 deletions src/plugins/intel_npu/tools/protopipe/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#
# Copyright (C) 2023-2024 Intel Corporation.
# SPDX-License-Identifier: Apache 2.0
#

set(TARGET_NAME protopipe)

# Standalone mode: when this directory is configured on its own (no enclosing
# OpenVINO project has called project() yet), bootstrap a minimal project and
# delegate everything to cmake/standalone.cmake.
if(NOT DEFINED PROJECT_NAME)
    cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
    project(protopipe_standalone)
    include("cmake/standalone.cmake")
    return()
endif()

#
# Dependencies
#

# G-API from OpenCV >= 4.9 is a hard requirement; quietly skip building the
# tool (with a status message) when it is unavailable. The explicit
# OpenCV_FOUND check avoids comparing an undefined OpenCV_VERSION.
find_package(OpenCV QUIET COMPONENTS gapi)
if(NOT OpenCV_FOUND OR OpenCV_VERSION VERSION_LESS 4.9)
    message(STATUS "NPU ${TARGET_NAME} tool is disabled due to missing dependencies: gapi from OpenCV >= 4.9.")
    return()
endif()

if(WIN32)
    # WA: add_tool_target expects to have all dependencies as cmake targets.
    add_library(winmm INTERFACE)
    target_link_libraries(winmm INTERFACE "winmm.lib")
endif()

#
# Define the target
#

set(PROTOPIPE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)

# NOTE(review): winmm is only created as a target on WIN32 above — presumably
# ov_add_target tolerates/filters nonexistent entries on other platforms;
# confirm against the ov_add_target implementation.
ov_add_target(ADD_CPPLINT
              TYPE EXECUTABLE
              NAME ${TARGET_NAME}
              ROOT ${CMAKE_CURRENT_SOURCE_DIR}
              ADDITIONAL_SOURCE_DIRS ${PROTOPIPE_SOURCE_DIR}
              INCLUDES ${PROTOPIPE_SOURCE_DIR}
              LINK_LIBRARIES
                  PRIVATE
                      Threads::Threads
                      gflags
                      yaml-cpp
                      openvino::runtime
                      opencv_gapi
                      winmm)

# FOLDER groups the target in IDE solution views; quoted in case the source
# path contains spaces.
set_target_properties(${TARGET_NAME} PROPERTIES
                      FOLDER "${CMAKE_CURRENT_SOURCE_DIR}"
                      CXX_STANDARD 17)

#
# Install
#

install(TARGETS ${TARGET_NAME}
        RUNTIME DESTINATION "tools/${TARGET_NAME}"
        COMPONENT ${NPU_INTERNAL_COMPONENT}
        ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL})

if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
    install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/README.md"
            DESTINATION "tools/${TARGET_NAME}"
            COMPONENT ${NPU_INTERNAL_COMPONENT}
            ${OV_CPACK_COMP_NPU_INTERNAL_EXCLUDE_ALL})
endif()
Loading

0 comments on commit 458c6ac

Please sign in to comment.