Skip to content

Commit

Permalink
Fix QB4 segfault on Sandy Bridge/Ivy Bridge
Browse files: browse the repository at this point in the history
- Replace aligned vksum load with unaligned for QB4

PiperOrigin-RevId: 724769160
  • Loading branch information
fbarchard authored and xnnpack-bot committed Feb 9, 2025
1 parent 3acb27f commit e55b399
Show file tree
Hide file tree
Showing 25 changed files with 25 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__avx_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));

__m128 vinput_zero_point0_float = _mm_cvtepi32_ps(vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__avx_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));

__m128 vinput_zero_point0_float = _mm_cvtepi32_ps(vinput_zero_point0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse2_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
__m128i vinput_zero_point0 = _mm_cvtsi32_si128(*((const int*) &quantization_params[0].zero_point));
vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point0, _MM_SHUFFLE(0, 0, 0, 0));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse2_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
__m128i vinput_zero_point0 = _mm_cvtsi32_si128(*((const int*) &quantization_params[0].zero_point));
vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point0, _MM_SHUFFLE(0, 0, 0, 0));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse41_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
__m128i vinput_zero_point0 = _mm_cvtsi32_si128(*((const int*) &quantization_params[0].zero_point));
vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point0, _MM_SHUFFLE(0, 0, 0, 0));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_1x4c8__sse41_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
__m128i vinput_zero_point0 = _mm_cvtsi32_si128(*((const int*) &quantization_params[0].zero_point));
vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point0, _MM_SHUFFLE(0, 0, 0, 0));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__avx_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));
const __m128i vinput_zero_point1 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[1].zero_point));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__avx_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));
const __m128i vinput_zero_point1 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[1].zero_point));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse2_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse2_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse41_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_2x4c8__sse41_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__avx_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));
const __m128i vinput_zero_point1 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[1].zero_point));
const __m128i vinput_zero_point2 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[2].zero_point));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__avx_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));
const __m128i vinput_zero_point1 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[1].zero_point));
const __m128i vinput_zero_point2 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[2].zero_point));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse2_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse2_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse41_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_3x4c8__sse41_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__avx_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));
const __m128i vinput_zero_point1 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[1].zero_point));
const __m128i vinput_zero_point2 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[2].zero_point));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__avx_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point0 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[0].zero_point));
const __m128i vinput_zero_point1 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[1].zero_point));
const __m128i vinput_zero_point2 = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[2].zero_point));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse2_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse2_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse41_ld128(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ void xnn_qd8_f32_qb4w_gemm_minmax_ukernel_4x4c8__sse41_ld64(
XNN_FORCE_REALIZATION(vmask);

do {
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
const __m128i vinput_zero_point01 = _mm_loadu_si128((const __m128i*) &quantization_params[0]);
const __m128i vinput_zero_point0 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(0, 0, 0, 0));
const __m128i vinput_zero_point1 = _mm_shuffle_epi32(vinput_zero_point01, _MM_SHUFFLE(2, 2, 2, 2));
Expand Down
7 changes: 1 addition & 6 deletions src/qs8-gemm/MRx4c8-sse.c.in
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ void xnn_${DATATYPE_SPEC}_gemm${GEMM_SUFFIX}_minmax${REQUANTIZATION_SPEC}_ukerne
__m128i vacc${M}x2 = _mm_unpacklo_epi64(vksum23${M}, vzero);
__m128i vacc${M}x3 = _mm_unpackhi_epi64(vksum23${M}, vzero);
$elif DATATYPE == "QB4":
- const __m128 vksum = _mm_load_ps((const float*) w);
+ const __m128 vksum = _mm_loadu_ps((const float*) w);
$if AVX:
$for M in range(MR):
const __m128i vinput_zero_point${M} = _mm_castps_si128(_mm_broadcast_ss((const float*) &quantization_params[${M}].zero_point));
Expand Down Expand Up @@ -404,11 +404,6 @@ void xnn_${DATATYPE_SPEC}_gemm${GEMM_SUFFIX}_minmax${REQUANTIZATION_SPEC}_ukerne
$# SSE 2 only
$else:
${_}const __m128i vxb${N} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${N}, vb${N}), 8);
- $elif VARIANT == "EXTENDED":
- $if N == 0:
- ${_}const __m128i vxb${N} = _mm_load_si128((const __m128i*) w);
- $else:
- ${_}const __m128i vxb${N} = _mm_load_si128((const __m128i*) ((const int16_t*) w + ${N * 8}));

$for M in range(MR):
${_}vacc${M}x${N} = _mm_add_epi32(vacc${M}x${N}, _mm_madd_epi16(vxa${M}, vxb${N}));
Expand Down

0 comments on commit e55b399

Please sign in to comment.