Skip to content

Commit

Permalink
Neon: Intrinsics impl of h2v1 & h2v2 merged upsamp
Browse files Browse the repository at this point in the history
There was no previous GAS implementation.

This commit also reverts 40557b2 and
7723d7f.
7723d7f was only necessary because
there was no Neon implementation of merged upsampling/color conversion,
and 40557b2 was only necessary because
of 7723d7f.
  • Loading branch information
jwright-arm authored and dcommander committed Nov 11, 2020
1 parent 240ba41 commit ba52a3d
Show file tree
Hide file tree
Showing 8 changed files with 1,029 additions and 47 deletions.
38 changes: 4 additions & 34 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -800,29 +800,7 @@ else()
set(MD5_PPM_3x2_IFAST fd283664b3b49127984af0a7f118fccd)
set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
# Since v1.5.1, libjpeg-turbo uses the separate non-fancy upsampling and
# YCbCr -> RGB color conversion routines rather than merged upsampling/color
# conversion when fancy upsampling is disabled on platforms that have a SIMD
# implementation of YCbCr -> RGB color conversion but no SIMD implementation
# of merged upsampling/color conversion. This was intended to improve the
# performance of the Arm Neon SIMD extensions, the only SIMD extensions for
# which those circumstances currently apply. The separate non-fancy
# upsampling and color conversion routines usually produce bitwise-identical
# output to the merged upsampling/color conversion routines, but that is not
# the case when skipping scanlines starting at an odd-numbered scanline. In
# libjpeg-turbo 2.0.5 and prior, doing that while using merged h2v2
# upsampling caused a segfault, so this test validates the fix for that
# segfault. Unfortunately, however, the test also produces different bitwise
# output when using the Neon SIMD extensions, because of the aforementioned
# optimization. The easiest workaround is to use the old test from
# libjpeg-turbo 2.0.5 and prior when using the Neon SIMD extensions. The
# aforementioned segfault never would have occurred with the Neon SIMD
# extensions anyhow, since merged upsampling is disabled when using them.
if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
set(MD5_PPM_420M_IFAST_ARI 72b59a99bcf1de24c5b27d151bde2437)
else()
set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
endif()
set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
set(MD5_JPEG_420_ISLOW 9a68f56bc76e466aa7e52f415d0f4a5f)
set(MD5_PPM_420M_ISLOW_2_1 9f9de8c0612f8d06869b960b05abf9c9)
set(MD5_PPM_420M_ISLOW_15_8 b6875bc070720b899566cc06459b63b7)
Expand Down Expand Up @@ -1223,17 +1201,9 @@ foreach(libtype ${TEST_LIBTYPES})

if(WITH_ARITH_DEC)
# CC: RGB->YCC SAMP: h2v2 merged IDCT: ifast ENT: arith
if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
# Refer to the comment above the definition of MD5_PPM_420M_IFAST_ARI for
# an explanation of why this is necessary.
add_bittest(djpeg 420m-ifast-ari "-fast;-ppm"
testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
${MD5_PPM_420M_IFAST_ARI})
else()
add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
${MD5_PPM_420M_IFAST_ARI})
endif()
add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
${MD5_PPM_420M_IFAST_ARI})

add_bittest(jpegtran 420-islow ""
testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg
Expand Down
12 changes: 0 additions & 12 deletions jdmaster.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include "jpeglib.h"
#include "jpegcomp.h"
#include "jdmaster.h"
#include "jsimd.h"


/*
Expand Down Expand Up @@ -70,17 +69,6 @@ use_merged_upsample(j_decompress_ptr cinfo)
cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
return FALSE;
#ifdef WITH_SIMD
/* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
isn't, then disabling merged upsampling is likely to be faster when
decompressing YCbCr JPEG images. */
if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
(cinfo->out_color_space == JCS_RGB ||
(cinfo->out_color_space >= JCS_EXT_RGB &&
cinfo->out_color_space <= JCS_EXT_ARGB)))
return FALSE;
#endif
/* ??? also need to test for upsample-time rescaling, when & if supported */
return TRUE; /* by golly, it'll work... */
#else
Expand Down
3 changes: 2 additions & 1 deletion simd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,8 @@ endif()
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/gastest.S)

set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
arm/jdsample-neon.c arm/jfdctfst-neon.c arm/jquanti-neon.c)
arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
arm/jquanti-neon.c)
if(NEON_INTRINSICS)
set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
endif()
Expand Down
84 changes: 84 additions & 0 deletions simd/arm/aarch32/jsimd.c
Original file line number Diff line number Diff line change
Expand Up @@ -501,25 +501,109 @@ jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
init_simd();

/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;

if (simd_support & JSIMD_NEON)
return 1;

return 0;
}

GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
init_simd();

/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;

if (simd_support & JSIMD_NEON)
return 1;

return 0;
}

GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
}

neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}

GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
}

neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}

GLOBAL(int)
Expand Down
84 changes: 84 additions & 0 deletions simd/arm/aarch64/jsimd.c
Original file line number Diff line number Diff line change
Expand Up @@ -569,25 +569,109 @@ jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
GLOBAL(int)
jsimd_can_h2v2_merged_upsample(void)
{
init_simd();

/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;

if (simd_support & JSIMD_NEON)
return 1;

return 0;
}

GLOBAL(int)
jsimd_can_h2v1_merged_upsample(void)
{
init_simd();

/* The code is optimised for these values only */
if (BITS_IN_JSAMPLE != 8)
return 0;
if (sizeof(JDIMENSION) != 4)
return 0;

if (simd_support & JSIMD_NEON)
return 1;

return 0;
}

GLOBAL(void)
jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
break;
}

neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}

GLOBAL(void)
jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
{
void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);

switch (cinfo->out_color_space) {
case JCS_EXT_RGB:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
case JCS_EXT_RGBX:
case JCS_EXT_RGBA:
neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
break;
case JCS_EXT_BGR:
neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
break;
case JCS_EXT_BGRX:
case JCS_EXT_BGRA:
neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
break;
case JCS_EXT_XBGR:
case JCS_EXT_ABGR:
neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
break;
case JCS_EXT_XRGB:
case JCS_EXT_ARGB:
neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
break;
default:
neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
break;
}

neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
}

GLOBAL(int)
Expand Down
Loading

0 comments on commit ba52a3d

Please sign in to comment.