Skip to content

Commit

Permalink
NEON SIMD code.
Browse files Browse the repository at this point in the history
In tests on my pi3b+, a release build of my ghostscript integration
takes 2 minutes 27 seconds to render a PDF and OCR it with the
vanilla sources. With this NEON coded added the time drops to 37
seconds.

I have not tested the configure/Makefile changes as I'm not using
them.
  • Loading branch information
robinwatts committed May 20, 2020
1 parent 3a3c41d commit f79e52a
Show file tree
Hide file tree
Showing 7 changed files with 222 additions and 0 deletions.
7 changes: 7 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,13 @@ libtesseract_la_LIBADD += libtesseract_sse.la
noinst_LTLIBRARIES += libtesseract_sse.la
endif

if HAVE_NEON
libtesseract_neon_la_CXXFLAGS = -mfpu=neon
libtesseract_neon_la_SOURCES = src/arch/intsimdmatrixneon.cpp
libtesseract_la_LIBADD += libtesseract_neon.la
noinst_LTLIBRARIES += libtesseract_neon.la
endif

libtesseract_la_SOURCES += src/arch/intsimdmatrix.cpp
libtesseract_la_SOURCES += src/arch/simddetect.cpp

Expand Down
6 changes: 6 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ if $sse41; then
AC_DEFINE([HAVE_SSE4_1], [1], [Enable SSE 4.1 instructions])
fi

AX_CHECK_COMPILE_FLAG([-mfpu=neon], [neon=true], [neon=false], [$WERROR])
AM_CONDITIONAL([HAVE_NEON], $neon)
if $neon; then
AC_DEFINE([HAVE_NEON], [1], [Enable NEON instructions])
fi

AX_CHECK_COMPILE_FLAG([-march=native], [arch_native=true], [arch_native=false], [$WERROR])
AM_CONDITIONAL([MARCH_NATIVE_OPT], $arch_native)

Expand Down
1 change: 1 addition & 0 deletions src/api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ static void PrintVersionInfo() {
if (tesseract::SIMDDetect::IsAVXAvailable()) printf(" Found AVX\n");
if (tesseract::SIMDDetect::IsFMAAvailable()) printf(" Found FMA\n");
if (tesseract::SIMDDetect::IsSSEAvailable()) printf(" Found SSE\n");
if (tesseract::SIMDDetect::IsNEONAvailable()) printf(" Found NEON\n");
#ifdef _OPENMP
printf(" Found OpenMP %d\n", _OPENMP);
#endif
Expand Down
1 change: 1 addition & 0 deletions src/arch/intsimdmatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ struct IntSimdMatrix {
static const IntSimdMatrix* intSimdMatrix;
static const IntSimdMatrix intSimdMatrixAVX2;
static const IntSimdMatrix intSimdMatrixSSE;
static const IntSimdMatrix intSimdMatrixNEON;
};

} // namespace tesseract
Expand Down
161 changes: 161 additions & 0 deletions src/arch/intsimdmatrixneon.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
///////////////////////////////////////////////////////////////////////
// File: intsimdmatrixneon.cpp
// Description: matrix-vector product for 8-bit data on neon.
// Author: Robin Watts (from the AVX2 original by Ray Smith)
//
// (C) Copyright 2017, Google Inc.
// (C) Copyright 2020, Artifex Software Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#if !defined(__ARM_NEON)
//#error Implementation only for NEON capable architectures
#else

#include "intsimdmatrix.h"

#include "arm_neon.h"
#include <cstdint>
#include <algorithm>
#include <vector>

namespace tesseract {

// Number of outputs held in each register. (Actually, we use a
// pair of 4x32 registers, so 8 x 32 bit ints).
constexpr int kNumOutputsPerRegister = 8;
// Maximum number of registers that we will use.
constexpr int kMaxOutputRegisters = 1;
// Number of inputs in the inputs register.
constexpr int kNumInputsPerRegister = 8;
// Number of inputs in each weight group.
constexpr int kNumInputsPerGroup = 8;

// Function to compute part of a matrix.vector multiplication. The weights
// are in a very specific order (see above) in w, which is multiplied by
// u of length num_in, to produce output v after scaling the integer results
// by the corresponding member of scales.
// The amount of w and scales consumed is fixed and not available to the
// caller.

// Computes part of matrix.vector v = Wu. Computes N=8 results.
// The weights *must* be arranged so that consecutive reads from wi
// provides (num_in/kNumInputsPerGroup groups of (N output dim groups of
// (kNumInputsPerGroup inputs))). After that there must be N consecutive
// bias weights, before continuing with any more weights.
// u must be padded out with zeros to
// kNumInputsPerGroup*ceil(num_in/kNumInputsPerGroup) elements.
static void PartialMatrixDotVector8(const int8_t* __restrict wi, const double* __restrict scales,
const int8_t* __restrict u, int num_in,
double* __restrict v) {
// Initialize all the results to 0.
int32x4_t result0123 = { 0, 0, 0, 0 };
int32x4_t result4567 = { 0, 0, 0, 0 };
// Iterate over the input (u), one registerful at a time.
for (int j = 0; j < num_in; j += 8) {
int8x8_t vu = vld1_s8(u); // vu = u0 u1 u2 u3 u4 u5 u6 u7
int8x16_t vw01 = vld1q_s8(wi); // vw0 = w00 w01 w02 w03 w04 w05 w06 w07 w10 w11 w12 w13 w14 w15 w16 w17
int8x16_t vw23 = vld1q_s8(wi+8*2); // vw2 = w20 w21 w22 w23 w24 w25 w26 w27 w30 w31 w32 w33 w34 w35 w36 w37
int8x16_t vw45 = vld1q_s8(wi+8*4); // vw4 = w40 w41 w42 w43 w44 w45 w46 w47 w50 w51 w52 w53 w54 w55 w56 w57
int8x16_t vw67 = vld1q_s8(wi+8*6); // vw6 = w60 w61 w62 w63 w64 w65 w66 w67 w70 w71 w72 w73 w74 w75 w76 w77

int16x8_t vrow0q = vmull_s8(vget_low_s8(vw01), vu); // vrow0q = vw00.u0 w01.u1 w02.u2 w03.u3 vw04.u4 w05.u5 w06.u6 w07.u7
int16x8_t vrow1q = vmull_s8(vget_high_s8(vw01), vu); // vrow1q = vw10.u0 w11.u1 w12.u2 w13.u3 vw14.u4 w15.u5 w16.u6 w17.u7
int16x8_t vrow2q = vmull_s8(vget_low_s8(vw23), vu); // vrow2q = vw20.u0 w21.u1 w22.u2 w23.u3 vw24.u4 w25.u5 w26.u6 w27.u7
int16x8_t vrow3q = vmull_s8(vget_high_s8(vw23), vu); // vrow3q = vw30.u0 w31.u1 w32.u2 w33.u3 vw34.u4 w35.u5 w36.u6 w37.u7
int16x8_t vrow4q = vmull_s8(vget_low_s8(vw45), vu); // vrow4q = vw40.u0 w41.u1 w42.u2 w43.u3 vw44.u4 w45.u5 w46.u6 w47.u7
int16x8_t vrow5q = vmull_s8(vget_high_s8(vw45), vu); // vrow5q = vw50.u0 w51.u1 w52.u2 w53.u3 vw54.u4 w55.u5 w56.u6 w57.u7
int16x8_t vrow6q = vmull_s8(vget_low_s8(vw67), vu); // vrow6q = vw60.u0 w61.u1 w62.u2 w63.u3 vw64.u4 w65.u5 w66.u6 w67.u7
int16x8_t vrow7q = vmull_s8(vget_high_s8(vw67), vu); // vrow7q = vw70.u0 w71.u1 w72.u2 w73.u3 vw74.u4 w75.u5 w76.u6 w77.u7

int32x4_t vrow0q2 = vpaddlq_s16(vrow0q); // vrow0q2 = vw00.u0+w01.u1 w02.u2+w03.u3 vw04.u4+w05.u5 w06.u6+w07.u7
int32x4_t vrow1q2 = vpaddlq_s16(vrow1q); // vrow1q2 = vw10.u0+w11.u1 w12.u2+w13.u3 vw14.u4+w15.u5 w16.u6+w17.u7
int32x4_t vrow2q2 = vpaddlq_s16(vrow2q); // vrow2q2 = vw20.u0+w21.u1 w22.u2+w23.u3 vw24.u4+w25.u5 w26.u6+w27.u7
int32x4_t vrow3q2 = vpaddlq_s16(vrow3q); // vrow3q2 = vw30.u0+w31.u1 w32.u2+w33.u3 vw34.u4+w35.u5 w36.u6+w37.u7
int32x4_t vrow4q2 = vpaddlq_s16(vrow4q); // vrow4q2 = vw40.u0+w41.u1 w42.u2+w43.u3 vw44.u4+w45.u5 w46.u6+w47.u7
int32x4_t vrow5q2 = vpaddlq_s16(vrow5q); // vrow5q2 = vw50.u0+w51.u1 w52.u2+w53.u3 vw54.u4+w55.u5 w56.u6+w57.u7
int32x4_t vrow6q2 = vpaddlq_s16(vrow6q); // vrow6q2 = vw60.u0+w61.u1 w62.u2+w63.u3 vw64.u4+w65.u5 w66.u6+w67.u7
int32x4_t vrow7q2 = vpaddlq_s16(vrow7q); // vrow7q2 = vw70.u0+w71.u1 w72.u2+w73.u3 vw74.u4+w75.u5 w76.u6+w77.u7

vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
vpadd_s32(vget_low_s32(vrow1q2), vget_high_s32(vrow1q2)));
// vrow0q2 = vw00.u0+...+w03.u3 vw04.u4+...+w07.u7 vw10.u0+...+w13.u3 vw14.u4+...+w17.u7
vrow2q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)),
vpadd_s32(vget_low_s32(vrow3q2), vget_high_s32(vrow3q2)));
// vrow0q2 = vw20.u0+...+w23.u3 vw24.u4+...+w27.u7 vw30.u0+...+w33.u3 vw34.u4+...+w37.u7
vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
vpadd_s32(vget_low_s32(vrow5q2), vget_high_s32(vrow5q2)));
// vrow0q2 = vw40.u0+...+w43.u3 vw44.u4+...+w47.u7 vw50.u0+...+w53.u3 vw54.u4+...+w57.u7
vrow6q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)),
vpadd_s32(vget_low_s32(vrow7q2), vget_high_s32(vrow7q2)));
// vrow0q2 = vw60.u0+...+w63.u3 vw64.u4+...+w67.u7 vw70.u0+...+w73.u3 vw74.u4+...+w77.u7

vrow0q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow0q2), vget_high_s32(vrow0q2)),
vpadd_s32(vget_low_s32(vrow2q2), vget_high_s32(vrow2q2)));
// vrow0q2 = vw00.u0+...+w07.u7 vw10.u0+...+w17.u7 vw20.u0+...+w27.u7 vw30.u0+...+w37.u7
vrow4q2 = vcombine_s32(vpadd_s32(vget_low_s32(vrow4q2), vget_high_s32(vrow4q2)),
vpadd_s32(vget_low_s32(vrow6q2), vget_high_s32(vrow6q2)));
// vrow0q2 = vw40.u0+...+w47.u7 vw50.u0+...+w57.u7 vw60.u0+...+w67.u7 vw70.u0+...+w77.u7

result0123 = vaddq_s32(result0123, vrow0q2);
result4567 = vaddq_s32(result4567, vrow4q2);
u += 8;
wi += 64;
}
*v++ = (static_cast<double>(vget_lane_s32(vget_low_s32 (result0123), 0)) / INT8_MAX + *wi++) * *scales++;
*v++ = (static_cast<double>(vget_lane_s32(vget_low_s32 (result0123), 1)) / INT8_MAX + *wi++) * *scales++;
*v++ = (static_cast<double>(vget_lane_s32(vget_high_s32(result0123), 0)) / INT8_MAX + *wi++) * *scales++;
*v++ = (static_cast<double>(vget_lane_s32(vget_high_s32(result0123), 1)) / INT8_MAX + *wi++) * *scales++;
*v++ = (static_cast<double>(vget_lane_s32(vget_low_s32 (result4567), 0)) / INT8_MAX + *wi++) * *scales++;
*v++ = (static_cast<double>(vget_lane_s32(vget_low_s32 (result4567), 1)) / INT8_MAX + *wi++) * *scales++;
*v++ = (static_cast<double>(vget_lane_s32(vget_high_s32(result4567), 0)) / INT8_MAX + *wi++) * *scales++;
*v = (static_cast<double>(vget_lane_s32(vget_high_s32(result4567), 1)) / INT8_MAX + *wi ) * *scales;
}

static void matrixDotVector(int dim1, int dim2, const int8_t* wi,
const double* scales, const int8_t* u, double* v) {
const int num_out = dim1;
const int num_in = dim2 - 1;
// Each call to a partial_func_ produces group_size outputs, except the
// last one, which can produce less.
const int rounded_num_in =
IntSimdMatrix::Roundup(num_in, kNumInputsPerGroup);
const int rounded_num_out =
IntSimdMatrix::Roundup(num_out, kNumOutputsPerRegister);
int group_size = kNumOutputsPerRegister * kMaxOutputRegisters;
int output = 0;

int w_step = (rounded_num_in + 1) * group_size;

for (; output + group_size <= rounded_num_out; output += group_size) {
PartialMatrixDotVector8(wi, scales, u, rounded_num_in, v);
wi += w_step;
scales += group_size;
v += group_size;
}
}

const IntSimdMatrix IntSimdMatrix::intSimdMatrixNEON = {
// Function.
matrixDotVector,
// Number of 32 bit outputs held in each register.
kNumOutputsPerRegister,
// Maximum number of registers that we will use to hold outputs.
kMaxOutputRegisters,
// Number of 8 bit inputs in the inputs register.
kNumInputsPerRegister,
// Number of inputs in each weight group.
kNumInputsPerGroup
};

} // namespace tesseract.

#endif
40 changes: 40 additions & 0 deletions src/arch/simddetect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@
#endif
#endif

#ifdef HAVE_NEON
#ifdef ANDROID
#include <cpufeatures.h>
#else
/* Assume linux */
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
#endif

namespace tesseract {

// Computes and returns the dot product of the two n-vectors u and v.
Expand Down Expand Up @@ -65,6 +75,8 @@ bool SIMDDetect::avx512BW_available_;
bool SIMDDetect::fma_available_;
// If true, then SSe4.1 has been detected.
bool SIMDDetect::sse_available_;
// If true, then Neon has been detected.
bool SIMDDetect::neon_available_;

// Computes and returns the dot product of the two n-vectors u and v.
static double DotProductGeneric(const double* u, const double* v, int n) {
Expand Down Expand Up @@ -160,6 +172,29 @@ SIMDDetect::SIMDDetect() {
#else
#error "I don't know how to test for SIMD with this compiler"
#endif
#endif

#ifdef HAVE_NEON
#ifdef ANDROID
{
AndroidCpuFamily family = android_getCpuFamily();
if (family == ANDROID_CPU_FAMILY_ARM)
neon_available_ = (android_getCpuFeatures() &
ANDROID_CPU_ARM_FEATURE_NEON);
#if 0
/* There is no NEON flag to test on ARM64. A wild guess would have
* it being the ASIMD flag, but I'll leave this disabled for now. */
else if (family == ANDROID_CPU_FAMILY_ARM64)
neon_available_ = (android_getCpuFeatures() &
ANDROID_CPU_ARM64_FEATURE_ASIMD);
#endif
else
neon_available_ = false;
}
#else
/* Assume linux */
neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
#endif
#endif

// Select code for calculation of dot product based on autodetection.
Expand All @@ -179,6 +214,11 @@ SIMDDetect::SIMDDetect() {
} else if (sse_available_) {
// SSE detected.
SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
#endif
#if defined(HAVE_NEON)
} else if (neon_available_) {
// NEON detected.
SetDotProduct(DotProduct, &IntSimdMatrix::intSimdMatrixNEON);
#endif
}
}
Expand Down
6 changes: 6 additions & 0 deletions src/arch/simddetect.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ class SIMDDetect {
static inline bool IsSSEAvailable() {
return detector.sse_available_;
}
// Returns true if NEON is available on this system.
static inline bool IsNEONAvailable() {
return detector.neon_available_;
}

// Update settings after config variable was set.
static TESS_API void Update();
Expand All @@ -74,6 +78,8 @@ class SIMDDetect {
static TESS_API bool fma_available_;
// If true, then SSe4.1 has been detected.
static TESS_API bool sse_available_;
// If true, then NEON has been detected.
static TESS_API bool neon_available_;
};

} // namespace tesseract
Expand Down

0 comments on commit f79e52a

Please sign in to comment.