-
Notifications
You must be signed in to change notification settings - Fork 0
/
5_general_optimizations.c
74 lines (63 loc) · 2.13 KB
/
5_general_optimizations.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>
#define K 16
#define SCALE_FACTOR (1 << (K - 1))
int main()
{
// conversion to fixed-point notation
double real_input = 22;
register int32_t M = (int32_t)(real_input * SCALE_FACTOR);
// init to 0
register int32_t f = 0;
// defining the LUT arrays separately
// from python script
int32_t LUT_array1[4] = {32768, 19168, 10548, 5568};
int32_t LUT_array2[4] = {2865, 1454, 732, 367};
int32_t LUT_array3[4] = {184, 92, 46, 23};
int32_t LUT_array4[4] = {11, 5, 2, -1}; // -1 to represent we don't use this place of the array, but have the space (for alignment)
// load the LUT into NEON vectors
int32x4_t LUT_vec[4] = {
// v1
vld1q_s32(LUT_array1),
// v2
vld1q_s32(LUT_array2),
// v3
vld1q_s32(LUT_array3),
// v4 with dummy padding of 128/4 on the right side (32 bit)
vld1q_s32(LUT_array4)};
// normalization of M to range
int shifts = 0;
while (M >= SCALE_FACTOR)
{
// keep shifting till within range
M >>= 1;
// inc shift to make reversible later
shifts++;
}
// double loop
for (register int i = 0; !(i & 16); i += 2)
{
// NEON to LUT value #1 for unroll 1
int32_t LUT_val1 = vgetq_lane_s32(LUT_vec[i >> 2], i & 3);
// #1 unrolled iter
register int32_t u1 = M + (M >> i);
register lteSF1 = u1 <= SCALE_FACTOR;
M = lteSF1 ? u1 : M;
f = lteSF1 ? f - LUT_val1 : f;
// prepare for the next iteration within the current one #2 one
if (!((i + 2) & 16))
{
// NEON to LUT value #2 for unroll 2
int32_t LUT_val2 = vgetq_lane_s32(LUT_vec[(i + 1) >> 2], (i + 1) & 3);
register int32_t u2 = M + (M >> (i + 1));
register lteSF2 = u2 <= SCALE_FACTOR;
M = lteSF2 ? u2 : M;
f = lteSF2 ? f - LUT_val2 : f;
}
}
// denormalize the fixed-point value
f += shifts << 15; // K - 1 = 15
printf("optimized fp ccm log2(%f) = %f\n", 0.6, (double)f / SCALE_FACTOR);
return 0;
}