Skip to content

Commit 8573fc6

Browse files
committed
Finish opt version
1 parent 117e3d5 commit 8573fc6

File tree

1 file changed

+32
-45
lines changed

1 file changed

+32
-45
lines changed

src/opt.cc

Lines changed: 32 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
#include <cstdint>
77
#include <cstdio>
88
#include <cstdlib>
9-
#include <omp.h>
109
#include <random>
1110
#include "random.h"
1211

@@ -20,91 +19,79 @@
2019
using namespace std;
2120

2221
void opt_computation(
23-
const size_t num,
24-
const uint32_t k,
25-
const uint32_t c,
26-
const uint32_t d,
27-
const uint32_t e,
28-
const uint32_t *__restrict__ a,
29-
const uint32_t *__restrict__ b,
22+
size_t num,
23+
uint32_t k,
24+
uint32_t c,
25+
uint32_t d,
26+
uint32_t e,
27+
uint32_t *__restrict__ a,
28+
uint32_t *__restrict__ b,
3029
uint32_t *__restrict__ n,
3130
uint32_t *__restrict__ x,
3231
uint32_t *__restrict__ min,
3332
uint32_t *__restrict__ max,
3433
uint32_t *__restrict__ count
3534
) {
35+
n = (uint32_t *)__builtin_assume_aligned(n, 32);
3636
for (size_t j = 0; j < num; ++j)
3737
n[j] = (1 << n[j]) - 1;
3838

39-
const uint32_t *__restrict__ p_a;
40-
const uint32_t *__restrict__ p_b;
41-
const uint32_t *__restrict__ p_n;
42-
uint32_t *__restrict__ p_x;
43-
uint32_t *__restrict__ p_min;
44-
uint32_t *__restrict__ p_max;
45-
uint32_t *__restrict__ p_count;
39+
a = (uint32_t *)__builtin_assume_aligned(a, 32);
40+
b = (uint32_t *)__builtin_assume_aligned(b, 32);
41+
x = (uint32_t *)__builtin_assume_aligned(x, 32);
42+
min = (uint32_t *)__builtin_assume_aligned(min, 32);
43+
max = (uint32_t *)__builtin_assume_aligned(max, 32);
44+
count = (uint32_t *)__builtin_assume_aligned(count, 32);
4645

47-
uint32_t dist, shift;
46+
uint32_t dist;
4847
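/* loop tiling - main: process full tiles of BF elements, k iterations per tile.
   NOTE(review): with num a size_t, the bound `num - BF` presumably wraps around
   when num < BF; and when num is a multiple of BF the last full tile is not
   visited here, while the remainder loop only covers num % BF elements - confirm
   that callers avoid both cases. */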
49-
#pragma omp parallel for default(shared) num_threads(12) \
50-
private(dist, shift, p_a, p_b, p_x, p_n, p_min, p_max, p_count)
5148
for (size_t j1 = 0; j1 < num - BF; j1 += BF) {
52-
shift = BF * j1;
53-
p_a = a + shift;
54-
p_b = b + shift;
55-
p_x = x + shift;
56-
p_n = n + shift;
57-
p_min = min + shift;
58-
p_max = max + shift;
59-
p_count = count + shift;
60-
6149
for (size_t i = 0; i < k; ++i) {
6250
for (size_t j = 0; j < BF; ++j) {
6351
/* compute next value */
64-
p_x[j] = (p_a[j] * p_x[j] + p_b[j]) & p_n[j];
52+
x[j] = (a[j] * x[j] + b[j]) & n[j];
6553

6654
/* check if x is in interval */
67-
p_count[j] += (c <= p_x[j] && p_x[j] <= d) ? 1 : 0;
55+
count[j] += (c <= x[j] && x[j] <= d) ? 1 : 0;
6856

6957
/* compute hamming distance */
70-
dist = p_x[j] ^ e;
58+
dist = x[j] ^ e;
7159
dist = dist - ((dist >> 1) & 0x55555555);
7260
dist = (dist & 0x33333333) + ((dist >> 2) & 0x33333333);
7361
dist = (((dist + (dist >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
7462

7563
/* update minimal and maximal hamming distance */
76-
p_min[j] = (p_min[j] < dist) ? p_min[j] : dist;
77-
p_max[j] = (p_max[j] > dist) ? p_max[j] : dist;
64+
min[j] = (min[j] < dist) ? min[j] : dist;
65+
max[j] = (max[j] > dist) ? max[j] : dist;
7866
}
7967
}
68+
a += BF;
69+
b += BF;
70+
x += BF;
71+
n += BF;
72+
min += BF;
73+
max += BF;
74+
count += BF;
8075
}
8176

8277
/* loop tiling - remainder: the trailing num % BF elements */
83-
shift = BF * (num / BF);
84-
p_a = a + shift;
85-
p_b = b + shift;
86-
p_x = x + shift;
87-
p_n = n + shift;
88-
p_min = min + shift;
89-
p_max = max + shift;
90-
p_count = count + shift;
9178
for (size_t i = 0; i < k; ++i) {
9279
for (size_t j = 0; j < num % BF; ++j) {
9380
/* compute next value */
94-
p_x[j] = (p_a[j] * p_x[j] + p_b[j]) & p_n[j];
81+
x[j] = (a[j] * x[j] + b[j]) & n[j];
9582

9683
/* check if x is in interval */
97-
p_count[j] += (c <= p_x[j] && p_x[j] <= d) ? 1 : 0;
84+
count[j] += (c <= x[j] && x[j] <= d) ? 1 : 0;
9885

9986
/* compute hamming distance */
100-
dist = p_x[j] ^ e;
87+
dist = x[j] ^ e;
10188
dist = dist - ((dist >> 1) & 0x55555555);
10289
dist = (dist & 0x33333333) + ((dist >> 2) & 0x33333333);
10390
dist = (((dist + (dist >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
10491

10592
/* update minimal and maximal hamming distance */
106-
p_min[j] = (p_min[j] < dist) ? p_min[j] : dist;
107-
p_max[j] = (p_max[j] > dist) ? p_max[j] : dist;
93+
min[j] = (min[j] < dist) ? min[j] : dist;
94+
max[j] = (max[j] > dist) ? max[j] : dist;
10895
}
10996
}
11097
}

0 commit comments

Comments (0)