Skip to content

Commit

Permalink
Get more consistent distributions in parallel scenarios test (#8451)
Browse files Browse the repository at this point in the history
* Get more consistent distributions in parallel scenarios test

* Fix decile reporting
  • Loading branch information
abadams authored Nov 21, 2024
1 parent f566881 commit a1d4d19
Showing 1 changed file with 41 additions and 26 deletions.
67 changes: 41 additions & 26 deletions test/performance/parallel_scenarios.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,32 +31,36 @@ int main(int argc, char **argv) {

int native_threads = Halide::Internal::JITSharedRuntime::get_num_threads();

std::map<std::tuple<bool, bool, int, int>, std::vector<float>> results;

auto bench = [&](bool m, bool c, int i, int o) {
const int num_samples = 128;
const int memory_limit = m ? max_memory : 128;

auto now = std::chrono::high_resolution_clock::now;
auto to_ns = [](auto delta) { return 1e9 * std::chrono::duration<float>(delta).count(); };

auto bench_one = [&]() {
auto t1 = std::chrono::high_resolution_clock::now();
// Ignore error code because default halide_error() will abort on failure
auto t1 = now();
(void)callable(i, o, memory_limit, in, out);
auto t2 = std::chrono::high_resolution_clock::now();
return 1e9 * std::chrono::duration<float>(t2 - t1).count() / (i * o);
auto t2 = now();
return to_ns(t2 - t1) / (i * o);
};

std::vector<float> times(num_samples);
const int num_tasks = 8;
const int min_samples = 32;

std::vector<float> times[num_tasks];
if (c) {
Halide::Tools::ThreadPool<void> thread_pool;
const int num_tasks = 8;
const int samples_per_task = num_samples / num_tasks;
Halide::Internal::JITSharedRuntime::set_num_threads(num_tasks * native_threads);
std::vector<std::future<void>> futures(num_tasks);
for (size_t t = 0; t < futures.size(); t++) {
futures[t] = thread_pool.async(
[&](size_t t) {
bench_one();
for (int s = 0; s < samples_per_task; s++) {
size_t idx = t * samples_per_task + s;
times[idx] = bench_one();
auto t_start = now();
while (to_ns(now() - t_start) < 1e7 || times[t].size() < min_samples / num_tasks) {
times[t].push_back(bench_one());
}
},
t);
Expand All @@ -67,32 +71,43 @@ int main(int argc, char **argv) {
} else {
Halide::Internal::JITSharedRuntime::set_num_threads(native_threads);
bench_one();
for (int s = 0; s < num_samples; s++) {
times[s] = bench_one();
auto t_start = now();
while (to_ns(now() - t_start) < 1e7 || times[0].size() < min_samples) {
times[0].push_back(bench_one());
}
}
std::sort(times.begin(), times.end());
printf("%d %d %d %d ", m, c, i, o);
const int n = 8;
int off = (num_samples / n) / 2;
for (int i = 0; i < n; i++) {
printf("%g ", times[off + (num_samples * i) / n]);

std::vector<float> &r = results[{m, c, i, o}];
for (int i = 0; i < num_tasks; i++) {
r.insert(r.end(), times[i].begin(), times[i].end());
}
printf("\n");
};

// The output is designed to be copy-pasted into a spreadsheet, not read by a human
printf("memory_bound contended inner outer t0 t1 t2 t3 t4 t5 t7\n");
for (bool contended : {false, true}) {
for (bool memory_bound : {false, true}) {
for (int i : {1 << 0, 1 << 6, 1 << 12, 1 << 18}) {
for (int o : {1, 2, 4, 8, 16, 32, 64, 128, 256}) {
bench(memory_bound, contended, i, o);
printf("memory_bound contended inner outer num_samples 10%% 20%% 30%% 40%% 50%% 60%% 70%% 80%% 90%%\n");
for (int repeat = 0; repeat < 10; repeat++) {
for (bool contended : {false, true}) {
for (bool memory_bound : {false, true}) {
for (int i : {1 << 6, 1 << 9, 1 << 12, 1 << 15}) {
for (int o : {1, 2, 4, 8, 16, 32, 64, 128, 256}) {
bench(memory_bound, contended, i, o);
}
}
}
}
}

// Report phase: for every (memory_bound, contended, inner, outer) scenario
// accumulated in `results`, sort the collected samples and print one row
// holding the scenario key, the number of samples, and the 10th through
// 90th percentile values. Entries are visited by reference so each sample
// vector is sorted in place instead of being copied out of the map first.
for (auto &[scenario, samples] : results) {
    if (samples.empty()) {
        // Nothing to index into; skip rather than read out of bounds.
        continue;
    }
    std::sort(samples.begin(), samples.end());
    const auto &[m, c, i, o] = scenario;
    printf("%d %d %d %d %d ", m, c, i, o, (int)samples.size());
    for (int decile = 10; decile <= 90; decile += 10) {
        // decile <= 90, so this index is always strictly less than size().
        printf("%g ", samples[(decile * samples.size()) / 100]);
    }
    printf("\n");
}

printf("Success!\n");

return 0;
Expand Down

0 comments on commit a1d4d19

Please sign in to comment.