Skip to content

Commit a7cdef6

Browse files
authored
Speed up SSE4.1 test by splitting individual unrolled blocks into their own functions. (#24401)
Before:
```
test_sse4_1 (test_core.core_2gb) ... ok (5.89s)
test_sse4_1 (test_core.lsan) ... ok (5.94s)
test_sse4_1 (test_core.minimal0) ... ok (5.98s)
test_sse4_1 (test_core.strict_js) ... ok (5.98s)
test_sse4_1 (test_core.strict) ... ok (6.13s)
test_sse4_1 (test_core.bigint) ... ok (6.13s)
test_sse4_1 (test_core.core0) ... ok (6.17s)
test_sse4_1 (test_core.core1) ... ok (6.36s)
test_sse4_1 (test_core.instance) ... ok (6.41s)
test_sse4_1 (test_core.asan) ... ok (9.62s)
test_sse4_1 (test_core.wasmfs) ... ok (10.01s)
test_sse4_1 (test_core.core2) ... ok (10.12s)
test_sse4_1 (test_core.corez) ... ok (11.15s)
test_sse4_1 (test_core.cores) ... ok (37.72s)
test_sse4_1 (test_core.core3) ... ok (140.27s)
Total core time: 273.854s. Wallclock time: 140.702s. Parallelization: 1.95x.
```
After:
```
test_sse4_1 (test_core.strict) ... ok (7.11s)
test_sse4_1 (test_core.strict_js) ... ok (7.16s)
test_sse4_1 (test_core.bigint) ... ok (7.18s)
test_sse4_1 (test_core.minimal0) ... ok (7.44s)
test_sse4_1 (test_core.core_2gb) ... ok (7.46s)
test_sse4_1 (test_core.core0) ... ok (7.49s)
test_sse4_1 (test_core.instance) ... ok (7.53s)
test_sse4_1 (test_core.lsan) ... ok (7.64s)
test_sse4_1 (test_core.core1) ... ok (8.15s)
test_sse4_1 (test_core.asan) ... ok (9.63s)
test_sse4_1 (test_core.wasmfs) ... ok (10.54s)
test_sse4_1 (test_core.core2) ... ok (10.44s)
test_sse4_1 (test_core.corez) ... ok (11.38s)
test_sse4_1 (test_core.cores) ... ok (11.69s)
test_sse4_1 (test_core.core3) ... ok (50.80s)
Total core time: 171.622s. Wallclock time: 51.223s. Parallelization: 3.35x.
```
1 parent d0f1ed6 commit a7cdef6

File tree

2 files changed

+24
-20
lines changed

2 files changed

+24
-20
lines changed

test/sse/test_sse.h

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
504504
printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \
505505
}
506506

507-
#define Ret_M128_Tint_body(Ret_type, func, Tint) \
507+
#define Ret_M128_Tint_body(Ret_type, func, Tint) [](){ \
508508
for(int i = 0; i < numInterestingFloats / 4; ++i) \
509509
for(int k = 0; k < 4; ++k) \
510510
{ \
@@ -513,9 +513,10 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
513513
char str[256]; tostr(&m1, str); \
514514
char str2[256]; tostr(&ret, str2); \
515515
printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \
516-
}
516+
} \
517+
}();
517518

518-
#define Ret_M128d_Tint_body(Ret_type, func, Tint) \
519+
#define Ret_M128d_Tint_body(Ret_type, func, Tint) [](){ \
519520
for(int i = 0; i < numInterestingDoubles / 2; ++i) \
520521
for(int k = 0; k < 2; ++k) \
521522
{ \
@@ -524,9 +525,10 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
524525
char str[256]; tostr(&m1, str); \
525526
char str2[256]; tostr(&ret, str2); \
526527
printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \
527-
}
528+
} \
529+
}();
528530

529-
#define Ret_M128i_Tint_body(Ret_type, func, Tint) \
531+
#define Ret_M128i_Tint_body(Ret_type, func, Tint) [](){ \
530532
for(int i = 0; i < numInterestingInts / 4; ++i) \
531533
for(int k = 0; k < 4; ++k) \
532534
{ \
@@ -535,9 +537,10 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
535537
char str[256]; tostr(&m1, str); \
536538
char str2[256]; tostr(&ret, str2); \
537539
printf("%s(%s, %d) = %s\n", #func, str, Tint, str2); \
538-
}
540+
} \
541+
}();
539542

540-
#define Ret_M128i_int_Tint_body(Ret_type, func, Tint) \
543+
#define Ret_M128i_int_Tint_body(Ret_type, func, Tint) [](){ \
541544
for(int i = 0; i < numInterestingInts / 4; ++i) \
542545
for(int j = 0; j < numInterestingInts; ++j) \
543546
for(int k = 0; k < 4; ++k) \
@@ -547,9 +550,10 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
547550
char str[256]; tostr(&m1, str); \
548551
char str2[256]; tostr(&ret, str2); \
549552
printf("%s(%s, 0x%08X, %d) = %s\n", #func, str, interesting_ints[j], Tint, str2); \
550-
}
553+
} \
554+
}();
551555

552-
#define Ret_M128d_M128d_Tint_body(Ret_type, func, Tint) \
556+
#define Ret_M128d_M128d_Tint_body(Ret_type, func, Tint) [](){ \
553557
for(int i = 0; i < numInterestingDoubles / 2; ++i) \
554558
for(int k = 0; k < 2; ++k) \
555559
for(int j = 0; j < numInterestingDoubles / 2; ++j) \
@@ -568,9 +572,10 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
568572
tostr(&m2, str2); \
569573
tostr(&ret, str3); \
570574
printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \
571-
}
575+
} \
576+
}();
572577

573-
#define Ret_M128i_M128i_Tint_body(Ret_type, func, Tint) \
578+
#define Ret_M128i_M128i_Tint_body(Ret_type, func, Tint) [](){ \
574579
for(int i = 0; i < numInterestingInts / 4; ++i) \
575580
for(int k = 0; k < 4; ++k) \
576581
for(int j = 0; j < numInterestingInts / 4; ++j) \
@@ -589,9 +594,10 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
589594
tostr(&m2, str2); \
590595
tostr(&ret, str3); \
591596
printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \
592-
}
597+
} \
598+
}();
593599

594-
#define Ret_M128_M128_Tint_body(Ret_type, func, Tint) \
600+
#define Ret_M128_M128_Tint_body(Ret_type, func, Tint) [](){ \
595601
for(int i = 0; i < numInterestingFloats / 4; ++i) \
596602
for(int k = 0; k < 4; ++k) \
597603
for(int j = 0; j < numInterestingFloats / 4; ++j) \
@@ -610,7 +616,8 @@ __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime) {
610616
tostr(&m2, str2); \
611617
tostr(&ret, str3); \
612618
printf("%s(%s, %s, %d) = %s\n", #func, str, str2, Tint, str3); \
613-
}
619+
} \
620+
}();
614621

615622
#define const_int8_unroll(Ret_type, F, func) \
616623
F(Ret_type, func, 0b00000000); \

test/test_core.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6626,14 +6626,11 @@ def test_sse4_1(self):
66266626
if self.is_wasm64():
66276627
self.require_node_canary()
66286628
src = test_file('sse/test_sse4_1.cpp')
6629-
if not self.is_optimizing() and '-fsanitize=address' in self.emcc_args:
6630-
# ASan with -O0 fails with:
6631-
# Compiling function #69:"__original_main" failed: local count too large
6632-
self.emcc_args.append('-O1')
6633-
self.run_process([shared.CLANG_CXX, src, '-msse4.1', '-Wno-argument-outside-range', '-o', 'test_sse4_1', '-D_CRT_SECURE_NO_WARNINGS=1'] + clang_native.get_clang_native_args(), stdout=PIPE)
6629+
# Run with inlining disabled to avoid slow LLVM behavior with lots of macro expanded loops inside a function body.
6630+
self.run_process([shared.CLANG_CXX, src, '-msse4.1', '-fno-inline-functions', '-Wno-argument-outside-range', '-o', 'test_sse4_1', '-D_CRT_SECURE_NO_WARNINGS=1'] + clang_native.get_clang_native_args(), stdout=PIPE)
66346631
native_result = self.run_process('./test_sse4_1', stdout=PIPE).stdout
66356632

6636-
self.emcc_args += ['-I' + test_file('sse'), '-msse4.1', '-Wno-argument-outside-range', '-sSTACK_SIZE=1MB']
6633+
self.emcc_args += ['-I' + test_file('sse'), '-msse4.1', '-fno-inline-functions', '-Wno-argument-outside-range', '-sSTACK_SIZE=1MB']
66376634
self.maybe_closure()
66386635
self.do_runf(src, native_result)
66396636

0 commit comments

Comments
 (0)