-
Notifications
You must be signed in to change notification settings - Fork 3.4k
simd_memcpy #4127
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
simd_memcpy #4127
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -599,7 +599,7 @@ def make_emulated_param(i): | |
if metadata['simdUint32x4']: | ||
simdinttypes += ['Uint32x4'] | ||
simdintfloatfuncs += ['fromUint32x4Bits'] | ||
if metadata['simdInt32x4']: | ||
if metadata['simdInt32x4'] or settings['SIMD']: # Always import Int32x4 when building with -s SIMD=1, since memcpy is SIMD optimized. | ||
simdinttypes += ['Int32x4'] | ||
simdintfloatfuncs += ['fromInt32x4Bits'] | ||
if metadata['simdFloat32x4']: | ||
|
@@ -626,7 +626,7 @@ def make_emulated_param(i): | |
fundamentals = ['Math'] | ||
fundamentals += ['Int8Array', 'Int16Array', 'Int32Array', 'Uint8Array', 'Uint16Array', 'Uint32Array', 'Float32Array', 'Float64Array'] | ||
fundamentals += ['NaN', 'Infinity'] | ||
if metadata['simd']: | ||
if metadata['simd'] or settings['SIMD']: # Always import SIMD when building with -s SIMD=1, since in that mode memcpy is SIMD optimized. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same question here - doesn't one of those imply the other? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The same applies here: simd=1 indicates whether the input .bc code contained any SIMD constructs at all, while SIMD=1 indicates whether the user wants to target SIMD in this build. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right, thanks, I forgot how those worked. |
||
fundamentals += ['SIMD'] | ||
if settings['ALLOW_MEMORY_GROWTH']: fundamentals.append('byteLength') | ||
math_envs = [] | ||
|
@@ -860,7 +860,7 @@ def math_fix(g): | |
return g if not g.startswith('Math_') else g.split('_')[1] | ||
asm_global_funcs = ''.join([' var ' + g.replace('.', '_') + '=global' + access_quote(g) + ';\n' for g in maths]) | ||
asm_global_funcs += ''.join([' var ' + g + '=env' + access_quote(math_fix(g)) + ';\n' for g in basic_funcs + global_funcs]) | ||
if metadata['simd']: | ||
if metadata['simd'] or settings['SIMD']: # Always import SIMD when building with -s SIMD=1, since in that mode memcpy is SIMD optimized. | ||
def string_contains_any(s, str_list): | ||
for sub in str_list: | ||
if sub in s: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
#include <memory.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <algorithm>
#include <iostream>
#include <vector>

#if defined(__APPLE__)
#include <mach/mach_time.h>
#elif !defined(WIN32) && !defined(_WIN32) && !defined(__EMSCRIPTEN__)
#include <sys/time.h>
#include <unistd.h>
#endif
|
||
#ifdef WIN32 | ||
#include <Windows.h> | ||
#define tick_t unsigned long long | ||
#define aligned_alloc(align, size) _aligned_malloc((size), (align)) | ||
#endif | ||
|
||
#ifdef __EMSCRIPTEN__ | ||
#include <emscripten/emscripten.h> | ||
#endif | ||
|
||
// Destination and source buffers for the benchmarked copies:
// 64 MiB each plus 16 extra bytes, zero-initialized.
char dst[64 * 1024 * 1024 + 16] = {};
char src[64 * 1024 * 1024 + 16] = {};
|
||
// Platform timing layer. Every branch provides the same three names:
//   tick_t          - type of a timestamp,
//   tick()          - returns the current timestamp,
//   ticks_per_sec() - how many tick() units make up one second.
#ifdef __EMSCRIPTEN__
// tick() maps to emscripten_get_now(); 1000 ticks per second means the
// values are treated as milliseconds.
#define tick emscripten_get_now
#define tick_t double
tick_t ticks_per_sec() { return 1000.0; }
#elif defined(__APPLE__)
#define tick_t unsigned long long
#define tick mach_absolute_time
tick_t ticks_per_sec()
{
  mach_timebase_info_data_t timeBaseInfo;
  mach_timebase_info(&timeBaseInfo);
  // ticks * numer / denom yields nanoseconds, hence ticks per second is
  // 1e9 * denom / numer.
  return 1000000000ULL * (uint64_t)timeBaseInfo.denom / (uint64_t)timeBaseInfo.numer;
}
#elif defined(_POSIX_MONOTONIC_CLOCK)
// tick_t was previously left undefined on this path, which broke native
// POSIX builds. The guard keeps an earlier definition if one exists.
#ifndef tick_t
#define tick_t unsigned long long
#endif
// Monotonic clock reading expressed in nanoseconds.
inline tick_t tick()
{
  timespec t;
  clock_gettime(CLOCK_MONOTONIC, &t);
  return (tick_t)t.tv_sec * 1000 * 1000 * 1000 + (tick_t)t.tv_nsec;
}
tick_t ticks_per_sec()
{
  return 1000 * 1000 * 1000;
}
#elif defined(_POSIX_C_SOURCE)
#ifndef tick_t
#define tick_t unsigned long long
#endif
// Wall-clock reading expressed in microseconds.
inline tick_t tick()
{
  timeval t;
  gettimeofday(&t, NULL);
  return (tick_t)t.tv_sec * 1000 * 1000 + (tick_t)t.tv_usec;
}
tick_t ticks_per_sec()
{
  return 1000 * 1000;
}
#elif defined(WIN32)
#ifndef tick_t
#define tick_t unsigned long long
#endif
// Raw performance-counter reading; the unit is reported by ticks_per_sec().
inline tick_t tick()
{
  LARGE_INTEGER ddwTimer;
  QueryPerformanceCounter(&ddwTimer);
  return ddwTimer.QuadPart;
}
tick_t ticks_per_sec()
{
  LARGE_INTEGER ddwTimerFrequency;
  QueryPerformanceFrequency(&ddwTimerFrequency);
  return ddwTimerFrequency.QuadPart;
}
#else
#error No tick_t
#endif
|
||
uint8_t resultCheckSum = 0; | ||
|
||
void __attribute__((noinline)) test_memcpy(int numTimes, int copySize) | ||
{ | ||
for(int i = 0; i < numTimes - 8; i += 8) | ||
{ | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
} | ||
numTimes &= 15; | ||
for(int i = 0; i < numTimes; ++i) | ||
{ | ||
memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1]; | ||
} | ||
} | ||
|
||
std::vector<int> copySizes; | ||
std::vector<double> results; | ||
|
||
std::vector<int> testCases; | ||
|
||
double totalTimeSecs = 0.0; | ||
|
||
void test_case(int copySize) | ||
{ | ||
const int minimumCopyBytes = 1024*1024*64; | ||
|
||
int numTimes = (minimumCopyBytes + copySize-1) / copySize; | ||
if (numTimes < 8) numTimes = 8; | ||
|
||
tick_t bestResult = 1e9; | ||
|
||
#ifndef NUM_TRIALS | ||
#define NUM_TRIALS 5 | ||
#endif | ||
|
||
for(int i = 0; i < NUM_TRIALS; ++i) | ||
{ | ||
double t0 = tick(); | ||
test_memcpy(numTimes, copySize); | ||
double t1 = tick(); | ||
if (t1 - t0 < bestResult) bestResult = t1 - t0; | ||
totalTimeSecs += (double)(t1 - t0) / ticks_per_sec(); | ||
} | ||
unsigned long long totalBytesTransferred = numTimes * copySize; | ||
|
||
copySizes.push_back(copySize); | ||
|
||
tick_t ticksElapsed = bestResult; | ||
if (ticksElapsed > 0) | ||
{ | ||
double seconds = (double)ticksElapsed / ticks_per_sec(); | ||
double bytesPerSecond = totalBytesTransferred / seconds; | ||
double mbytesPerSecond = bytesPerSecond / (1024.0*1024.0); | ||
results.push_back(mbytesPerSecond); | ||
} | ||
else | ||
{ | ||
results.push_back(0.0); | ||
} | ||
} | ||
|
||
void print_results() | ||
{ | ||
std::cout << "Test cases: " << std::endl; | ||
for(size_t i = 0; i < copySizes.size(); ++i) | ||
{ | ||
std::cout << copySizes[i]; | ||
if (i != copySizes.size()-1) std::cout << ","; | ||
else std::cout << std::endl; | ||
if (i % 10 == 9) std::cout << std::endl; | ||
} | ||
std::cout << std::endl; | ||
std::cout << std::endl; | ||
std::cout << std::endl; | ||
std::cout << "Test results: " << std::endl; | ||
for(size_t i = 0; i < results.size(); ++i) | ||
{ | ||
std::cout << results[i]; | ||
if (i != results.size()-1) std::cout << ","; | ||
else std::cout << std::endl; | ||
if (i % 10 == 9) std::cout << std::endl; | ||
} | ||
|
||
std::cout << "Result checksum: " << (int)resultCheckSum << std::endl; | ||
std::cout << "Total time: " << totalTimeSecs << std::endl; | ||
} | ||
|
||
int numDone = 0; | ||
|
||
void run_one() | ||
{ | ||
std::cout << (numDone+1) << "/" << (numDone+testCases.size()) << std::endl; | ||
++numDone; | ||
|
||
int copySize = testCases.front(); | ||
testCases.erase(testCases.begin()); | ||
test_case(copySize); | ||
} | ||
|
||
#ifdef __EMSCRIPTEN__
// Main-loop callback for Emscripten builds: measures one pending test case
// per invocation. Once none are left it cancels the main loop and prints
// the results.
void main_loop()
{
  if (testCases.empty())
  {
    emscripten_cancel_main_loop();
    print_results();
    return;
  }

  run_one();
}
#endif
|
||
#ifndef MAX_COPY | ||
#define MAX_COPY 32*1024*1024 | ||
#endif | ||
|
||
#ifndef MIN_COPY | ||
#define MIN_COPY 1 | ||
#endif | ||
|
||
int main() | ||
{ | ||
for(int copySizeI = MIN_COPY; copySizeI < MAX_COPY; copySizeI <<= 1) | ||
for(int copySizeJ = 1; copySizeJ <= copySizeI; copySizeJ <<= 1) | ||
{ | ||
testCases.push_back(copySizeI | copySizeJ); | ||
} | ||
|
||
std::sort(testCases.begin(), testCases.end()); | ||
#if defined(__EMSCRIPTEN__) && !defined(BUILD_FOR_SHELL) | ||
emscripten_set_main_loop(main_loop, 0, 0); | ||
#else | ||
while(!testCases.empty()) run_one(); | ||
print_results(); | ||
#endif | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can simdInt32x4 be false but SIMD be true? (if not, then the if could be simplified to just check for SIMD?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it can. The application itself may not use the Int32x4 SIMD type at all, in which case SIMD=1 and simdInt32x4=0. The reverse is also possible (simdInt32x4=1 and SIMD=0), because simdInt32x4 is set to 1 whenever the compiled .bc code is detected to contain SIMD constructs (the .bc could have been compiled earlier with different flags), and that is orthogonal to whether the user passed -s SIMD=1 on the command line.