Skip to content

simd_memcpy #4127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions emscripten.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ def make_emulated_param(i):
if metadata['simdUint32x4']:
simdinttypes += ['Uint32x4']
simdintfloatfuncs += ['fromUint32x4Bits']
if metadata['simdInt32x4']:
if metadata['simdInt32x4'] or settings['SIMD']: # Always import Int32x4 when building with -s SIMD=1, since memcpy is SIMD optimized.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can simdInt32x4 be false but SIMD be true? (if not, then the if could be simplified to just check for SIMD?)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it can. It is possible that the application itself does not use Int32x4 SIMD type at all so SIMD=1 and simdInt32x4=0. Also it is possible to be the other way around, simdInt32x4=1 and SIMD=0, because simdInt32x4 is set to one when compiled .bc code is detected to contain SIMD constructs (the .bc could have been compiled earlier with different flags), but that is orthogonal to what the user set on the command line for -s SIMD=1.

simdinttypes += ['Int32x4']
simdintfloatfuncs += ['fromInt32x4Bits']
if metadata['simdFloat32x4']:
Expand All @@ -626,7 +626,7 @@ def make_emulated_param(i):
fundamentals = ['Math']
fundamentals += ['Int8Array', 'Int16Array', 'Int32Array', 'Uint8Array', 'Uint16Array', 'Uint32Array', 'Float32Array', 'Float64Array']
fundamentals += ['NaN', 'Infinity']
if metadata['simd']:
if metadata['simd'] or settings['SIMD']: # Always import SIMD when building with -s SIMD=1, since in that mode memcpy is SIMD optimized.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same question here - doesn't one of those imply the other?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the same, simd=1 refers to whether the input .bc code contained SIMD constructs in general or not, and SIMD=1 means whether user wants to target SIMD now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, thanks, I forgot how those worked.

fundamentals += ['SIMD']
if settings['ALLOW_MEMORY_GROWTH']: fundamentals.append('byteLength')
math_envs = []
Expand Down Expand Up @@ -860,7 +860,7 @@ def math_fix(g):
return g if not g.startswith('Math_') else g.split('_')[1]
asm_global_funcs = ''.join([' var ' + g.replace('.', '_') + '=global' + access_quote(g) + ';\n' for g in maths])
asm_global_funcs += ''.join([' var ' + g + '=env' + access_quote(math_fix(g)) + ';\n' for g in basic_funcs + global_funcs])
if metadata['simd']:
if metadata['simd'] or settings['SIMD']: # Always import SIMD when building with -s SIMD=1, since in that mode memcpy is SIMD optimized.
def string_contains_any(s, str_list):
for sub in str_list:
if sub in s:
Expand Down
64 changes: 59 additions & 5 deletions src/library.js
Original file line number Diff line number Diff line change
Expand Up @@ -804,28 +804,82 @@ LibraryManager.library = {
memcpy: function(dest, src, num) {
dest = dest|0; src = src|0; num = num|0;
var ret = 0;
if ((num|0) >= 4096) return _emscripten_memcpy_big(dest|0, src|0, num|0)|0;
var aligned_dest_end = 0;
var block_aligned_dest_end = 0;
var dest_end = 0;
// Test against a benchmarked cutoff limit for when HEAPU8.set() becomes faster to use.
if ((num|0) >=
#if SIMD
196608
#else
8192
#endif
) {
return _emscripten_memcpy_big(dest|0, src|0, num|0)|0;
}

ret = dest|0;
dest_end = (dest + num)|0;
if ((dest&3) == (src&3)) {
// The initial unaligned < 4-byte front.
while (dest & 3) {
if ((num|0) == 0) return ret|0;
{{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i8'), 'i8') }}};
dest = (dest+1)|0;
src = (src+1)|0;
num = (num-1)|0;
}
while ((num|0) >= 4) {
aligned_dest_end = (dest_end & -4)|0;
block_aligned_dest_end = (aligned_dest_end - 64)|0;
while ((dest|0) <= (block_aligned_dest_end|0) ) {
#if SIMD
SIMD_Int32x4_store(HEAPU8, dest, SIMD_Int32x4_load(HEAPU8, src));
SIMD_Int32x4_store(HEAPU8, dest+16, SIMD_Int32x4_load(HEAPU8, src+16));
SIMD_Int32x4_store(HEAPU8, dest+32, SIMD_Int32x4_load(HEAPU8, src+32));
SIMD_Int32x4_store(HEAPU8, dest+48, SIMD_Int32x4_load(HEAPU8, src+48));
#else
{{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 4, makeGetValueAsm('src', 4, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 8, makeGetValueAsm('src', 8, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 12, makeGetValueAsm('src', 12, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 16, makeGetValueAsm('src', 16, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 20, makeGetValueAsm('src', 20, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 24, makeGetValueAsm('src', 24, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 28, makeGetValueAsm('src', 28, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 32, makeGetValueAsm('src', 32, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 36, makeGetValueAsm('src', 36, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 40, makeGetValueAsm('src', 40, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 44, makeGetValueAsm('src', 44, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 48, makeGetValueAsm('src', 48, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 52, makeGetValueAsm('src', 52, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 56, makeGetValueAsm('src', 56, 'i32'), 'i32') }}};
{{{ makeSetValueAsm('dest', 60, makeGetValueAsm('src', 60, 'i32'), 'i32') }}};
#endif
dest = (dest+64)|0;
src = (src+64)|0;
}
while ((dest|0) < (aligned_dest_end|0) ) {
{{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i32'), 'i32') }}};
dest = (dest+4)|0;
src = (src+4)|0;
}
} else {
// In the unaligned copy case, unroll a bit as well.
aligned_dest_end = (dest_end - 4)|0;
while ((dest|0) < (aligned_dest_end|0) ) {
{{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i8'), 'i8') }}};
{{{ makeSetValueAsm('dest', 1, makeGetValueAsm('src', 1, 'i8'), 'i8') }}};
{{{ makeSetValueAsm('dest', 2, makeGetValueAsm('src', 2, 'i8'), 'i8') }}};
{{{ makeSetValueAsm('dest', 3, makeGetValueAsm('src', 3, 'i8'), 'i8') }}};
dest = (dest+4)|0;
src = (src+4)|0;
num = (num-4)|0;
}
}
while ((num|0) > 0) {
// The remaining unaligned < 4 byte tail.
while ((dest|0) < (dest_end|0)) {
{{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i8'), 'i8') }}};
dest = (dest+1)|0;
src = (src+1)|0;
num = (num-1)|0;
}
return ret|0;
},
Expand Down
218 changes: 218 additions & 0 deletions tests/benchmark_memcpy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
#include <memory.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
#include <algorithm>

#ifdef WIN32
#include <Windows.h>
#define tick_t unsigned long long
#define aligned_alloc(align, size) _aligned_malloc((size), (align))
#endif

#ifdef __EMSCRIPTEN__
#include <emscripten/emscripten.h>
#endif

// Destination and source buffers used by every benchmarked memcpy() call:
// 64 MiB each, plus 16 spare bytes.
char dst[1024*1024*64+16] = {};
char src[1024*1024*64+16] = {};

// Platform timer abstraction. Every branch below provides the same three names:
//   tick_t          - type of a timestamp
//   tick()          - current timestamp, in platform-specific ticks
//   ticks_per_sec() - how many of those ticks make up one second
#if !defined(__EMSCRIPTEN__) && !defined(WIN32)
// The _POSIX_MONOTONIC_CLOCK / _POSIX_C_SOURCE feature macros tested below are
// only visible once a POSIX header has been included.
#include <unistd.h>
#endif

#ifdef __EMSCRIPTEN__
// ticks_per_sec() == 1000, i.e. emscripten_get_now() is treated as a millisecond clock.
#define tick emscripten_get_now
#define tick_t double
tick_t ticks_per_sec() { return 1000.0; }
#elif defined(__APPLE__)
#include <mach/mach_time.h>
#define tick_t unsigned long long
#define tick mach_absolute_time
tick_t ticks_per_sec()
{
  // mach_absolute_time() counts in units of numer/denom nanoseconds.
  mach_timebase_info_data_t timeBaseInfo;
  mach_timebase_info(&timeBaseInfo);
  return 1000000000ULL * (tick_t)timeBaseInfo.denom / (tick_t)timeBaseInfo.numer;
}
#elif defined(_POSIX_MONOTONIC_CLOCK)
#include <time.h>
#ifndef tick_t
#define tick_t unsigned long long
#endif
// Monotonic clock, reported in nanoseconds.
inline tick_t tick()
{
  timespec t;
  clock_gettime(CLOCK_MONOTONIC, &t);
  return (tick_t)t.tv_sec * 1000 * 1000 * 1000 + (tick_t)t.tv_nsec;
}
tick_t ticks_per_sec()
{
  return 1000 * 1000 * 1000;
}
#elif defined(_POSIX_C_SOURCE)
#include <sys/time.h>
#ifndef tick_t
#define tick_t unsigned long long
#endif
// Wall-clock fallback, reported in microseconds.
inline tick_t tick()
{
  timeval t;
  gettimeofday(&t, NULL);
  return (tick_t)t.tv_sec * 1000 * 1000 + (tick_t)t.tv_usec;
}
tick_t ticks_per_sec()
{
  return 1000 * 1000;
}
#elif defined(WIN32)
// tick_t is already defined next to the <Windows.h> include at the top of the file.
inline tick_t tick()
{
  LARGE_INTEGER ddwTimer;
  QueryPerformanceCounter(&ddwTimer);
  return ddwTimer.QuadPart;
}
tick_t ticks_per_sec()
{
  LARGE_INTEGER ddwTimerFrequency;
  QueryPerformanceFrequency(&ddwTimerFrequency);
  return ddwTimerFrequency.QuadPart;
}
#else
#error No tick_t
#endif

// Running checksum over bytes read back from dst after each copy. It is printed
// by print_results(); presumably it exists so the copies have an observable
// effect and cannot be optimized away.
uint8_t resultCheckSum = 0;

// Copies copySize bytes from src to dst exactly numTimes times.
//
// The main loop is manually unrolled by 8; the second loop handles the
// remaining 0..7 iterations. Both loops share one counter, so the total number
// of copies is always numTimes (nothing is done when numTimes <= 0).
void __attribute__((noinline)) test_memcpy(int numTimes, int copySize)
{
  int i = 0;
  // "numTimes - i >= 8" rather than "i + 8 <= numTimes" to avoid overflowing i.
  for(; numTimes - i >= 8; i += 8)
  {
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
  }
  // Remainder that does not fill a whole unrolled group.
  for(; i < numTimes; ++i)
  {
    memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
  }
}

// Copy sizes that have been benchmarked, in the order test_case() ran them.
std::vector<int> copySizes;
// Best measured throughput in MiB/s for each entry of copySizes
// (0.0 when no time elapsed during the best trial).
std::vector<double> results;

// Copy sizes still waiting to be benchmarked; filled by main() and drained by run_one().
std::vector<int> testCases;

// Seconds spent inside the timed region, summed over every trial of every test case.
double totalTimeSecs = 0.0;

void test_case(int copySize)
{
const int minimumCopyBytes = 1024*1024*64;

int numTimes = (minimumCopyBytes + copySize-1) / copySize;
if (numTimes < 8) numTimes = 8;

tick_t bestResult = 1e9;

#ifndef NUM_TRIALS
#define NUM_TRIALS 5
#endif

for(int i = 0; i < NUM_TRIALS; ++i)
{
double t0 = tick();
test_memcpy(numTimes, copySize);
double t1 = tick();
if (t1 - t0 < bestResult) bestResult = t1 - t0;
totalTimeSecs += (double)(t1 - t0) / ticks_per_sec();
}
unsigned long long totalBytesTransferred = numTimes * copySize;

copySizes.push_back(copySize);

tick_t ticksElapsed = bestResult;
if (ticksElapsed > 0)
{
double seconds = (double)ticksElapsed / ticks_per_sec();
double bytesPerSecond = totalBytesTransferred / seconds;
double mbytesPerSecond = bytesPerSecond / (1024.0*1024.0);
results.push_back(mbytesPerSecond);
}
else
{
results.push_back(0.0);
}
}

// Writes the elements of `values` to stdout separated by commas. The last
// element is followed by a newline instead of a comma, and an extra newline is
// emitted after every tenth element.
template<typename T>
static void print_comma_separated(const std::vector<T> &values)
{
  const size_t count = values.size();
  for(size_t idx = 0; idx < count; ++idx)
  {
    std::cout << values[idx];
    if (idx + 1 == count) std::cout << std::endl;
    else std::cout << ",";
    if (idx % 10 == 9) std::cout << std::endl;
  }
}

// Prints the benchmarked copy sizes, the matching throughput figures, the
// result checksum and the accumulated total time.
void print_results()
{
  std::cout << "Test cases: " << std::endl;
  print_comma_separated(copySizes);

  // Three blank separators between the two tables.
  std::cout << std::endl << std::endl << std::endl;

  std::cout << "Test results: " << std::endl;
  print_comma_separated(results);

  std::cout << "Result checksum: " << (int)resultCheckSum << std::endl;
  std::cout << "Total time: " << totalTimeSecs << std::endl;
}

// Number of test cases that have been started so far.
int numDone = 0;

// Prints a "current/total" progress line, then removes the first pending copy
// size from testCases and benchmarks it. Callers must ensure testCases is not empty.
void run_one()
{
  // Started + still pending == total number of test cases.
  const size_t totalCases = numDone + testCases.size();
  ++numDone;
  std::cout << numDone << "/" << totalCases << std::endl;

  const int nextSize = testCases[0];
  testCases.erase(testCases.begin());
  test_case(nextSize);
}

#ifdef __EMSCRIPTEN__
// Main-loop callback for Emscripten builds: benchmarks one pending test case
// per invocation. Once none remain, cancels the main loop and prints the results.
void main_loop()
{
  if (testCases.empty())
  {
    emscripten_cancel_main_loop();
    print_results();
    return;
  }
  run_one();
}
#endif

// Default bounds for the generated copy sizes; both can be overridden with -D
// on the compiler command line. main() enumerates base sizes starting at
// MIN_COPY and doubling while they stay below MAX_COPY.
#ifndef MAX_COPY
// Parenthesized so the expansion stays a single operand in any expression.
#define MAX_COPY (32*1024*1024)
#endif

#ifndef MIN_COPY
#define MIN_COPY 1
#endif

// Builds the sorted list of copy sizes to benchmark and runs them: through the
// Emscripten main loop when targeting Emscripten without BUILD_FOR_SHELL,
// otherwise synchronously followed by printing the results.
int main()
{
  // For every power-of-two step `base` in [MIN_COPY, MAX_COPY), add base OR-ed
  // with each power of two up to and including base itself.
  int base = MIN_COPY;
  while (base < MAX_COPY)
  {
    int bit = 1;
    while (bit <= base)
    {
      testCases.push_back(base | bit);
      bit <<= 1;
    }
    base <<= 1;
  }

  std::sort(testCases.begin(), testCases.end());
#if defined(__EMSCRIPTEN__) && !defined(BUILD_FOR_SHELL)
  emscripten_set_main_loop(main_loop, 0, 0);
#else
  while (!testCases.empty())
  {
    run_one();
  }
  print_results();
#endif
}
28 changes: 27 additions & 1 deletion tests/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def process(filename):
output = Popen([PYTHON, EMCC, filename, #'-O3',
'-O3', '-s', 'DOUBLE_MODE=0', '-s', 'PRECISE_I64_MATH=0',
'--memory-init-file', '0', '--js-transform', 'python hardcode.py',
'-s', 'TOTAL_MEMORY=128*1024*1024',
'-s', 'TOTAL_MEMORY=256*1024*1024',
'-s', 'NO_EXIT_RUNTIME=1',
'-s', 'BENCHMARK=%d' % (1 if IGNORE_COMPILATION and not has_output_parser else 0),
#'--profiling',
Expand All @@ -136,6 +136,7 @@ def run(self, args):
if SPIDERMONKEY_ENGINE and Building.which(SPIDERMONKEY_ENGINE[0]):
benchmarkers += [
JSBenchmarker('sm-asmjs', SPIDERMONKEY_ENGINE, ['-s', 'PRECISE_F32=2']),
JSBenchmarker('sm-simd', SPIDERMONKEY_ENGINE, ['-s', 'SIMD=1']),
JSBenchmarker('sm-wasm', SPIDERMONKEY_ENGINE, ['-s', 'BINARYEN=1', '-s', 'BINARYEN_METHOD="native-wasm"', '-s', 'BINARYEN_IMPRECISE=1'])
]
if V8_ENGINE and Building.which(V8_ENGINE[0]):
Expand Down Expand Up @@ -582,6 +583,31 @@ def output_parser(output):
return 100.0/float(re.search('Unrolled Single Precision +([\d\.]+) Mflops', output).group(1))
self.do_benchmark('linpack_float', open(path_from_root('tests', 'linpack.c')).read(), '''Unrolled Single Precision''', force_c=True, output_parser=output_parser, shared_args=['-DSP'])

def zzz_test_memcpy_128b(self):
  # Benchmarks tests/benchmark_memcpy.cpp built with -DMAX_COPY=128 and
  # -DBUILD_FOR_SHELL; the score is the reported "Total time".
  # NOTE(review): the zzz_ prefix presumably keeps the test runner from
  # collecting this method - confirm before relying on it.
  def output_parser(output):
    # Raw string: \d and \. are regex escapes, not string escapes.
    return float(re.search(r'Total time: ([\d\.]+)', output).group(1))
  # Read the source through a context manager so the file handle is closed.
  with open(path_from_root('tests', 'benchmark_memcpy.cpp')) as f:
    src = f.read()
  self.do_benchmark('memcpy_128b', src, '''Total time:''', output_parser=output_parser, shared_args=['-DMAX_COPY=128', '-DBUILD_FOR_SHELL'])

def zzz_test_memcpy_4k(self):
  # Benchmarks tests/benchmark_memcpy.cpp built with -DMIN_COPY=128,
  # -DMAX_COPY=4096 and -DBUILD_FOR_SHELL; the score is the reported "Total time".
  # NOTE(review): the zzz_ prefix presumably keeps the test runner from
  # collecting this method - confirm before relying on it.
  def output_parser(output):
    # Raw string: \d and \. are regex escapes, not string escapes.
    return float(re.search(r'Total time: ([\d\.]+)', output).group(1))
  # Read the source through a context manager so the file handle is closed.
  with open(path_from_root('tests', 'benchmark_memcpy.cpp')) as f:
    src = f.read()
  self.do_benchmark('memcpy_4k', src, '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=128', '-DMAX_COPY=4096', '-DBUILD_FOR_SHELL'])

def test_memcpy_16k(self):
  # Benchmarks tests/benchmark_memcpy.cpp built with -DMIN_COPY=4096,
  # -DMAX_COPY=16384 and -DBUILD_FOR_SHELL; the score is the reported "Total time".
  def output_parser(output):
    # Raw string: \d and \. are regex escapes, not string escapes.
    return float(re.search(r'Total time: ([\d\.]+)', output).group(1))
  # Read the source through a context manager so the file handle is closed.
  with open(path_from_root('tests', 'benchmark_memcpy.cpp')) as f:
    src = f.read()
  self.do_benchmark('memcpy_16k', src, '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=4096', '-DMAX_COPY=16384', '-DBUILD_FOR_SHELL'])

def zzz_test_memcpy_1mb(self):
  # Benchmarks tests/benchmark_memcpy.cpp built with -DMIN_COPY=16384,
  # -DMAX_COPY=1048576 and -DBUILD_FOR_SHELL; the score is the reported "Total time".
  # NOTE(review): the zzz_ prefix presumably keeps the test runner from
  # collecting this method - confirm before relying on it.
  def output_parser(output):
    # Raw string: \d and \. are regex escapes, not string escapes.
    return float(re.search(r'Total time: ([\d\.]+)', output).group(1))
  # Read the source through a context manager so the file handle is closed.
  with open(path_from_root('tests', 'benchmark_memcpy.cpp')) as f:
    src = f.read()
  self.do_benchmark('memcpy_1mb', src, '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=16384', '-DMAX_COPY=1048576', '-DBUILD_FOR_SHELL'])

def zzz_test_memcpy_16mb(self):
  # Benchmarks tests/benchmark_memcpy.cpp built with -DMIN_COPY=1048576 and
  # -DBUILD_FOR_SHELL (MAX_COPY is left at the benchmark's own default); the
  # score is the reported "Total time".
  # NOTE(review): the zzz_ prefix presumably keeps the test runner from
  # collecting this method - confirm before relying on it.
  def output_parser(output):
    # Raw string: \d and \. are regex escapes, not string escapes.
    return float(re.search(r'Total time: ([\d\.]+)', output).group(1))
  # Read the source through a context manager so the file handle is closed.
  with open(path_from_root('tests', 'benchmark_memcpy.cpp')) as f:
    src = f.read()
  self.do_benchmark('memcpy_16mb', src, '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=1048576', '-DBUILD_FOR_SHELL'])

def test_zzz_java_nbody(self): # tests xmlvm compiled java, including bitcasts of doubles, i64 math, etc.
if CORE_BENCHMARKS: return
args = [path_from_root('tests', 'nbody-java', x) for x in os.listdir(path_from_root('tests', 'nbody-java')) if x.endswith('.c')] + \
Expand Down
3 changes: 3 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2129,6 +2129,9 @@ def test_memcpy2(self):
# Runs the 'test_memcpy3' case under tests/core through do_run_in_out_file_test.
# NOTE(review): presumably compares the program's output against a stored
# expected-output file - confirm in do_run_in_out_file_test.
def test_memcpy3(self):
self.do_run_in_out_file_test('tests', 'core', 'test_memcpy3')

def test_memcpy_alignment(self):
  # Builds and runs tests/test_memcpy_alignment.cpp and expects 'OK.' in its output.
  # Read the source through a context manager so the file handle is closed.
  with open(path_from_root('tests', 'test_memcpy_alignment.cpp'), 'r') as f:
    src = f.read()
  self.do_run(src, 'OK.')

# Runs the 'test_memset' case under tests/core through do_run_in_out_file_test.
# NOTE(review): presumably compares the program's output against a stored
# expected-output file - confirm in do_run_in_out_file_test.
def test_memset(self):
self.do_run_in_out_file_test('tests', 'core', 'test_memset')

Expand Down
Loading