emscripten-core · juj · Feb 1, 2017 · Feb 25, 2016 · Feb 1, 2017 · kripken
diff --git a/emscripten.py b/emscripten.py
@@ -599,7 +599,7 @@ def make_emulated_param(i):
     if metadata['simdUint32x4']:
       simdinttypes += ['Uint32x4']
       simdintfloatfuncs += ['fromUint32x4Bits']
-    if metadata['simdInt32x4']:
+    if metadata['simdInt32x4'] or settings['SIMD']: # Always import Int32x4 when building with -s SIMD=1, since memcpy is SIMD optimized.
       simdinttypes += ['Int32x4']
       simdintfloatfuncs += ['fromInt32x4Bits']
     if metadata['simdFloat32x4']:
@@ -626,7 +626,7 @@ def make_emulated_param(i):
     fundamentals = ['Math']
     fundamentals += ['Int8Array', 'Int16Array', 'Int32Array', 'Uint8Array', 'Uint16Array', 'Uint32Array', 'Float32Array', 'Float64Array']
     fundamentals += ['NaN', 'Infinity']
-    if metadata['simd']:
+    if metadata['simd'] or settings['SIMD']: # Always import SIMD when building with -s SIMD=1, since in that mode memcpy is SIMD optimized.
         fundamentals += ['SIMD']
     if settings['ALLOW_MEMORY_GROWTH']: fundamentals.append('byteLength')
     math_envs = []
@@ -860,7 +860,7 @@ def math_fix(g):
       return g if not g.startswith('Math_') else g.split('_')[1]
     asm_global_funcs = ''.join(['  var ' + g.replace('.', '_') + '=global' + access_quote(g) + ';\n' for g in maths])
     asm_global_funcs += ''.join(['  var ' + g + '=env' + access_quote(math_fix(g)) + ';\n' for g in basic_funcs + global_funcs])
-    if metadata['simd']:
+    if metadata['simd'] or settings['SIMD']: # Always import SIMD when building with -s SIMD=1, since in that mode memcpy is SIMD optimized.
       def string_contains_any(s, str_list):
         for sub in str_list:
           if sub in s:

diff --git a/src/library.js b/src/library.js
@@ -804,28 +804,82 @@ LibraryManager.library = {
   memcpy: function(dest, src, num) {
     dest = dest|0; src = src|0; num = num|0;
     var ret = 0;
-    if ((num|0) >= 4096) return _emscripten_memcpy_big(dest|0, src|0, num|0)|0;
+    var aligned_dest_end = 0; // upper bound for the 4-bytes-per-iteration loops (set per branch below)
+    var block_aligned_dest_end = 0; // upper bound for the 64-bytes-per-iteration loop
+    var dest_end = 0; // one past the last destination byte
+    // Test against a benchmarked cutoff limit for when HEAPU8.set() becomes faster to use.
+    if ((num|0) >=
+#if SIMD
+      196608
+#else
+      8192
+#endif
+    ) {
+      return _emscripten_memcpy_big(dest|0, src|0, num|0)|0;
+    }
+
     ret = dest|0;
+    dest_end = (dest + num)|0; // loops below compare dest against this instead of counting num down
     if ((dest&3) == (src&3)) {
+      // The initial unaligned < 4-byte front.
       while (dest & 3) {
         if ((num|0) == 0) return ret|0;
         {{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i8'), 'i8') }}};
         dest = (dest+1)|0;
         src = (src+1)|0;
         num = (num-1)|0;
       }
-      while ((num|0) >= 4) {
+      aligned_dest_end = (dest_end & -4)|0; // round the end down to a multiple of 4
+      block_aligned_dest_end = (aligned_dest_end - 64)|0; // last dest at which a whole 64-byte block still fits
+      while ((dest|0) <= (block_aligned_dest_end|0) ) {
+#if SIMD
+        SIMD_Int32x4_store(HEAPU8, dest, SIMD_Int32x4_load(HEAPU8, src));
+        SIMD_Int32x4_store(HEAPU8, dest+16, SIMD_Int32x4_load(HEAPU8, src+16));
+        SIMD_Int32x4_store(HEAPU8, dest+32, SIMD_Int32x4_load(HEAPU8, src+32));
+        SIMD_Int32x4_store(HEAPU8, dest+48, SIMD_Int32x4_load(HEAPU8, src+48));
+#else
         {{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 4, makeGetValueAsm('src', 4, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 8, makeGetValueAsm('src', 8, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 12, makeGetValueAsm('src', 12, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 16, makeGetValueAsm('src', 16, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 20, makeGetValueAsm('src', 20, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 24, makeGetValueAsm('src', 24, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 28, makeGetValueAsm('src', 28, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 32, makeGetValueAsm('src', 32, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 36, makeGetValueAsm('src', 36, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 40, makeGetValueAsm('src', 40, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 44, makeGetValueAsm('src', 44, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 48, makeGetValueAsm('src', 48, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 52, makeGetValueAsm('src', 52, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 56, makeGetValueAsm('src', 56, 'i32'), 'i32') }}};
+        {{{ makeSetValueAsm('dest', 60, makeGetValueAsm('src', 60, 'i32'), 'i32') }}};
+#endif
+        dest = (dest+64)|0;
+        src = (src+64)|0;
+      }
+      while ((dest|0) < (aligned_dest_end|0) ) { // whole 4-byte words left over after the 64-byte blocks
+        {{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i32'), 'i32') }}};
+        dest = (dest+4)|0;
+        src = (src+4)|0;
+      }
+    } else {
+      // In the unaligned copy case, unroll a bit as well.
+      aligned_dest_end = (dest_end - 4)|0; // not an alignment here: the loop runs while more than 4 bytes remain
+      while ((dest|0) < (aligned_dest_end|0) ) {
+        {{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i8'), 'i8') }}};
+        {{{ makeSetValueAsm('dest', 1, makeGetValueAsm('src', 1, 'i8'), 'i8') }}};
+        {{{ makeSetValueAsm('dest', 2, makeGetValueAsm('src', 2, 'i8'), 'i8') }}};
+        {{{ makeSetValueAsm('dest', 3, makeGetValueAsm('src', 3, 'i8'), 'i8') }}};
         dest = (dest+4)|0;
         src = (src+4)|0;
-        num = (num-4)|0;
       }
     }
-    while ((num|0) > 0) {
+    // The remaining unaligned < 4 byte tail.
+    while ((dest|0) < (dest_end|0)) {
       {{{ makeSetValueAsm('dest', 0, makeGetValueAsm('src', 0, 'i8'), 'i8') }}};
       dest = (dest+1)|0;
       src = (src+1)|0;
-      num = (num-1)|0;
     }
     return ret|0;
   },

diff --git a/tests/benchmark_memcpy.cpp b/tests/benchmark_memcpy.cpp
@@ -0,0 +1,218 @@
+#include <memory.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+#ifdef WIN32
+#include <Windows.h>
+#define tick_t unsigned long long
+#define aligned_alloc(align, size) _aligned_malloc((size), (align))
+#endif
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten/emscripten.h>
+#endif
+
+char dst[1024*1024*64+16] = {};
+char src[1024*1024*64+16] = {};
+
+// Per-platform timer: tick() reads a counter, ticks_per_sec() gives its frequency, tick_t is the counter type.
+#ifdef __EMSCRIPTEN__
+#define tick emscripten_get_now
+#define tick_t double
+tick_t ticks_per_sec() { return 1000.0; }
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#define tick_t unsigned long long
+#define tick mach_absolute_time
+tick_t ticks_per_sec()
+{
+	mach_timebase_info_data_t timeBaseInfo;
+	mach_timebase_info(&timeBaseInfo);
+	return 1000000000ULL * (uint64_t)timeBaseInfo.denom / (uint64_t)timeBaseInfo.numer;
+}
+#elif defined(_POSIX_MONOTONIC_CLOCK) // NOTE(review): this macro comes from <unistd.h>, which is not included directly - confirm it is visible here.
+#include <time.h>
+#define tick_t unsigned long long
+inline tick_t tick()
+{
+	timespec t;
+	clock_gettime(CLOCK_MONOTONIC, &t);
+	return (tick_t)t.tv_sec * 1000 * 1000 * 1000 + (tick_t)t.tv_nsec;
+}
+tick_t ticks_per_sec() { return 1000 * 1000 * 1000; }
+#elif defined(_POSIX_C_SOURCE)
+#include <sys/time.h>
+#define tick_t unsigned long long
+inline tick_t tick()
+{
+	timeval t;
+	gettimeofday(&t, NULL);
+	return (tick_t)t.tv_sec * 1000 * 1000 + (tick_t)t.tv_usec;
+}
+tick_t ticks_per_sec() { return 1000 * 1000; }
+#elif defined(WIN32)
+inline tick_t tick()
+{
+	LARGE_INTEGER ddwTimer;
+	QueryPerformanceCounter(&ddwTimer);
+	return ddwTimer.QuadPart;
+}
+tick_t ticks_per_sec()
+{
+	LARGE_INTEGER ddwTimerFrequency;
+	QueryPerformanceFrequency(&ddwTimerFrequency);
+	return ddwTimerFrequency.QuadPart;
+}
+#else
+#error No tick_t
+#endif
+
+uint8_t resultCheckSum = 0;
+
+void __attribute__((noinline)) test_memcpy(int numTimes, int copySize)
+{ // Performs exactly numTimes copies of copySize bytes from src to dst, folding one copied byte per copy into resultCheckSum.
+	for(int i = 0; i + 8 <= numTimes; i += 8) // Unrolled by 8; runs only while a full group of 8 still fits.
+	{
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+	}
+	numTimes &= 7; // Copies left after the groups of 8. The old `numTimes - 8` bound with `& 15` did 8 copies for 16 and 17 for 9.
+	for(int i = 0; i < numTimes; ++i)
+	{
+		memcpy(dst, src, copySize); resultCheckSum += dst[copySize >> 1];
+	}
+}
+
+std::vector<int> copySizes;
+std::vector<double> results;
+
+std::vector<int> testCases;
+
+double totalTimeSecs = 0.0;
+
+void test_case(int copySize) // Times NUM_TRIALS runs of test_memcpy for copySize and records the best throughput (MB/s) in results.
+{
+	const int minimumCopyBytes = 1024*1024*64;
+
+	int numTimes = (minimumCopyBytes + copySize-1) / copySize; // Enough copies to move at least minimumCopyBytes per trial.
+	if (numTimes < 8) numTimes = 8;
+
+	tick_t bestResult = 0; // Seeded from the first trial below; a fixed 1e9 ceiling is only one second on nanosecond clocks.
+
+#ifndef NUM_TRIALS
+#define NUM_TRIALS 5
+#endif
+
+	for(int i = 0; i < NUM_TRIALS; ++i)
+	{
+		double t0 = tick();
+		test_memcpy(numTimes, copySize);
+		double t1 = tick();
+		if (i == 0 || t1 - t0 < bestResult) bestResult = t1 - t0;
+		totalTimeSecs += (double)(t1 - t0) / ticks_per_sec();
+	}
+	unsigned long long totalBytesTransferred = (unsigned long long)numTimes * copySize; // Widen before multiplying so the product cannot overflow int.
+
+	copySizes.push_back(copySize);
+
+	tick_t ticksElapsed = bestResult;
+	if (ticksElapsed > 0)
+	{
+		double seconds = (double)ticksElapsed / ticks_per_sec();
+		double bytesPerSecond = totalBytesTransferred / seconds;
+		double mbytesPerSecond = bytesPerSecond / (1024.0*1024.0);
+		results.push_back(mbytesPerSecond);
+	}
+	else
+	{
+		results.push_back(0.0); // Timer resolution too coarse to measure; record 0 rather than divide by zero.
+	}
+}
+
+void print_results() // Prints the copy sizes, then the matching MB/s figures, as comma-separated lists, followed by the checksum and total time.
+{
+	std::cout << "Test cases: " << std::endl;
+	for(size_t i = 0; i < copySizes.size(); ++i)
+	{
+		std::cout << copySizes[i];
+		if (i != copySizes.size()-1) std::cout << ",";
+		else std::cout << std::endl;
+		if (i % 10 == 9) std::cout << std::endl; // Line break after every 10th entry.
+	}
+	std::cout << std::endl;
+	std::cout << std::endl;
+	std::cout << std::endl;
+	std::cout << "Test results: " << std::endl;
+	for(size_t i = 0; i < results.size(); ++i)
+	{
+		std::cout << results[i];
+		if (i != results.size()-1) std::cout << ",";
+		else std::cout << std::endl;
+		if (i % 10 == 9) std::cout << std::endl; // Same 10-per-line layout as the sizes above.
+	}
+
+	std::cout << "Result checksum: " << (int)resultCheckSum << std::endl;
+	std::cout << "Total time: " << totalTimeSecs << std::endl; // tests/test_benchmark.py parses this "Total time:" line.
+}
+
+int numDone = 0;
+
+void run_one() // Removes the first pending size from testCases and benchmarks it; both callers check testCases is non-empty first.
+{
+	std::cout << (numDone+1) << "/" << (numDone+testCases.size()) << std::endl; // Progress as "current/total"; done + pending stays constant.
+	++numDone;
+
+	int copySize = testCases.front();
+	testCases.erase(testCases.begin());
+	test_case(copySize);
+}
+
+#ifdef __EMSCRIPTEN__
+void main_loop() // Callback passed to emscripten_set_main_loop: runs one test case per call.
+{
+	if (!testCases.empty())
+	{
+		run_one();
+	}
+	else
+	{
+		emscripten_cancel_main_loop(); // Nothing left to run: stop the loop before printing the results.
+		print_results();
+	}
+}
+#endif
+
+#ifndef MAX_COPY
+#define MAX_COPY 32*1024*1024
+#endif
+
+#ifndef MIN_COPY
+#define MIN_COPY 1
+#endif
+
+int main() // Builds the list of copy sizes, then runs them via the Emscripten main loop or a plain blocking loop.
+{
+	for(int copySizeI = MIN_COPY; copySizeI < MAX_COPY; copySizeI <<= 1)
+		for(int copySizeJ = 1; copySizeJ <= copySizeI; copySizeJ <<= 1)
+		{
+			testCases.push_back(copySizeI | copySizeJ); // copySizeI with each power of two up to itself OR-ed in.
+		}
+
+	std::sort(testCases.begin(), testCases.end());
+#if defined(__EMSCRIPTEN__) && !defined(BUILD_FOR_SHELL)
+	emscripten_set_main_loop(main_loop, 0, 0);
+#else
+	while(!testCases.empty()) run_one(); // Shell/native build: run every case synchronously.
+	print_results();
+#endif
+}
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
@@ -117,7 +117,7 @@ def process(filename):
     output = Popen([PYTHON, EMCC, filename, #'-O3',
                     '-O3', '-s', 'DOUBLE_MODE=0', '-s', 'PRECISE_I64_MATH=0',
                     '--memory-init-file', '0', '--js-transform', 'python hardcode.py',
-                    '-s', 'TOTAL_MEMORY=128*1024*1024',
+                    '-s', 'TOTAL_MEMORY=256*1024*1024',
                     '-s', 'NO_EXIT_RUNTIME=1',
                     '-s', 'BENCHMARK=%d' % (1 if IGNORE_COMPILATION and not has_output_parser else 0),
                     #'--profiling',
@@ -136,6 +136,7 @@ def run(self, args):
   if SPIDERMONKEY_ENGINE and Building.which(SPIDERMONKEY_ENGINE[0]):
     benchmarkers += [
       JSBenchmarker('sm-asmjs', SPIDERMONKEY_ENGINE, ['-s', 'PRECISE_F32=2']),
+      JSBenchmarker('sm-simd',  SPIDERMONKEY_ENGINE, ['-s', 'SIMD=1']),
       JSBenchmarker('sm-wasm',  SPIDERMONKEY_ENGINE, ['-s', 'BINARYEN=1', '-s', 'BINARYEN_METHOD="native-wasm"', '-s', 'BINARYEN_IMPRECISE=1'])
     ]
   if V8_ENGINE and Building.which(V8_ENGINE[0]):
@@ -582,6 +583,31 @@ def output_parser(output):
       return 100.0/float(re.search('Unrolled Single  Precision +([\d\.]+) Mflops', output).group(1))
     self.do_benchmark('linpack_float', open(path_from_root('tests', 'linpack.c')).read(), '''Unrolled Single  Precision''', force_c=True, output_parser=output_parser, shared_args=['-DSP'])
 
+  def zzz_test_memcpy_128b(self): # memcpy benchmark over copy sizes in [1, 128); NOTE(review): the zzz_ prefix presumably keeps it out of test discovery - confirm that is intended
+    def output_parser(output):
+      return float(re.search(r'Total time: ([\d\.]+)', output).group(1)) # raw string: \d and \. are regex escapes, not string escapes
+    self.do_benchmark('memcpy_128b', open(path_from_root('tests', 'benchmark_memcpy.cpp')).read(), '''Total time:''', output_parser=output_parser, shared_args=['-DMAX_COPY=128', '-DBUILD_FOR_SHELL'])
+
+  def zzz_test_memcpy_4k(self): # memcpy benchmark over copy sizes in [128, 4096); NOTE(review): the zzz_ prefix presumably keeps it out of test discovery - confirm that is intended
+    def output_parser(output):
+      return float(re.search(r'Total time: ([\d\.]+)', output).group(1)) # raw string: \d and \. are regex escapes, not string escapes
+    self.do_benchmark('memcpy_4k', open(path_from_root('tests', 'benchmark_memcpy.cpp')).read(), '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=128', '-DMAX_COPY=4096', '-DBUILD_FOR_SHELL'])
+
+  def test_memcpy_16k(self): # memcpy benchmark over copy sizes in [4096, 16384)
+    def output_parser(output):
+      return float(re.search(r'Total time: ([\d\.]+)', output).group(1)) # raw string: \d and \. are regex escapes, not string escapes
+    self.do_benchmark('memcpy_16k', open(path_from_root('tests', 'benchmark_memcpy.cpp')).read(), '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=4096', '-DMAX_COPY=16384', '-DBUILD_FOR_SHELL'])
+
+  def zzz_test_memcpy_1mb(self): # memcpy benchmark over copy sizes in [16384, 1048576); NOTE(review): the zzz_ prefix presumably keeps it out of test discovery - confirm that is intended
+    def output_parser(output):
+      return float(re.search(r'Total time: ([\d\.]+)', output).group(1)) # raw string: \d and \. are regex escapes, not string escapes
+    self.do_benchmark('memcpy_1mb', open(path_from_root('tests', 'benchmark_memcpy.cpp')).read(), '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=16384', '-DMAX_COPY=1048576', '-DBUILD_FOR_SHELL'])
+
+  def zzz_test_memcpy_16mb(self): # memcpy benchmark from 1048576 bytes up to the benchmark's default MAX_COPY; NOTE(review): the zzz_ prefix presumably keeps it out of test discovery - confirm that is intended
+    def output_parser(output):
+      return float(re.search(r'Total time: ([\d\.]+)', output).group(1)) # raw string: \d and \. are regex escapes, not string escapes
+    self.do_benchmark('memcpy_16mb', open(path_from_root('tests', 'benchmark_memcpy.cpp')).read(), '''Total time:''', output_parser=output_parser, shared_args=['-DMIN_COPY=1048576', '-DBUILD_FOR_SHELL'])
+
   def test_zzz_java_nbody(self): # tests xmlvm compiled java, including bitcasts of doubles, i64 math, etc.
     if CORE_BENCHMARKS: return
     args = [path_from_root('tests', 'nbody-java', x) for x in os.listdir(path_from_root('tests', 'nbody-java')) if x.endswith('.c')] + \

diff --git a/tests/test_core.py b/tests/test_core.py
@@ -2129,6 +2129,9 @@ def test_memcpy2(self):
   def test_memcpy3(self):
     self.do_run_in_out_file_test('tests', 'core', 'test_memcpy3')
 
+  def test_memcpy_alignment(self): # passes the source of tests/test_memcpy_alignment.cpp to do_run with 'OK.' as the expected-output argument
+    self.do_run(open(path_from_root('tests', 'test_memcpy_alignment.cpp'), 'r').read(), 'OK.')
+
   def test_memset(self):
     self.do_run_in_out_file_test('tests', 'core', 'test_memset')