updated hash

tpruvot · mintheunyil · Jun 29, 2018 · Sep 8, 2018 · Sep 8, 2018 · Sep 8, 2018
commit b3e7a4239fc05982bcc4fbd2c4fc861f9751b674
diff --git a/verus/verus_clhash_portable.cpp b/verus/verus_clhash_portable.cpp
@@ -88,11 +88,9 @@ uint64_t precompReduction64_port(__m128i A) {
 
 // verus intermediate hash extra
 __inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask, 
-	uint16_t * __restrict fixrand, uint16_t * __restrict fixrandex, uchar version)
+	uint16_t * __restrict fixrand, uint16_t * __restrict fixrandex)
 {
 	const __m128i *pbuf;
-
-	const __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] };
 
 	// divide key mask by 16 from bytes to __m128i
 	keyMask >>= 4;
@@ -115,9 +113,7 @@ __inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *rand
 
 
 		// select random start and order of pbuf processing
-		if (version)
-		pbuf = pbuf_copy + (selector & 3);
-		else
+
 		pbuf = buf + (selector & 3);
 		uint32_t prand_idx = (selector >> 5) & keyMask;
 		uint32_t prandex_idx = (selector >> 32) & keyMask;
@@ -284,13 +280,8 @@ __inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *rand
 
 			do
 			{
-				uint64_t temp_v;
-				if(version == 0x30)
-				temp_v = selector & ((uint64_t)0x10000000) << rounds;
-				else
-					temp_v = selector & (0x10000000) << rounds;
-
-				if (temp_v)
+
+				if (selector & (0x10000000) << rounds)
 				{
 					onekey = _mm_load_si128(rc++);
 					const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
@@ -321,8 +312,7 @@ __inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *rand
 		}
 		case 0x18:
 		{
-			if (version == 0)
-			{
+
 				const __m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
 				const __m128i temp2 = _mm_load_si128(prand);
 				const __m128i add1 = _mm_xor_si128(temp1, temp2);
@@ -337,9 +327,271 @@ __inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *rand
 				_mm_store_si128(prand, tempb3);
 
 				break;
+
+}
+		case 0x1c:
+		{
+			const __m128i temp1 = _mm_load_si128(pbuf);
+			const __m128i temp2 = _mm_load_si128(prandex);
+			const __m128i add1 = _mm_xor_si128(temp1, temp2);
+			const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+			acc = _mm_xor_si128(clprod1, acc);
+
+			const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp2);
+			const __m128i tempa2 = _mm_xor_si128(tempa1, temp2);
+
+			const __m128i tempa3 = _mm_load_si128(prand);
+			_mm_store_si128(prand, tempa2);
+
+			acc = _mm_xor_si128(tempa3, acc);
+
+			const __m128i tempb1 = _mm_mulhrs_epi16(acc, tempa3);
+			const __m128i tempb2 = _mm_xor_si128(tempb1, tempa3);
+			_mm_store_si128(prandex, tempb2);
+			break;
+		}
+		}
+		fixrand[i] = prand_idx;
+		fixrandex[i] = prandex_idx;
+
+	}
+//	printf("acc = %08x\n", _mm_cvtsi128_si64(acc));
+
+//	exit(0);
+	return acc;
+}
+
+
+__inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port2(__m128i *randomsource, const __m128i buf[4], uint64_t keyMask,
+	uint16_t * __restrict fixrand, uint16_t * __restrict fixrandex)
+{
+	const __m128i *pbuf;
+
+	const __m128i pbuf_copy[4] = { _mm_xor_si128(buf[0], buf[2]), _mm_xor_si128(buf[1], buf[3]), buf[2], buf[3] };
+
+	// divide key mask by 16 from bytes to __m128i
+	keyMask >>= 4;
+
+	// the random buffer must have at least 32 16-byte (__m128i) elements after the keymask to work with this
+	// algorithm. we take the value from the last element inside the keyMask + 2, as that will never
+	// be used to xor into the accumulator before it is hashed with other values first
+	__m128i acc = _mm_load_si128(randomsource + (keyMask + 2));
+
+	for (int64_t i = 0; i < 32; i++)
+	{
+		//std::cout << "LOOP " << i << " acc: " << LEToHex(acc) << std::endl;
+
+		const uint64_t selector = _mm_cvtsi128_si64(acc);
+
+		// get two random locations in the key, which will be mutated and swapped
+		__m128i *prand = randomsource + ((selector >> 5) & keyMask);
+		__m128i *prandex = randomsource + ((selector >> 32) & keyMask);
+
+
+
+		// select random start and order of pbuf processing
+
+			pbuf = pbuf_copy + (selector & 3);
+
+		uint32_t prand_idx = (selector >> 5) & keyMask;
+		uint32_t prandex_idx = (selector >> 32) & keyMask;
+
+		//	printf("[i]=%d \t acc = %08x, prand_idx = %d\t, prandex_idx = %d\t selector = %d prand %08x, prandex %08x\n", i, _mm_cvtsi128_si64(acc), prand_idx, prandex_idx, (selector & 0x1c)>>2, _mm_cvtsi128_si64(prand[0]), _mm_cvtsi128_si64(prandex[0]));
+		//printf("pbuf %08x%08x%08x%08x\n", _mm_cvtsi128_si64(buf[0]), _mm_cvtsi128_si64(buf[1]), _mm_cvtsi128_si64(buf[2]), _mm_cvtsi128_si64(buf[3]));
+		switch (selector & 0x1c)
+		{
+		case 0:
+		{
+			const __m128i temp1 = _mm_load_si128(prandex);
+			const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+			const __m128i add1 = _mm_xor_si128(temp1, temp2);
+			const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+			acc = _mm_xor_si128(clprod1, acc);
+
+			const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+			const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+			const __m128i temp12 = _mm_load_si128(prand);
+			_mm_store_si128(prand, tempa2);
+
+			const __m128i temp22 = _mm_load_si128(pbuf);
+			const __m128i add12 = _mm_xor_si128(temp12, temp22);
+			const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+			acc = _mm_xor_si128(clprod12, acc);
+
+			const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+			const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+			_mm_store_si128(prandex, tempb2);
+			break;
+		}
+		case 4:
+		{
+			const __m128i temp1 = _mm_load_si128(prand);
+			const __m128i temp2 = _mm_load_si128(pbuf);
+			const __m128i add1 = _mm_xor_si128(temp1, temp2);
+			const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+			acc = _mm_xor_si128(clprod1, acc);
+			const __m128i clprod2 = _mm_clmulepi64_si128(temp2, temp2, 0x10);
+			acc = _mm_xor_si128(clprod2, acc);
+
+			const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+			const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+			const __m128i temp12 = _mm_load_si128(prandex);
+			_mm_store_si128(prandex, tempa2);
+
+			const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+			const __m128i add12 = _mm_xor_si128(temp12, temp22);
+			acc = _mm_xor_si128(add12, acc);
+
+			const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+			const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+			_mm_store_si128(prand, tempb2);
+			break;
+		}
+		case 8:
+		{
+			const __m128i temp1 = _mm_load_si128(prandex);
+			const __m128i temp2 = _mm_load_si128(pbuf);
+			const __m128i add1 = _mm_xor_si128(temp1, temp2);
+			acc = _mm_xor_si128(add1, acc);
+
+			const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+			const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+			const __m128i temp12 = _mm_load_si128(prand);
+			_mm_store_si128(prand, tempa2);
+
+			const __m128i temp22 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+			const __m128i add12 = _mm_xor_si128(temp12, temp22);
+			const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+			acc = _mm_xor_si128(clprod12, acc);
+			const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
+			acc = _mm_xor_si128(clprod22, acc);
+
+			const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+			const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+			_mm_store_si128(prandex, tempb2);
+			break;
+		}
+		case 0xc:
+		{
+			const __m128i temp1 = _mm_load_si128(prand);
+			const __m128i temp2 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+			const __m128i add1 = _mm_xor_si128(temp1, temp2);
+
+			// cannot be zero here
+			const int32_t divisor = (uint32_t)selector;
+
+			acc = _mm_xor_si128(add1, acc);
+
+			const int64_t dividend = _mm_cvtsi128_si64(acc);
+			const __m128i modulo = _mm_cvtsi32_si128(dividend % divisor);
+			acc = _mm_xor_si128(modulo, acc);
+
+			const __m128i tempa1 = _mm_mulhrs_epi16(acc, temp1);
+			const __m128i tempa2 = _mm_xor_si128(tempa1, temp1);
+
+			if (dividend & 1)
+			{
+				const __m128i temp12 = _mm_load_si128(prandex);
+				_mm_store_si128(prandex, tempa2);
+
+				const __m128i temp22 = _mm_load_si128(pbuf);
+				const __m128i add12 = _mm_xor_si128(temp12, temp22);
+				const __m128i clprod12 = _mm_clmulepi64_si128(add12, add12, 0x10);
+				acc = _mm_xor_si128(clprod12, acc);
+				const __m128i clprod22 = _mm_clmulepi64_si128(temp22, temp22, 0x10);
+				acc = _mm_xor_si128(clprod22, acc);
+
+				const __m128i tempb1 = _mm_mulhrs_epi16(acc, temp12);
+				const __m128i tempb2 = _mm_xor_si128(tempb1, temp12);
+				_mm_store_si128(prand, tempb2);
 			}
 			else
+			{
+				const __m128i tempb3 = _mm_load_si128(prandex);
+				_mm_store_si128(prandex, tempa2);
+				_mm_store_si128(prand, tempb3);
+			}
+			break;
+		}
+		case 0x10:
 		{
+			// a few AES operations
+			const __m128i *rc = prand;
+			__m128i tmp;
+
+			__m128i temp1 = _mm_load_si128(pbuf - (((selector & 1) << 1) - 1));
+			__m128i temp2 = _mm_load_si128(pbuf);
+
+			AES2(temp1, temp2, 0);
+			MIX2_EMU(temp1, temp2);
+
+			AES2(temp1, temp2, 4);
+			MIX2_EMU(temp1, temp2);
+
+			AES2(temp1, temp2, 8);
+			MIX2_EMU(temp1, temp2);
+
+			acc = _mm_xor_si128(temp2, _mm_xor_si128(temp1, acc));
+
+			const __m128i tempa1 = _mm_load_si128(prand);
+			const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
+			const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
+
+			const __m128i tempa4 = _mm_load_si128(prandex);
+			_mm_store_si128(prandex, tempa3);
+			_mm_store_si128(prand, tempa4);
+			break;
+		}
+		case 0x14:
+		{
+			// we'll just call this one the monkins loop, inspired by Chris - modified to cast to uint64_t on shift for more variability in the loop
+			const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
+			__m128i tmp; // used by MIX2
+
+			uint64_t rounds = selector >> 61; // loop randomly between 1 and 8 times
+			__m128i *rc = prand;
+			uint64_t aesroundoffset = 0;
+			__m128i onekey;
+
+			do
+			{
+
+
+				if (selector & ((uint64_t)0x10000000) << rounds)
+				{
+					onekey = _mm_load_si128(rc++);
+					const __m128i temp2 = _mm_load_si128(rounds & 1 ? pbuf : buftmp);
+					const __m128i add1 = _mm_xor_si128(onekey, temp2);
+					const __m128i clprod1 = _mm_clmulepi64_si128(add1, add1, 0x10);
+					acc = _mm_xor_si128(clprod1, acc);
+				}
+				else
+				{
+					onekey = _mm_load_si128(rc++);
+					__m128i temp2 = _mm_load_si128(rounds & 1 ? buftmp : pbuf);
+					AES2(onekey, temp2, aesroundoffset);
+					aesroundoffset += 4;
+					MIX2_EMU(onekey, temp2);
+					acc = _mm_xor_si128(onekey, acc);
+					acc = _mm_xor_si128(temp2, acc);
+				}
+			} while (rounds--);
+
+			const __m128i tempa1 = _mm_load_si128(prand);
+			const __m128i tempa2 = _mm_mulhrs_epi16(acc, tempa1);
+			const __m128i tempa3 = _mm_xor_si128(tempa1, tempa2);
+
+			const __m128i tempa4 = _mm_load_si128(prandex);
+			_mm_store_si128(prandex, tempa3);
+			_mm_store_si128(prand, tempa4);
+			break;
+		}
+		case 0x18:
+		{
+
 			const __m128i *buftmp = pbuf - (((selector & 1) << 1) - 1);
 			__m128i tmp; // used by MIX2
 
@@ -377,8 +629,8 @@ __inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *rand
 			_mm_store_si128(prandex, tempa4);
 			_mm_store_si128(prand, onekey);
 			break;
+
 		}
-}
 		case 0x1c:
 		{
 			const __m128i temp1 = _mm_load_si128(pbuf);
@@ -405,20 +657,22 @@ __inline  __m128i __verusclmulwithoutreduction64alignedrepeat_port(__m128i *rand
 		fixrandex[i] = prandex_idx;
 
 	}
-//	printf("acc = %08x\n", _mm_cvtsi128_si64(acc));
+	//	printf("acc = %08x\n", _mm_cvtsi128_si64(acc));
 
-//	exit(0);
+	//	exit(0);
 	return acc;
 }
-
 // hashes 64 bytes only by doing a carryless multiplication and reduction of the repeated 64 byte sequence 16 times, 
 // returning a 64 bit hash value
 uint64_t verusclhash_port(void * random, const unsigned char buf[64], uint64_t keyMask, uint16_t *  __restrict fixrand, uint16_t * __restrict fixrandex, uchar version) {
 	const unsigned int  m = 128;// we process the data in chunks of 16 cache lines
 	__m128i * rs64 = (__m128i *)random;
 	const __m128i * string = (const __m128i *) buf;
-
-	__m128i  acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask, fixrand, fixrandex, version);
+	__m128i  acc; // assigned by exactly one of the two variants selected below
+	if (version) // any non-zero version takes the _port2 variant
+		  acc = __verusclmulwithoutreduction64alignedrepeat_port2(rs64, string, keyMask, fixrand, fixrandex); // _port2 walks its xor-folded pbuf_copy instead of buf
+	else
+		acc = __verusclmulwithoutreduction64alignedrepeat_port(rs64, string, keyMask, fixrand, fixrandex); // version == 0: variant that indexes buf directly
 	acc = _mm_xor_si128(acc, lazyLengthHash_port(1024, 64));
 	return precompReduction64_port(acc);
 }