From ddc7623a50919d0314c074344ce59d2b904d018f Mon Sep 17 00:00:00 2001 From: Samuel Neves Date: Sat, 23 Nov 2013 16:02:13 +0000 Subject: [PATCH] Initial implementation --- .gitattributes | 22 ++++ .gitignore | 215 ++++++++++++++++++++++++++++++++++++++ api.h | 2 + chacha-avx2.c | 272 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 511 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 api.h create mode 100644 chacha-avx2.c diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..412eeda --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp +*.sln merge=union +*.csproj merge=union +*.vbproj merge=union +*.fsproj merge=union +*.dbproj merge=union + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b9d6bd9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,215 @@ +################# +## Eclipse +################# + +*.pydevproject +.project +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + + +################# +## Visual Studio +################# + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results + +[Dd]ebug/ +[Rr]elease/ +x64/ +build/ +[Bb]in/ +[Oo]bj/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*.log +*.vspscc +*.vssscc +.builds +*.pidb +*.log +*.scc + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf +*.cachefile + +# Visual Studio profiler +*.psess +*.vsp +*.vspx + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# NCrunch +*.ncrunch* +.*crunch*.local.xml + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.Publish.xml +*.pubxml + +# NuGet Packages Directory +## TODO: If you have NuGet Package Restore enabled, uncomment the next line +#packages/ + +# Windows Azure Build Output +csx +*.build.csdef + +# Windows Store app package directory +AppPackages/ + +# Others +sql/ +*.Cache +ClientBin/ +[Ss]tyle[Cc]op.* +~$* +*~ +*.dbmdl +*.[Pp]ublish.xml +*.pfx +*.publishsettings + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file to a newer +# Visual Studio version. 
+# Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm

+# SQL Server files
+App_Data/*.mdf
+App_Data/*.ldf

+#############
+## Windows detritus
+#############

+# Windows image file caches
+Thumbs.db
+ehthumbs.db

+# Folder config file
+Desktop.ini

+# Recycle Bin used on file shares
+$RECYCLE.BIN/

+# Mac crap
+.DS_Store


+#############
+## Python
+#############

+*.py[co]

+# Packages
+*.egg
+*.egg-info
+dist/
+build/
+eggs/
+parts/
+var/
+sdist/
+develop-eggs/
+.installed.cfg

+# Installer logs
+pip-log.txt

+# Unit test / coverage reports
+.coverage
+.tox

+#Translations
+*.mo

+#Mr Developer
+.mr.developer.cfg
diff --git a/api.h b/api.h
new file mode 100644
index 0000000..4a2fa55
--- /dev/null
+++ b/api.h
@@ -0,0 +1,2 @@
+#define CRYPTO_KEYBYTES 32
+#define CRYPTO_NONCEBYTES 8
\ No newline at end of file
diff --git a/chacha-avx2.c b/chacha-avx2.c
new file mode 100644
index 0000000..5dbd3a3
--- /dev/null
+++ b/chacha-avx2.c
@@ -0,0 +1,272 @@
+
+#if defined(SUPERCOP)
+    #include "crypto_stream.h"
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#include <immintrin.h>
+
+/* Default to 20 rounds */
+#if !defined(CHACHA_ROUNDS)
+    #define CHACHA_ROUNDS 20
+#endif
+
+#if 0 != (CHACHA_ROUNDS % 2)
+    #error "I only accept an even number of rounds!"
+#endif
+
+/* Some aliases to keep code below somewhat sane */
+#define LOADU(m)     _mm256_loadu_si256((const __m256i *)(m))
+#define STOREU(m, v) _mm256_storeu_si256((__m256i*)(m), (v))
+#define TO128(x)     _mm256_castsi256_si128(x)
+
+#define ADD(A, B) _mm256_add_epi32(A, B)
+#define SUB(A, B) _mm256_sub_epi32(A, B)
+#define XOR(A, B) _mm256_xor_si256(A, B)
+#define ROT(X, C)                                                                                    \
+(                                                                                                    \
+    (C) ==  8 ? _mm256_shuffle_epi8((X), _mm256_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3,      \
+                                                         14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3))     \
+  : (C) == 16 ? _mm256_shuffle_epi8((X), _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,      \
+                                                         13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2))     \
+  : /* else */  _mm256_or_si256(_mm256_slli_epi32((X), (C)), _mm256_srli_epi32((X), 32 - (C)))       \
+)
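Rotations by 8 and 16 bits are byte-aligned, so ROT handles those two counts
with a single byte shuffle (vpshufb) per lane instead of the generic
shift/shift/or sequence; the masks simply encode the byte permutation a scalar
rotation would perform. A minimal scalar model for comparison (a sketch, not
part of the patch):

    #include <stdint.h>

    /* Scalar equivalent of ROT for one 32-bit lane. For c = 8 or 16 the
       result is a pure byte permutation, e.g. rotl32(0xAABBCCDD, 8)
       == 0xBBCCDDAA, which is exactly what the shuffle masks encode. */
    static inline uint32_t rotl32(uint32_t x, unsigned c)
    {
        return (x << c) | (x >> (32 - c));
    }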
+
+#if defined(MANUAL_SCHEDULING) /* Behaves better on clang 3.3, worse on gcc, icc */
+#define DOUBLE_ROUND(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15)            \
+do                                                                                     \
+{                                                                                      \
+    /* Column round */                                                                 \
+    x0 = ADD( x0,  x4);  x1 = ADD( x1,  x5);  x2 = ADD( x2,  x6);  x3 = ADD( x3,  x7); \
+    x12 = XOR(x12, x0); x13 = XOR(x13, x1); x14 = XOR(x14, x2); x15 = XOR(x15, x3);    \
+    x12 = ROT(x12, 16); x13 = ROT(x13, 16); x14 = ROT(x14, 16); x15 = ROT(x15, 16);    \
+    x8 = ADD( x8, x12);  x9 = ADD( x9, x13); x10 = ADD(x10, x14); x11 = ADD(x11, x15); \
+    x4 = XOR( x4,  x8);  x5 = XOR( x5,  x9);  x6 = XOR( x6, x10);  x7 = XOR( x7, x11); \
+    x4 = ROT( x4, 12);  x5 = ROT( x5, 12);  x6 = ROT( x6, 12);  x7 = ROT( x7, 12);     \
+    x0 = ADD( x0,  x4);  x1 = ADD( x1,  x5);  x2 = ADD( x2,  x6);  x3 = ADD( x3,  x7); \
+    x12 = XOR(x12, x0); x13 = XOR(x13, x1); x14 = XOR(x14, x2); x15 = XOR(x15, x3);    \
+    x12 = ROT(x12,  8); x13 = ROT(x13,  8); x14 = ROT(x14,  8); x15 = ROT(x15,  8);    \
+    x8 = ADD( x8, x12);  x9 = ADD( x9, x13); x10 = ADD(x10, x14); x11 = ADD(x11, x15); \
+    x4 = XOR( x4,  x8);  x5 = XOR( x5,  x9);  x6 = XOR( x6, x10);  x7 = XOR( x7, x11); \
+    x4 = ROT( x4,  7);  x5 = ROT( x5,  7);  x6 = ROT( x6,  7);  x7 = ROT( x7,  7);     \
+    /* Diagonal round */                                                               \
+    x0 = ADD( x0,  x5);  x1 = ADD( x1,  x6);  x2 = ADD( x2,  x7);  x3 = ADD( x3,  x4); \
+    x15 = XOR(x15, x0); x12 = XOR(x12, x1); x13 = XOR(x13, x2); x14 = XOR(x14, x3);    \
+    x15 = ROT(x15, 16); x12 = ROT(x12, 16); x13 = ROT(x13, 16); x14 = ROT(x14, 16);    \
+    x10 = ADD(x10, x15); x11 = ADD(x11, x12); x8 = ADD( x8, x13);  x9 = ADD( x9, x14); \
+    x5 = XOR( x5, x10);  x6 = XOR( x6, x11);  x7 = XOR( x7,  x8);  x4 = XOR( x4,  x9); \
+    x5 = ROT( x5, 12);  x6 = ROT( x6, 12);  x7 = ROT( x7, 12);  x4 = ROT( x4, 12);     \
+    x0 = ADD( x0,  x5);  x1 = ADD( x1,  x6);  x2 = ADD( x2,  x7);  x3 = ADD( x3,  x4); \
+    x15 = XOR(x15, x0); x12 = XOR(x12, x1); x13 = XOR(x13, x2); x14 = XOR(x14, x3);    \
+    x15 = ROT(x15,  8); x12 = ROT(x12,  8); x13 = ROT(x13,  8); x14 = ROT(x14,  8);    \
+    x10 = ADD(x10, x15); x11 = ADD(x11, x12); x8 = ADD( x8, x13);  x9 = ADD( x9, x14); \
+    x5 = XOR( x5, x10);  x6 = XOR( x6, x11);  x7 = XOR( x7,  x8);  x4 = XOR( x4,  x9); \
+    x5 = ROT( x5,  7);  x6 = ROT( x6,  7);  x7 = ROT( x7,  7);  x4 = ROT( x4,  7);     \
+} while(0)
+#else
+#define QUARTER_ROUND(A, B, C, D)                 \
+do                                                \
+{                                                 \
+    A = ADD(A, B); D = XOR(D, A); D = ROT(D, 16); \
+    C = ADD(C, D); B = XOR(B, C); B = ROT(B, 12); \
+    A = ADD(A, B); D = XOR(D, A); D = ROT(D,  8); \
+    C = ADD(C, D); B = XOR(B, C); B = ROT(B,  7); \
+} while(0)
+
+#define DOUBLE_ROUND(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) \
+do                                     \
+{                                      \
+    /* Column round */                 \
+    QUARTER_ROUND(x0, x4,  x8, x12);   \
+    QUARTER_ROUND(x1, x5,  x9, x13);   \
+    QUARTER_ROUND(x2, x6, x10, x14);   \
+    QUARTER_ROUND(x3, x7, x11, x15);   \
+    /* Diagonal round */               \
+    QUARTER_ROUND(x0, x5, x10, x15);   \
+    QUARTER_ROUND(x1, x6, x11, x12);   \
+    QUARTER_ROUND(x2, x7,  x8, x13);   \
+    QUARTER_ROUND(x3, x4,  x9, x14);   \
+} while(0)
+#endif
+
+#define CHACHA_CORE(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,   \
+                    x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15)   \
+do                                                                            \
+{                                                                             \
+    int i;                                                                    \
+    z0  =  x0,  z1 =  x1,  z2 =  x2,  z3 =  x3,                               \
+    z4  =  x4,  z5 =  x5,  z6 =  x6,  z7 =  x7,                               \
+    z8  =  x8,  z9 =  x9, z10 = x10, z11 = x11,                               \
+    z12 = x12, z13 = x13, z14 = x14, z15 = x15;                               \
+    for(i = 0; i < CHACHA_ROUNDS; i += 2)                                     \
+        DOUBLE_ROUND(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15); \
+                                                                              \
+    z0  = ADD( x0,  z0);                                                      \
+    z1  = ADD( x1,  z1);                                                      \
+    z2  = ADD( x2,  z2);                                                      \
+    z3  = ADD( x3,  z3);                                                      \
+    z4  = ADD( x4,  z4);                                                      \
+    z5  = ADD( x5,  z5);                                                      \
+    z6  = ADD( x6,  z6);                                                      \
+    z7  = ADD( x7,  z7);                                                      \
+    z8  = ADD( x8,  z8);                                                      \
+    z9  = ADD( x9,  z9);                                                      \
+    z10 = ADD(x10, z10);                                                      \
+    z11 = ADD(x11, z11);                                                      \
+    z12 = ADD(x12, z12);                                                      \
+    z13 = ADD(x13, z13);                                                      \
+    z14 = ADD(x14, z14);                                                      \
+    z15 = ADD(x15, z15);                                                      \
+} while(0)
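CHACHA_CORE is the ChaCha block function vectorized eight blocks wide: copy the
input state, apply CHACHA_ROUNDS/2 double rounds, then add the input state back
in (the feedforward that makes the function one-way). Each register holds the
same state word from eight independent blocks, one per 32-bit lane, so every
QUARTER_ROUND call runs the standard scalar quarter-round on all eight blocks
at once. For reference, the scalar operation being vectorized (a sketch, not
part of the patch):

    #include <stdint.h>

    #define ROTL32(x, c) (((x) << (c)) | ((x) >> (32 - (c))))

    /* One ChaCha quarter-round on four 32-bit state words. */
    static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
        *a += *b; *d ^= *a; *d = ROTL32(*d,  8);
        *c += *d; *b ^= *c; *b = ROTL32(*b,  7);
    }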
+
+
+/* Probably not the best it can be... */
+#define TRANSPOSE(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) \
+do                                                   \
+{                                                    \
+    __m256i t0,t1,t2,t3,t4,t5,t6,t7;                 \
+    __m256i t8,t9,t10,t11,t12,t13,t14,t15;           \
+    __m256i s0,s1,s2,s3,s4,s5,s6,s7;                 \
+    __m256i s8,s9,s10,s11,s12,s13,s14,s15;           \
+                                                     \
+    t0  = _mm256_unpacklo_epi32( x0,  x1);           \
+    t1  = _mm256_unpackhi_epi32( x0,  x1);           \
+    t2  = _mm256_unpacklo_epi32( x2,  x3);           \
+    t3  = _mm256_unpackhi_epi32( x2,  x3);           \
+    t4  = _mm256_unpacklo_epi32( x4,  x5);           \
+    t5  = _mm256_unpackhi_epi32( x4,  x5);           \
+    t6  = _mm256_unpacklo_epi32( x6,  x7);           \
+    t7  = _mm256_unpackhi_epi32( x6,  x7);           \
+    t8  = _mm256_unpacklo_epi32( x8,  x9);           \
+    t9  = _mm256_unpackhi_epi32( x8,  x9);           \
+    t10 = _mm256_unpacklo_epi32(x10, x11);           \
+    t11 = _mm256_unpackhi_epi32(x10, x11);           \
+    t12 = _mm256_unpacklo_epi32(x12, x13);           \
+    t13 = _mm256_unpackhi_epi32(x12, x13);           \
+    t14 = _mm256_unpacklo_epi32(x14, x15);           \
+    t15 = _mm256_unpackhi_epi32(x14, x15);           \
+                                                     \
+    s0  = _mm256_unpacklo_epi64( t0,  t2);           \
+    s1  = _mm256_unpackhi_epi64( t0,  t2);           \
+    s2  = _mm256_unpacklo_epi64( t1,  t3);           \
+    s3  = _mm256_unpackhi_epi64( t1,  t3);           \
+    s4  = _mm256_unpacklo_epi64( t4,  t6);           \
+    s5  = _mm256_unpackhi_epi64( t4,  t6);           \
+    s6  = _mm256_unpacklo_epi64( t5,  t7);           \
+    s7  = _mm256_unpackhi_epi64( t5,  t7);           \
+    s8  = _mm256_unpacklo_epi64( t8, t10);           \
+    s9  = _mm256_unpackhi_epi64( t8, t10);           \
+    s10 = _mm256_unpacklo_epi64( t9, t11);           \
+    s11 = _mm256_unpackhi_epi64( t9, t11);           \
+    s12 = _mm256_unpacklo_epi64(t12, t14);           \
+    s13 = _mm256_unpackhi_epi64(t12, t14);           \
+    s14 = _mm256_unpacklo_epi64(t13, t15);           \
+    s15 = _mm256_unpackhi_epi64(t13, t15);           \
+                                                     \
+    x0  = _mm256_permute2x128_si256( s0,  s4, 0x20); \
+    x8  = _mm256_permute2x128_si256( s0,  s4, 0x31); \
+    x1  = _mm256_permute2x128_si256( s8, s12, 0x20); \
+    x9  = _mm256_permute2x128_si256( s8, s12, 0x31); \
+    x2  = _mm256_permute2x128_si256( s1,  s5, 0x20); \
+    x10 = _mm256_permute2x128_si256( s1,  s5, 0x31); \
+    x3  = _mm256_permute2x128_si256( s9, s13, 0x20); \
+    x11 = _mm256_permute2x128_si256( s9, s13, 0x31); \
+    x4  = _mm256_permute2x128_si256( s2,  s6, 0x20); \
+    x12 = _mm256_permute2x128_si256( s2,  s6, 0x31); \
+    x5  = _mm256_permute2x128_si256(s10, s14, 0x20); \
+    x13 = _mm256_permute2x128_si256(s10, s14, 0x31); \
+    x6  = _mm256_permute2x128_si256( s3,  s7, 0x20); \
+    x14 = _mm256_permute2x128_si256( s3,  s7, 0x31); \
+    x7  = _mm256_permute2x128_si256(s11, s15, 0x20); \
+    x15 = _mm256_permute2x128_si256(s11, s15, 0x31); \
+} while(0)
+
+#define CHACHA_OUT(out,in,x0,x1, x2, x3, x4, x5, x6, x7,               \
+                          x8,x9,x10,x11,x12,x13,x14,x15)               \
+do                                                                     \
+{                                                                      \
+    TRANSPOSE(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15); \
+    STOREU(out +   0, XOR( x0, LOADU(in +   0)));                      \
+    STOREU(out +  32, XOR( x1, LOADU(in +  32)));                      \
+    STOREU(out +  64, XOR( x2, LOADU(in +  64)));                      \
+    STOREU(out +  96, XOR( x3, LOADU(in +  96)));                      \
+    STOREU(out + 128, XOR( x4, LOADU(in + 128)));                      \
+    STOREU(out + 160, XOR( x5, LOADU(in + 160)));                      \
+    STOREU(out + 192, XOR( x6, LOADU(in + 192)));                      \
+    STOREU(out + 224, XOR( x7, LOADU(in + 224)));                      \
+    STOREU(out + 256, XOR( x8, LOADU(in + 256)));                      \
+    STOREU(out + 288, XOR( x9, LOADU(in + 288)));                      \
+    STOREU(out + 320, XOR(x10, LOADU(in + 320)));                      \
+    STOREU(out + 352, XOR(x11, LOADU(in + 352)));                      \
+    STOREU(out + 384, XOR(x12, LOADU(in + 384)));                      \
+    STOREU(out + 416, XOR(x13, LOADU(in + 416)));                      \
+    STOREU(out + 448, XOR(x14, LOADU(in + 448)));                      \
+    STOREU(out + 480, XOR(x15, LOADU(in + 480)));                      \
+} while(0)
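The state is kept "word-sliced": before TRANSPOSE, register w holds state word
w of eight independent blocks, one per 32-bit lane. TRANSPOSE turns that 16x8
word matrix back into eight contiguous 64-byte blocks so CHACHA_OUT can XOR 512
bytes of keystream against the input with sixteen unaligned stores. A scalar
model of the layout change, with hypothetical names (not part of the patch):

    #include <stdint.h>

    /* state[w][b] is word w of block b (the word-sliced layout);
       blocks[b][w] is the serialized order CHACHA_OUT stores. */
    static void untranspose(uint32_t blocks[8][16], const uint32_t state[16][8])
    {
        int w, b;
        for (w = 0; w < 16; w++)
            for (b = 0; b < 8; b++)
                blocks[b][w] = state[w][b];
    }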
+
+
+int crypto_stream_xor(unsigned char *out, const unsigned char *in,
+    unsigned long long inlen, const unsigned char *n_, const unsigned char *k_)
+{
+    __m256i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
+    __m256i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
+    uint32_t k[8];
+    uint32_t n[2];
+
+    memcpy(k, k_, 32);
+    memcpy(n, n_, 8);
+
+    x0 = _mm256_set1_epi32(0x61707865);
+    x1 = _mm256_set1_epi32(0x3320646e);
+    x2 = _mm256_set1_epi32(0x79622d32);
+    x3 = _mm256_set1_epi32(0x6b206574);
+
+    x4  = _mm256_set1_epi32(k[0]);
+    x5  = _mm256_set1_epi32(k[1]);
+    x6  = _mm256_set1_epi32(k[2]);
+    x7  = _mm256_set1_epi32(k[3]);
+    x8  = _mm256_set1_epi32(k[4]);
+    x9  = _mm256_set1_epi32(k[5]);
+    x10 = _mm256_set1_epi32(k[6]);
+    x11 = _mm256_set1_epi32(k[7]);
+
+    x12 = _mm256_set_epi32(7,6,5,4,3,2,1,0);
+    x13 = _mm256_setzero_si256();
+    x14 = _mm256_set1_epi32(n[0]);
+    x15 = _mm256_set1_epi32(n[1]);
+
+    while(inlen >= 512)
+    {
+        CHACHA_CORE(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,\
+                    x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15);
+        CHACHA_OUT(out,in,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15);
+
+        x13 = SUB(x13, _mm256_cmpeq_epi32(x12, _mm256_set_epi32(-1,-2,-3,-4,-5,-6,-7,-8)));
+        x12 = ADD(x12, _mm256_set1_epi32(8));
+
+        out += 512; in += 512; inlen -= 512;
+    }
+
+    if(inlen > 0) /* Should fall back to a latency-oriented implementation here */
+    {
+        unsigned char blk[512];
+        memcpy(blk, in, inlen);
+        CHACHA_CORE(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,\
+                    x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15);
+        CHACHA_OUT(blk,blk,z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15);
+        memcpy(out, blk, inlen);
+    }
+
+    return 0;
+}
+
+int crypto_stream(unsigned char *out, unsigned long long outlen,
+    const unsigned char *n, const unsigned char *k)
+{
+    memset(out, 0, outlen);
+    return crypto_stream_xor(out, out, outlen, n, k);
+}
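Two details worth noting. First, the counter handling: x12 holds the low
counter words for blocks i..i+7 and x13 the high words. After each batch,
_mm256_cmpeq_epi32 yields all-ones (i.e. -1) in every lane exactly when the
low words are about to wrap past 2^32, and subtracting that mask increments
x13, propagating the carry. Second, XORing the keystream is its own inverse,
so the same call both encrypts and decrypts. A possible test driver, assuming
an AVX2-capable CPU and something like `cc -O2 -mavx2 chacha-avx2.c test.c`
(the file name test.c and the message contents are illustrative only):

    #include <stdio.h>
    #include <string.h>

    int crypto_stream_xor(unsigned char *out, const unsigned char *in,
        unsigned long long inlen, const unsigned char *n, const unsigned char *k);

    int main(void)
    {
        unsigned char k[32] = {0}, n[8] = {0};
        /* 1000 bytes exercises both the 512-byte loop and the tail path. */
        unsigned char msg[1000], ct[1000], pt[1000];
        memset(msg, 'A', sizeof msg);                 /* dummy plaintext */
        crypto_stream_xor(ct, msg, sizeof msg, n, k); /* encrypt */
        crypto_stream_xor(pt, ct, sizeof ct, n, k);   /* decrypt: same operation */
        printf("round trip %s\n", memcmp(msg, pt, sizeof msg) ? "FAILED" : "ok");
        return 0;
    }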