diff --git a/node/AES-aesni.c b/node/AES-aesni.c
new file mode 100644
index 00000000..2d808b97
--- /dev/null
+++ b/node/AES-aesni.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c)2019 ZeroTier, Inc.
+ *
+ * Use of this software is governed by the Business Source License included
+ * in the LICENSE.TXT file in the project's root directory.
+ *
+ * Change Date: 2023-01-01
+ *
+ * On the date above, in accordance with the Business Source License, use
+ * of this software will be governed by version 2.0 of the Apache License.
+ */
+/****/
+
+#if (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <wmmintrin.h>
+
+/* #define register */
+
+void zt_crypt_ctr_aesni(const __m128i key[14],const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out)
+{
+	_mm_prefetch(in,_MM_HINT_NTA);
+
+	/* Because our CTR supports full 128-bit nonces, we must do a full 128-bit (big-endian)
+	 * increment to be compatible with canonical NIST-certified CTR implementations. That's
+	 * because it's possible to have a lot of bit saturation in the least significant 64
+	 * bits, which could on rare occasions actually cause a 64-bit wrap. If this happened
+	 * without carry it would result in incompatibility and quietly dropped packets. The
+	 * probability is low, so this would be a one in billions packet loss bug that would
+	 * probably never be found.
+	 *
+	 * This crazy code does a branch-free 128-bit increment by adding a one or a zero to
+	 * the most significant 64 bits of the 128-bit vector based on whether the add we want
+	 * to do to the least significant 64 bits would overflow. This can be computed by
+	 * NOTing those bits and comparing with what we want to add, since NOT is the same
+	 * as subtracting from uint64_max. This generates branch-free ASM on x64 with most
+	 * good compilers.
+	 */
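+	/* swap128 byte-reverses the counter between its big-endian wire form and the
+	 * form used for the 64-bit adds; notctr0msq is the NOTed least significant 64
+	 * bits used for the carry test described above. ctr0..ctr3 hold the next four
+	 * counter values so four blocks can be pipelined per loop iteration. */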
+	register __m128i swap128 = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+	register __m128i ctr0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)iv),swap128);
+	register uint64_t notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
+	register __m128i ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 1ULL),1LL)),swap128);
+	register __m128i ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 2ULL),2LL)),swap128);
+	register __m128i ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 3ULL),3LL)),swap128);
+	ctr0 = _mm_shuffle_epi8(ctr0,swap128);
+
+	while (len >= 64) {
+		_mm_prefetch(in + 64,_MM_HINT_NTA);
+		register __m128i ka = key[0];
+		register __m128i c0 = _mm_xor_si128(ctr0,ka);
+		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
+		notctr0msq = ~((uint64_t)_mm_extract_epi64(ctr0,0));
+		register __m128i c1 = _mm_xor_si128(ctr1,ka);
+		register __m128i c2 = _mm_xor_si128(ctr2,ka);
+		register __m128i c3 = _mm_xor_si128(ctr3,ka);
+		register __m128i kb = key[1];
+		ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 5ULL),5LL)),swap128);
+		ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 6ULL),6LL)),swap128);
+		register __m128i kc = key[2];
+		ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 7ULL),7LL)),swap128);
+		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)(notctr0msq < 4ULL),4LL)),swap128);
+		register __m128i kd = key[3];
+#define ZT_AES_CTR_AESNI_ROUND(kk) \
+	c0 = _mm_aesenc_si128(c0,kk); \
+	c1 = _mm_aesenc_si128(c1,kk); \
+	c2 = _mm_aesenc_si128(c2,kk); \
+	c3 = _mm_aesenc_si128(c3,kk);
+		ka = key[4];
+		ZT_AES_CTR_AESNI_ROUND(kb);
+		kb = key[5];
+		ZT_AES_CTR_AESNI_ROUND(kc);
+		kc = key[6];
+		ZT_AES_CTR_AESNI_ROUND(kd);
+		kd = key[7];
+		ZT_AES_CTR_AESNI_ROUND(ka);
+		ka = key[8];
+		ZT_AES_CTR_AESNI_ROUND(kb);
+		kb = key[9];
+		ZT_AES_CTR_AESNI_ROUND(kc);
+		kc = key[10];
+		ZT_AES_CTR_AESNI_ROUND(kd);
+		kd = key[11];
+		ZT_AES_CTR_AESNI_ROUND(ka);
+		ka = key[12];
+		ZT_AES_CTR_AESNI_ROUND(kb);
+		kb = key[13];
+		ZT_AES_CTR_AESNI_ROUND(kc);
+		kc = key[14];
+		ZT_AES_CTR_AESNI_ROUND(kd);
+		ZT_AES_CTR_AESNI_ROUND(ka);
+		ZT_AES_CTR_AESNI_ROUND(kb);
+#undef ZT_AES_CTR_AESNI_ROUND
+		register __m128i d0 = _mm_loadu_si128((const __m128i *)in);
+		register __m128i d1 = _mm_loadu_si128((const __m128i *)(in + 16));
+		register __m128i d2 = _mm_loadu_si128((const __m128i *)(in + 32));
+		register __m128i d3 = _mm_loadu_si128((const __m128i *)(in + 48));
+		c0 = _mm_aesenclast_si128(c0,kc);
+		c1 = _mm_aesenclast_si128(c1,kc);
+		c2 = _mm_aesenclast_si128(c2,kc);
+		c3 = _mm_aesenclast_si128(c3,kc);
+		d0 = _mm_xor_si128(d0,c0);
+		d1 = _mm_xor_si128(d1,c1);
+		d2 = _mm_xor_si128(d2,c2);
+		d3 = _mm_xor_si128(d3,c3);
+		_mm_storeu_si128((__m128i *)out,d0);
+		_mm_storeu_si128((__m128i *)(out + 16),d1);
+		_mm_storeu_si128((__m128i *)(out + 32),d2);
+		_mm_storeu_si128((__m128i *)(out + 48),d3);
+		in += 64;
+		out += 64;
+		len -= 64;
+	}
+
+	register __m128i k0 = key[0];
+	register __m128i k1 = key[1];
+	register __m128i k2 = key[2];
+	register __m128i k3 = key[3];
+	register __m128i k4 = key[4];
+	register __m128i k5 = key[5];
+	register __m128i k6 = key[6];
+	register __m128i k7 = key[7];
+	/* not enough XMM registers for all of them, but it helps slightly... */
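+
+	/* Encrypt any remaining whole 16-byte blocks one at a time, using the same
+	 * branch-free 128-bit counter increment in single-block form. */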
+	while (len >= 16) {
+		register __m128i c0 = _mm_xor_si128(ctr0,k0);
+		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
+		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < 1ULL),1LL)),swap128);
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenc_si128(c0,k2);
+		c0 = _mm_aesenc_si128(c0,k3);
+		c0 = _mm_aesenc_si128(c0,k4);
+		c0 = _mm_aesenc_si128(c0,k5);
+		c0 = _mm_aesenc_si128(c0,k6);
+		register __m128i ka = key[8];
+		c0 = _mm_aesenc_si128(c0,k7);
+		register __m128i kb = key[9];
+		c0 = _mm_aesenc_si128(c0,ka);
+		ka = key[10];
+		c0 = _mm_aesenc_si128(c0,kb);
+		kb = key[11];
+		c0 = _mm_aesenc_si128(c0,ka);
+		ka = key[12];
+		c0 = _mm_aesenc_si128(c0,kb);
+		kb = key[13];
+		c0 = _mm_aesenc_si128(c0,ka);
+		ka = key[14];
+		c0 = _mm_aesenc_si128(c0,kb);
+		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,ka)));
+		in += 16;
+		out += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		register __m128i c0 = _mm_xor_si128(ctr0,k0);
+		k0 = key[8];
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenc_si128(c0,k2);
+		k1 = key[9];
+		c0 = _mm_aesenc_si128(c0,k3);
+		c0 = _mm_aesenc_si128(c0,k4);
+		k2 = key[10];
+		c0 = _mm_aesenc_si128(c0,k5);
+		c0 = _mm_aesenc_si128(c0,k6);
+		k3 = key[11];
+		c0 = _mm_aesenc_si128(c0,k7);
+		c0 = _mm_aesenc_si128(c0,k0);
+		k0 = key[12];
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenc_si128(c0,k2);
+		k1 = key[13];
+		c0 = _mm_aesenc_si128(c0,k3);
+		c0 = _mm_aesenc_si128(c0,k0);
+		k2 = key[14];
+		c0 = _mm_aesenc_si128(c0,k1);
+		c0 = _mm_aesenclast_si128(c0,k2);
+		uint8_t tmp[16];
+		_mm_storeu_si128((__m128i *)tmp,c0);
+		for(unsigned int i=0;i<len;++i)
+			out[i] = in[i] ^ tmp[i];
+	}
+}
+
+#endif
diff --git a/node/AES.cpp b/node/AES.cpp
--- a/node/AES.cpp
+++ b/node/AES.cpp
@@ ... @@
-void AES::_crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const
-{
@@ ... @@
-	while (len >= 128) {
-		__m128i c0 = _mm_xor_si128(ctr0,_k.ni.k[0]);
-		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-		__m128i c1 = _mm_xor_si128(ctr1,_k.ni.k[0]);
-		ctr1 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 1ULL)),(long long)(ctr + 1ULL))),swap128);
-		__m128i c2 = _mm_xor_si128(ctr2,_k.ni.k[0]);
-		ctr2 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 2ULL)),(long long)(ctr + 2ULL))),swap128);
-		__m128i c3 = _mm_xor_si128(ctr3,_k.ni.k[0]);
-		ctr3 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 3ULL)),(long long)(ctr + 3ULL))),swap128);
-		__m128i c4 = _mm_xor_si128(ctr4,_k.ni.k[0]);
-		ctr4 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 4ULL)),(long long)(ctr + 4ULL))),swap128);
-		__m128i c5 = _mm_xor_si128(ctr5,_k.ni.k[0]);
-		ctr5 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 5ULL)),(long long)(ctr + 5ULL))),swap128);
-		__m128i c6 = _mm_xor_si128(ctr6,_k.ni.k[0]);
-		ctr6 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 6ULL)),(long long)(ctr + 6ULL))),swap128);
-		__m128i c7 = _mm_xor_si128(ctr7,_k.ni.k[0]);
-		ctr7 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr + 7ULL)),(long long)(ctr + 7ULL))),swap128);
-		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr)),(long long)(ctr))),swap128);
-		ctr += 8;
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[1]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[2]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[3]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[4]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[5]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[6]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[7]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[8]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[9]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[10]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[11]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[12]);
-		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[13]);
-		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 64),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 64)),_mm_aesenclast_si128(c4,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 80),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 80)),_mm_aesenclast_si128(c5,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 96),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 96)),_mm_aesenclast_si128(c6,_k.ni.k[14])));
-		_mm_storeu_si128((__m128i *)(out + 112),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 112)),_mm_aesenclast_si128(c7,_k.ni.k[14])));
-		in += 128;
-		out += 128;
-		len -= 128;
-	}
-#undef ZT_AES_CTR_AESNI_ROUND
-
-	while (len >= 16) {
-		__m128i c0 = _mm_xor_si128(ctr0,_k.ni.k[0]);
-		ctr0 = _mm_shuffle_epi8(ctr0,swap128);
-		ctr0 = _mm_shuffle_epi8(_mm_add_epi64(ctr0,_mm_set_epi64x((long long)((~((uint64_t)_mm_extract_epi64(ctr0,0))) < (ctr)),(long long)(ctr))),swap128);
-		++ctr;
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[1]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[2]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[3]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[4]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[5]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[6]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[7]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[8]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[9]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[10]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[11]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[12]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[13]);
-		_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,_k.ni.k[14])));
-		in += 16;
-		out += 16;
-		len -= 16;
-	}
-
-	if (len) {
-		__m128i c0 = _mm_xor_si128(ctr0,_k.ni.k[0]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[1]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[2]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[3]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[4]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[5]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[6]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[7]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[8]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[9]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[10]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[11]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[12]);
-		c0 = _mm_aesenc_si128(c0,_k.ni.k[13]);
-		c0 = _mm_aesenclast_si128(c0,_k.ni.k[14]);
-		for(unsigned int i=0;i<len;++i)
-			out[i] = in[i] ^ ((const uint8_t *)&c0)[i];
-	}
-}
diff --git a/node/AES.hpp b/node/AES.hpp
--- a/node/AES.hpp
+++ b/node/AES.hpp
@@ ... @@
 #if (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))
 #include <wmmintrin.h>
+#include <immintrin.h>
 #include <emmintrin.h>
 #include <smmintrin.h>
 #include <tmmintrin.h>
+
 #define ZT_AES_AESNI 1
-#endif
+
+// AES-aesni.c
+extern "C" void zt_crypt_ctr_aesni(const __m128i key[14],const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out);
+
+#endif // x64
 
 #define ZT_AES_KEY_SIZE 32
 #define ZT_AES_BLOCK_SIZE 16
@@ -115,7 +121,7 @@ public:
 	{
 #ifdef ZT_AES_AESNI
 		if (likely(HW_ACCEL)) {
-			_crypt_ctr_aesni(iv,(const uint8_t *)in,len,(uint8_t *)out);
+			zt_crypt_ctr_aesni(_k.ni.k,iv,(const uint8_t *)in,len,(uint8_t *)out);
 			return;
 		}
 #endif
@@ -524,8 +530,6 @@ private:
 		_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
 	}
 
-	void _crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const;
-
 	static ZT_ALWAYS_INLINE __m128i _mult_block_aesni(__m128i shuf,__m128i h,__m128i y)
 	{
 		y = _mm_shuffle_epi8(y,shuf);
diff --git a/node/CMakeLists.txt b/node/CMakeLists.txt
index 0947fbdc..106f84f5 100644
--- a/node/CMakeLists.txt
+++ b/node/CMakeLists.txt
@@ -48,8 +48,10 @@ set(core_headers
 	Trace.hpp
 	Utils.hpp
 )
+
 set(core_src
 	AES.cpp
+	AES-aesni.c
 	C25519.cpp
 	Credential.cpp
 	ECC384.cpp
@@ -72,18 +74,20 @@ set(core_src
 	Trace.cpp
 	Utils.cpp
 )
+
 add_library(${PROJECT_NAME} STATIC ${core_src} ${core_headers})
 target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
 target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_BINARY_DIR})
 
-if(UNIX)
-	set_source_files_properties(
-		AES.cpp
-		ECC384.cpp
-		Salsa20.cpp
-		C25519.cpp
-		Poly1305.cpp
-		PROPERTIES
-		COMPILE_FLAGS "-Wall -O3"
-	)
-endif(UNIX)
+#if(UNIX)
+#	set_source_files_properties(
+#		AES.cpp
+#		AES-aesni.c
+#		ECC384.cpp
+#		Salsa20.cpp
+#		C25519.cpp
+#		Poly1305.cpp
+#		PROPERTIES
+#		COMPILE_FLAGS "-Wall -O3"
+#	)
+#endif(UNIX)
diff --git a/selftest.cpp b/selftest.cpp
index 22fb02f1..9149505e 100644
--- a/selftest.cpp
+++ b/selftest.cpp
@@ -214,12 +214,12 @@ static int testCrypto()
 		std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
 		std::cout << "  AES-256-CTR (benchmark): "; std::cout.flush();
 		start = OSUtils::now();
-		for(unsigned long i=0;i<500000;++i) {
+		for(unsigned long i=0;i<1000000;++i) {
 			tv.ctr((const uint8_t *)hexbuf,buf1,ZT_DEFAULT_MTU,buf1);
 			*dummy = buf1[0];
 		}
 		end = OSUtils::now();
-		std::cout << (((double)(500000 * ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
+		std::cout << (((1000000.0 * (double)ZT_DEFAULT_MTU) / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (dummy: " << (unsigned int)*dummy << ")" ZT_EOL_S;
 	}
 	{
 		std::cout << "  AES-256-GMAC-SIV (benchmark): "; std::cout.flush();