Cleanup, optimization

This commit is contained in:
Adam Ierymenko 2019-09-12 14:32:37 -07:00
parent 5c06d40358
commit 2d1eeda188
No known key found for this signature in database
GPG key ID: C8877CF2D7A5D7F3
4 changed files with 153 additions and 97 deletions

View file

@ -524,100 +524,7 @@ private:
_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
}
/**
 * Encrypt or decrypt a buffer in CTR mode using AES-NI
 *
 * Each 16-byte counter block is iv[0..7] (used verbatim) followed by a 64-bit
 * big-endian counter. The counter starts at the value stored in iv[8..15] and
 * is incremented by one per 16-byte block; it wraps modulo 2^64 and never
 * carries into iv[0..7]. Every counter block is run through round keys
 * _k.ni.k[0] .. _k.ni.k[14] and the result is XORed with the input, so the
 * same call is used for both encryption and decryption.
 *
 * Input is consumed 128 bytes (eight blocks) at a time while possible, then
 * 16 bytes at a time, then a final partial block of fewer than 16 bytes.
 *
 * @param iv 16-byte initial counter block (not modified)
 * @param in Input bytes
 * @param len Number of bytes to read from 'in' and write to 'out'
 * @param out Output buffer; receives exactly 'len' bytes
 */
ZT_ALWAYS_INLINE void _crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const
{
	// Low 64 bits of every counter block: the first 8 IV bytes, never byte-swapped.
	// NOTE(review): these two loads go through a uint64_t pointer and so assume the
	// target tolerates unaligned 64-bit reads (true on x86, the only place AES-NI exists).
	const long long iv0 = (long long)(*((const uint64_t *)iv));

	// Block counter in host byte order so it can be incremented with ordinary arithmetic.
	uint64_t ctr = Utils::ntoh(*((const uint64_t *)(iv+8)));

	// Counter block number (ctr + n), already XORed with round key 0. _mm_set_epi64x() takes
	// plain 64-bit integers, so no MMX (__m64) values or compiler-specific vector casts are
	// involved; the first argument is the high half, the second the low half of the register.
#define ZT_AES_CTR_AESNI_CB(n) \
	_mm_xor_si128(_mm_set_epi64x((long long)Utils::hton((uint64_t)(ctr+(n))),iv0),_k.ni.k[0])

	// One AES round applied to eight independent blocks so the AESENC latencies overlap.
#define ZT_AES_CTR_AESNI_ROUND(k) \
	c0 = _mm_aesenc_si128(c0,k); \
	c1 = _mm_aesenc_si128(c1,k); \
	c2 = _mm_aesenc_si128(c2,k); \
	c3 = _mm_aesenc_si128(c3,k); \
	c4 = _mm_aesenc_si128(c4,k); \
	c5 = _mm_aesenc_si128(c5,k); \
	c6 = _mm_aesenc_si128(c6,k); \
	c7 = _mm_aesenc_si128(c7,k)

	// Final round for block 'c', XOR with 16 input bytes at 'off', store 16 output bytes at 'off'.
#define ZT_AES_CTR_AESNI_XOR_OUT(off,c) \
	_mm_storeu_si128((__m128i *)(out + (off)),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + (off))),_mm_aesenclast_si128(c,_k.ni.k[14])))

	// Rounds 1..13 for a single block, kept explicitly unrolled.
#define ZT_AES_CTR_AESNI_ROUNDS_1_TO_13(c) \
	c = _mm_aesenc_si128(c,_k.ni.k[1]); \
	c = _mm_aesenc_si128(c,_k.ni.k[2]); \
	c = _mm_aesenc_si128(c,_k.ni.k[3]); \
	c = _mm_aesenc_si128(c,_k.ni.k[4]); \
	c = _mm_aesenc_si128(c,_k.ni.k[5]); \
	c = _mm_aesenc_si128(c,_k.ni.k[6]); \
	c = _mm_aesenc_si128(c,_k.ni.k[7]); \
	c = _mm_aesenc_si128(c,_k.ni.k[8]); \
	c = _mm_aesenc_si128(c,_k.ni.k[9]); \
	c = _mm_aesenc_si128(c,_k.ni.k[10]); \
	c = _mm_aesenc_si128(c,_k.ni.k[11]); \
	c = _mm_aesenc_si128(c,_k.ni.k[12]); \
	c = _mm_aesenc_si128(c,_k.ni.k[13])

	// Bulk path: eight blocks (128 bytes) per iteration.
	while (len >= 128) {
		__m128i c0 = ZT_AES_CTR_AESNI_CB(0ULL);
		__m128i c1 = ZT_AES_CTR_AESNI_CB(1ULL);
		__m128i c2 = ZT_AES_CTR_AESNI_CB(2ULL);
		__m128i c3 = ZT_AES_CTR_AESNI_CB(3ULL);
		__m128i c4 = ZT_AES_CTR_AESNI_CB(4ULL);
		__m128i c5 = ZT_AES_CTR_AESNI_CB(5ULL);
		__m128i c6 = ZT_AES_CTR_AESNI_CB(6ULL);
		__m128i c7 = ZT_AES_CTR_AESNI_CB(7ULL);
		ctr += 8;

		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[1]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[2]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[3]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[4]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[5]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[6]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[7]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[8]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[9]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[10]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[11]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[12]);
		ZT_AES_CTR_AESNI_ROUND(_k.ni.k[13]);

		ZT_AES_CTR_AESNI_XOR_OUT(0,c0);
		ZT_AES_CTR_AESNI_XOR_OUT(16,c1);
		ZT_AES_CTR_AESNI_XOR_OUT(32,c2);
		ZT_AES_CTR_AESNI_XOR_OUT(48,c3);
		ZT_AES_CTR_AESNI_XOR_OUT(64,c4);
		ZT_AES_CTR_AESNI_XOR_OUT(80,c5);
		ZT_AES_CTR_AESNI_XOR_OUT(96,c6);
		ZT_AES_CTR_AESNI_XOR_OUT(112,c7);

		in += 128;
		out += 128;
		len -= 128;
	}

	// Remaining whole blocks, one at a time.
	while (len >= 16) {
		__m128i c0 = ZT_AES_CTR_AESNI_CB(0ULL);
		++ctr;
		ZT_AES_CTR_AESNI_ROUNDS_1_TO_13(c0);
		ZT_AES_CTR_AESNI_XOR_OUT(0,c0);
		in += 16;
		out += 16;
		len -= 16;
	}

	// Trailing partial block: generate one more key stream block and use only 'len' bytes of it.
	// The counter is not advanced here because nothing reads it afterwards.
	if (len) {
		__m128i c0 = ZT_AES_CTR_AESNI_CB(0ULL);
		ZT_AES_CTR_AESNI_ROUNDS_1_TO_13(c0);
		c0 = _mm_aesenclast_si128(c0,_k.ni.k[14]);

		uint8_t keyStream[16];
		_mm_storeu_si128((__m128i *)keyStream,c0);
		for(unsigned int i=0;i<len;++i)
			out[i] = in[i] ^ keyStream[i];
	}

#undef ZT_AES_CTR_AESNI_ROUNDS_1_TO_13
#undef ZT_AES_CTR_AESNI_XOR_OUT
#undef ZT_AES_CTR_AESNI_ROUND
#undef ZT_AES_CTR_AESNI_CB
}
// AES-NI CTR-mode routine (prototype only; no body is attached to this declaration).
// iv: 16-byte initial counter block; in: input bytes; len: number of bytes to process;
// out: output buffer. NOTE(review): presumably 'out' must hold at least 'len' bytes and
// the same call serves for encryption and decryption -- confirm against the definition.
void _crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const;
static ZT_ALWAYS_INLINE __m128i _mult_block_aesni(__m128i shuf,__m128i h,__m128i y)
{