1
0
Fork 0
mirror of https://github.com/ossrs/srs.git synced 2025-03-09 15:49:59 +00:00

Upgrade openssl from 1.1.0e to 1.1.1b, with source code. 4.0.78

This commit is contained in:
winlin 2021-03-01 20:47:57 +08:00
parent 8f1c992379
commit 96dbd7bced
1476 changed files with 616554 additions and 4 deletions

View file

@ -0,0 +1,24 @@
/*
* Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/aes.h>
#include <openssl/modes.h>
void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const AES_KEY *key,
unsigned char *ivec, const int enc)
{
if (enc)
CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
(block128_f) AES_encrypt);
else
CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
(block128_f) AES_decrypt);
}

View file

@ -0,0 +1,43 @@
/*
* Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/aes.h>
#include <openssl/modes.h>
/*
* The input and output encrypted as though 128bit cfb mode is being used.
* The extra state information to record how much of the 128bit block we have
* used is contained in *num;
*/
void AES_cfb128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc)
{
CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
(block128_f) AES_encrypt);
}
/* N.B. This expects the input to be packed, MS bit first */
void AES_cfb1_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc)
{
CRYPTO_cfb128_1_encrypt(in, out, length, key, ivec, num, enc,
(block128_f) AES_encrypt);
}
void AES_cfb8_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num, const int enc)
{
CRYPTO_cfb128_8_encrypt(in, out, length, key, ivec, num, enc,
(block128_f) AES_encrypt);
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,26 @@
/*
* Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <assert.h>
#include <openssl/aes.h>
#include "aes_locl.h"
void AES_ecb_encrypt(const unsigned char *in, unsigned char *out,
const AES_KEY *key, const int enc)
{
assert(in && out && key);
assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
if (AES_ENCRYPT == enc)
AES_encrypt(in, out, key);
else
AES_decrypt(in, out, key);
}

View file

@ -0,0 +1,284 @@
/*
* Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include <openssl/aes.h>
#include "aes_locl.h"
#define N_WORDS (AES_BLOCK_SIZE / sizeof(unsigned long))
typedef struct {
unsigned long data[N_WORDS];
} aes_block_t;
/* XXX: probably some better way to do this */
#if defined(__i386__) || defined(__x86_64__)
# define UNALIGNED_MEMOPS_ARE_FAST 1
#else
# define UNALIGNED_MEMOPS_ARE_FAST 0
#endif
#if UNALIGNED_MEMOPS_ARE_FAST
# define load_block(d, s) (d) = *(const aes_block_t *)(s)
# define store_block(d, s) *(aes_block_t *)(d) = (s)
#else
# define load_block(d, s) memcpy((d).data, (s), AES_BLOCK_SIZE)
# define store_block(d, s) memcpy((d), (s).data, AES_BLOCK_SIZE)
#endif
/* N.B. The IV for this mode is _twice_ the block size */
void AES_ige_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, const int enc)
{
size_t n;
size_t len = length;
if (length == 0)
return;
OPENSSL_assert(in && out && key && ivec);
OPENSSL_assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
len = length / AES_BLOCK_SIZE;
if (AES_ENCRYPT == enc) {
if (in != out &&
(UNALIGNED_MEMOPS_ARE_FAST
|| ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(long) ==
0)) {
aes_block_t *ivp = (aes_block_t *) ivec;
aes_block_t *iv2p = (aes_block_t *) (ivec + AES_BLOCK_SIZE);
while (len) {
aes_block_t *inp = (aes_block_t *) in;
aes_block_t *outp = (aes_block_t *) out;
for (n = 0; n < N_WORDS; ++n)
outp->data[n] = inp->data[n] ^ ivp->data[n];
AES_encrypt((unsigned char *)outp->data,
(unsigned char *)outp->data, key);
for (n = 0; n < N_WORDS; ++n)
outp->data[n] ^= iv2p->data[n];
ivp = outp;
iv2p = inp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
} else {
aes_block_t tmp, tmp2;
aes_block_t iv;
aes_block_t iv2;
load_block(iv, ivec);
load_block(iv2, ivec + AES_BLOCK_SIZE);
while (len) {
load_block(tmp, in);
for (n = 0; n < N_WORDS; ++n)
tmp2.data[n] = tmp.data[n] ^ iv.data[n];
AES_encrypt((unsigned char *)tmp2.data,
(unsigned char *)tmp2.data, key);
for (n = 0; n < N_WORDS; ++n)
tmp2.data[n] ^= iv2.data[n];
store_block(out, tmp2);
iv = tmp2;
iv2 = tmp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, iv.data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
}
} else {
if (in != out &&
(UNALIGNED_MEMOPS_ARE_FAST
|| ((size_t)in | (size_t)out | (size_t)ivec) % sizeof(long) ==
0)) {
aes_block_t *ivp = (aes_block_t *) ivec;
aes_block_t *iv2p = (aes_block_t *) (ivec + AES_BLOCK_SIZE);
while (len) {
aes_block_t tmp;
aes_block_t *inp = (aes_block_t *) in;
aes_block_t *outp = (aes_block_t *) out;
for (n = 0; n < N_WORDS; ++n)
tmp.data[n] = inp->data[n] ^ iv2p->data[n];
AES_decrypt((unsigned char *)tmp.data,
(unsigned char *)outp->data, key);
for (n = 0; n < N_WORDS; ++n)
outp->data[n] ^= ivp->data[n];
ivp = inp;
iv2p = outp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, ivp->data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2p->data, AES_BLOCK_SIZE);
} else {
aes_block_t tmp, tmp2;
aes_block_t iv;
aes_block_t iv2;
load_block(iv, ivec);
load_block(iv2, ivec + AES_BLOCK_SIZE);
while (len) {
load_block(tmp, in);
tmp2 = tmp;
for (n = 0; n < N_WORDS; ++n)
tmp.data[n] ^= iv2.data[n];
AES_decrypt((unsigned char *)tmp.data,
(unsigned char *)tmp.data, key);
for (n = 0; n < N_WORDS; ++n)
tmp.data[n] ^= iv.data[n];
store_block(out, tmp);
iv = tmp2;
iv2 = tmp;
--len;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
memcpy(ivec, iv.data, AES_BLOCK_SIZE);
memcpy(ivec + AES_BLOCK_SIZE, iv2.data, AES_BLOCK_SIZE);
}
}
}
/*
* Note that its effectively impossible to do biIGE in anything other
* than a single pass, so no provision is made for chaining.
*/
/* N.B. The IV for this mode is _four times_ the block size */
void AES_bi_ige_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
const AES_KEY *key2, const unsigned char *ivec,
const int enc)
{
size_t n;
size_t len = length;
unsigned char tmp[AES_BLOCK_SIZE];
unsigned char tmp2[AES_BLOCK_SIZE];
unsigned char tmp3[AES_BLOCK_SIZE];
unsigned char prev[AES_BLOCK_SIZE];
const unsigned char *iv;
const unsigned char *iv2;
OPENSSL_assert(in && out && key && ivec);
OPENSSL_assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc));
OPENSSL_assert((length % AES_BLOCK_SIZE) == 0);
if (AES_ENCRYPT == enc) {
/*
* XXX: Do a separate case for when in != out (strictly should check
* for overlap, too)
*/
/* First the forward pass */
iv = ivec;
iv2 = ivec + AES_BLOCK_SIZE;
while (len >= AES_BLOCK_SIZE) {
for (n = 0; n < AES_BLOCK_SIZE; ++n)
out[n] = in[n] ^ iv[n];
AES_encrypt(out, out, key);
for (n = 0; n < AES_BLOCK_SIZE; ++n)
out[n] ^= iv2[n];
iv = out;
memcpy(prev, in, AES_BLOCK_SIZE);
iv2 = prev;
len -= AES_BLOCK_SIZE;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
/* And now backwards */
iv = ivec + AES_BLOCK_SIZE * 2;
iv2 = ivec + AES_BLOCK_SIZE * 3;
len = length;
while (len >= AES_BLOCK_SIZE) {
out -= AES_BLOCK_SIZE;
/*
* XXX: reduce copies by alternating between buffers
*/
memcpy(tmp, out, AES_BLOCK_SIZE);
for (n = 0; n < AES_BLOCK_SIZE; ++n)
out[n] ^= iv[n];
/*
* hexdump(stdout, "out ^ iv", out, AES_BLOCK_SIZE);
*/
AES_encrypt(out, out, key);
/*
* hexdump(stdout,"enc", out, AES_BLOCK_SIZE);
*/
/*
* hexdump(stdout,"iv2", iv2, AES_BLOCK_SIZE);
*/
for (n = 0; n < AES_BLOCK_SIZE; ++n)
out[n] ^= iv2[n];
/*
* hexdump(stdout,"out", out, AES_BLOCK_SIZE);
*/
iv = out;
memcpy(prev, tmp, AES_BLOCK_SIZE);
iv2 = prev;
len -= AES_BLOCK_SIZE;
}
} else {
/* First backwards */
iv = ivec + AES_BLOCK_SIZE * 2;
iv2 = ivec + AES_BLOCK_SIZE * 3;
in += length;
out += length;
while (len >= AES_BLOCK_SIZE) {
in -= AES_BLOCK_SIZE;
out -= AES_BLOCK_SIZE;
memcpy(tmp, in, AES_BLOCK_SIZE);
memcpy(tmp2, in, AES_BLOCK_SIZE);
for (n = 0; n < AES_BLOCK_SIZE; ++n)
tmp[n] ^= iv2[n];
AES_decrypt(tmp, out, key);
for (n = 0; n < AES_BLOCK_SIZE; ++n)
out[n] ^= iv[n];
memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
iv = tmp3;
iv2 = out;
len -= AES_BLOCK_SIZE;
}
/* And now forwards */
iv = ivec;
iv2 = ivec + AES_BLOCK_SIZE;
len = length;
while (len >= AES_BLOCK_SIZE) {
memcpy(tmp, out, AES_BLOCK_SIZE);
memcpy(tmp2, out, AES_BLOCK_SIZE);
for (n = 0; n < AES_BLOCK_SIZE; ++n)
tmp[n] ^= iv2[n];
AES_decrypt(tmp, out, key);
for (n = 0; n < AES_BLOCK_SIZE; ++n)
out[n] ^= iv[n];
memcpy(tmp3, tmp2, AES_BLOCK_SIZE);
iv = tmp3;
iv2 = out;
len -= AES_BLOCK_SIZE;
in += AES_BLOCK_SIZE;
out += AES_BLOCK_SIZE;
}
}
}

View file

@ -0,0 +1,42 @@
/*
* Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#ifndef HEADER_AES_LOCL_H
# define HEADER_AES_LOCL_H
# include <openssl/e_os2.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00)
# define GETU32(p) SWAP(*((u32 *)(p)))
# define PUTU32(ct, st) { *((u32 *)(ct)) = SWAP((st)); }
# else
# define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ ((u32)(pt)[2] << 8) ^ ((u32)(pt)[3]))
# define PUTU32(ct, st) { (ct)[0] = (u8)((st) >> 24); (ct)[1] = (u8)((st) >> 16); (ct)[2] = (u8)((st) >> 8); (ct)[3] = (u8)(st); }
# endif
# ifdef AES_LONG
typedef unsigned long u32;
# else
typedef unsigned int u32;
# endif
typedef unsigned short u16;
typedef unsigned char u8;
# define MAXKC (256/32)
# define MAXKB (256/8)
# define MAXNR 14
/* This controls loop-unrolling in aes_core.c */
# undef FULL_UNROLL
#endif /* !HEADER_AES_LOCL_H */

View file

@ -0,0 +1,21 @@
/*
* Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/opensslv.h>
#include <openssl/aes.h>
#include "aes_locl.h"
const char *AES_options(void)
{
#ifdef FULL_UNROLL
return "aes(full)";
#else
return "aes(partial)";
#endif
}

View file

@ -0,0 +1,19 @@
/*
* Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/aes.h>
#include <openssl/modes.h>
void AES_ofb128_encrypt(const unsigned char *in, unsigned char *out,
size_t length, const AES_KEY *key,
unsigned char *ivec, int *num)
{
CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
(block128_f) AES_encrypt);
}

View file

@ -0,0 +1,27 @@
/*
* Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include <openssl/aes.h>
#include <openssl/modes.h>
int AES_wrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen)
{
return CRYPTO_128_wrap(key, iv, out, in, inlen, (block128_f) AES_encrypt);
}
int AES_unwrap_key(AES_KEY *key, const unsigned char *iv,
unsigned char *out,
const unsigned char *in, unsigned int inlen)
{
return CRYPTO_128_unwrap(key, iv, out, in, inlen,
(block128_f) AES_decrypt);
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,929 @@
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license. October 2012.
# All rights reserved.
# ====================================================================
######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
# |0 |1 |2 |3 |4
# |01|01|01|
# |23|23|23|
# |01|01|...
# |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
# 128-bit key 192- 256-
# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
# (*) numbers after slash are for
# misaligned data.
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattaintable:
#
# 128-bit key 192- 256-
# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
# (*) numbers after slash are for
# misaligned data.
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.
# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
$output = pop;
open STDOUT,">$output";
$::evp=1; # if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.
######################################################################
# single-round subroutines
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.text
.globl aes_t4_encrypt
.align 32
aes_t4_encrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Lenc:
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_eround01 %f16, %f4, %f2, %f0
aes_eround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Lenc
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
aes_eround01_l %f16, %f4, %f2, %f0
aes_eround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_encrypt,#function
.size aes_t4_encrypt,.-aes_t4_encrypt
.globl aes_t4_decrypt
.align 32
aes_t4_decrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Ldec:
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_dround01 %f16, %f4, %f2, %f0
aes_dround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Ldec
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
aes_dround01_l %f16, %f4, %f2, %f0
aes_dround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_decrypt,#function
.size aes_t4_decrypt,.-aes_t4_decrypt
___
}
######################################################################
# key setup subroutines
#
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
$code.=<<___;
.globl aes_t4_set_encrypt_key
.align 32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
and $inp, 7, $tmp
alignaddr $inp, %g0, $inp
cmp $bits, 192
ldd [$inp + 0], %f0
bl,pt %icc,.L128
ldd [$inp + 8], %f2
be,pt %icc,.L192
ldd [$inp + 16], %f4
brz,pt $tmp, .L256aligned
ldd [$inp + 24], %f6
ldd [$inp + 32], %f8
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
faligndata %f6, %f8, %f6
.L256aligned:
___
for ($i=0; $i<6; $i++) {
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
aes_kexpand0 %f4, %f2, %f4
std %f6, [$out + `32*$i+24`]
aes_kexpand2 %f6, %f4, %f6
___
}
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
std %f6, [$out + `32*$i+24`]
std %f0, [$out + `32*$i+32`]
std %f2, [$out + `32*$i+40`]
mov 14, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L192:
brz,pt $tmp, .L192aligned
nop
ldd [$inp + 24], %f6
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
.L192aligned:
___
for ($i=0; $i<7; $i++) {
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
aes_kexpand2 %f4, %f2, %f4
___
}
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
std %f0, [$out + `24*$i+24`]
std %f2, [$out + `24*$i+32`]
mov 12, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L128:
brz,pt $tmp, .L128aligned
nop
ldd [$inp + 16], %f4
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
.L128aligned:
___
for ($i=0; $i<10; $i++) {
$code.=<<___;
std %f0, [$out + `16*$i+0`]
aes_kexpand1 %f0, %f2, $i, %f0
std %f2, [$out + `16*$i+8`]
aes_kexpand2 %f2, %f0, %f2
___
}
$code.=<<___;
std %f0, [$out + `16*$i+0`]
std %f2, [$out + `16*$i+8`]
mov 10, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.type aes_t4_set_encrypt_key,#function
.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
.globl aes_t4_set_decrypt_key
.align 32
aes_t4_set_decrypt_key:
mov %o7, %o5
call .Lset_encrypt_key
nop
mov %o5, %o7
sll $tmp, 4, $inp ! $tmp is number of rounds
add $tmp, 2, $tmp
add $out, $inp, $inp ! $inp=$out+16*rounds
srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
.Lkey_flip:
ldd [$out + 0], %f0
ldd [$out + 8], %f2
ldd [$out + 16], %f4
ldd [$out + 24], %f6
ldd [$inp + 0], %f8
ldd [$inp + 8], %f10
ldd [$inp - 16], %f12
ldd [$inp - 8], %f14
sub $tmp, 1, $tmp
std %f0, [$inp + 0]
std %f2, [$inp + 8]
std %f4, [$inp - 16]
std %f6, [$inp - 8]
std %f8, [$out + 0]
std %f10, [$out + 8]
std %f12, [$out + 16]
std %f14, [$out + 24]
add $out, 32, $out
brnz $tmp, .Lkey_flip
sub $inp, 32, $inp
retl
xor %o0, %o0, %o0
.type aes_t4_set_decrypt_key,#function
.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
$code.=<<___;
.align 32
_aes128_encrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f4
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01_l %f52, %f4, %f2, %f0
retl
aes_eround23_l %f54, %f4, %f2, %f2
.type _aes128_encrypt_1x,#function
.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
.align 32
_aes128_encrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f8
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01 %f48, %f4, %f6, %f10
aes_eround23 %f50, %f4, %f6, %f6
aes_eround01_l %f52, %f8, %f2, %f0
aes_eround23_l %f54, %f8, %f2, %f2
aes_eround01_l %f52, %f10, %f6, %f4
retl
aes_eround23_l %f54, %f10, %f6, %f6
.type _aes128_encrypt_2x,#function
.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
.align 32
_aes128_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<22;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes128_loadkey,#function
.size _aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
___
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
&alg_ctr32_implement("aes",128);
&alg_xts_implement("aes",128,"en");
&alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);
$code.=<<___;
.align 32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f4
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01_l %f52, %f4, %f2, %f0
retl
aes_dround23_l %f54, %f4, %f2, %f2
.type _aes128_decrypt_1x,#function
.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
.align 32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f8
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01 %f48, %f4, %f6, %f10
aes_dround23 %f50, %f4, %f6, %f6
aes_dround01_l %f52, %f8, %f2, %f0
aes_dround23_l %f54, %f8, %f2, %f2
aes_dround01_l %f52, %f10, %f6, %f4
retl
aes_dround23_l %f54, %f10, %f6, %f6
.type _aes128_decrypt_2x,#function
.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
___
$code.=<<___;
.align 32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f4
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01_l %f60, %f4, %f2, %f0
retl
aes_eround23_l %f62, %f4, %f2, %f2
.type _aes192_encrypt_1x,#function
.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
.align 32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f8
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01 %f56, %f4, %f6, %f10
aes_eround23 %f58, %f4, %f6, %f6
aes_eround01_l %f60, %f8, %f2, %f0
aes_eround23_l %f62, %f8, %f2, %f2
aes_eround01_l %f60, %f10, %f6, %f4
retl
aes_eround23_l %f62, %f10, %f6, %f6
.type _aes192_encrypt_2x,#function
.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
.align 32
_aes256_encrypt_1x:
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f4, %f2, %f0
aes_eround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f4, %f2, %f0
aes_eround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_1x,#function
.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
.align 32
_aes256_encrypt_2x:
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f8, %f2, %f0
aes_eround23 %f22, %f8, %f2, %f2
aes_eround01 %f20, %f10, %f6, %f4
aes_eround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f8, %f2, %f0
aes_eround23_l %f22, %f8, %f2, %f2
aes_eround01_l %f20, %f10, %f6, %f4
aes_eround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_2x,#function
.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
.align 32
_aes192_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<26;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes192_loadkey,#function
.size _aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
&alg_ctr32_implement("aes",256);
&alg_xts_implement("aes",256,"en");
&alg_xts_implement("aes",256,"de");
&alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
$code.=<<___;
.align 32
_aes256_decrypt_1x:
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f4, %f2, %f0
aes_dround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f4, %f2, %f0
aes_dround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_1x,#function
.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
.align 32
_aes256_decrypt_2x:
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f8, %f2, %f0
aes_dround23 %f22, %f8, %f2, %f2
aes_dround01 %f20, %f10, %f6, %f4
aes_dround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f8, %f2, %f0
aes_dround23_l %f22, %f8, %f2, %f2
aes_dround01_l %f20, %f10, %f6, %f4
aes_dround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_2x,#function
.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
.align 32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f4
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01_l %f60, %f4, %f2, %f0
retl
aes_dround23_l %f62, %f4, %f2, %f2
.type _aes192_decrypt_1x,#function
.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
.align 32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f8
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01 %f56, %f4, %f6, %f10
aes_dround23 %f58, %f4, %f6, %f6
aes_dround01_l %f60, %f8, %f2, %f0
aes_dround23_l %f62, %f8, %f2, %f2
aes_dround01_l %f60, %f10, %f6, %f4
retl
aes_dround23_l %f62, %f10, %f6, %f6
.type _aes192_decrypt_2x,#function
.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
if (!$::evp) {
$code.=<<___;
.global AES_encrypt
AES_encrypt=aes_t4_encrypt
.global AES_decrypt
AES_decrypt=aes_t4_decrypt
.global AES_set_encrypt_key
.align 32
AES_set_encrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_encrypt_key
nop
1: retl
nop
.type AES_set_encrypt_key,#function
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.global AES_set_decrypt_key
.align 32
AES_set_decrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_decrypt_key
nop
1: retl
nop
.type AES_set_decrypt_key,#function
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
$code.=<<___;
.globl AES_cbc_encrypt
.align 32
AES_cbc_encrypt:
ld [$key + 240], %g1
nop
brz $enc, .Lcbc_decrypt
cmp %g1, 12
bl,pt %icc, aes128_t4_cbc_encrypt
nop
be,pn %icc, aes192_t4_cbc_encrypt
nop
ba aes256_t4_cbc_encrypt
nop
.Lcbc_decrypt:
bl,pt %icc, aes128_t4_cbc_decrypt
nop
be,pn %icc, aes192_t4_cbc_decrypt
nop
ba aes256_t4_cbc_decrypt
nop
.type AES_cbc_encrypt,#function
.size AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
$code.=<<___;
.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___
&emit_assembler();
close STDOUT;

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,916 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
# aes-586.pl vpaes-x86.pl
#
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
# Nehalem 27.9/40.4/18.1 10.2/11.9
# Atom 70.7/92.1/60.1 61.1/75.4(***)
# Silvermont 45.4/62.9/24.1 49.2/61.1(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
# majority of contemporary cores share cache, slower code path
# is common place. In other words "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's respectable +28%/64% improvement on Core 2
# and +15% on Atom (as implied, over "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
## Decryption stuff
## Key schedule constants
##
$k_dksd=0x1a0; # decryption key schedule: invskew x*D
&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0; # decryption key schedule: invskew x*B
&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200; # decryption key schedule: invskew x*9
&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
##
## Decryption stuff
## Round function constants
##
$k_dipt=0x220; # decryption input transform
&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0; # decryption sbox final output
&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6")
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&pand ("xmm0","xmm6");
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pxor ("xmm2","xmm5");
&psrld ("xmm1",4);
&add ($key,16);
&pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
&pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
&pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&sub ($round,1); # nr--
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
##
## Decryption core
##
## Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
&lea ($base,&DWP($k_dsbd,$const));
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
&pandn ("xmm1","xmm0");
&mov ($magic,$round);
&psrld ("xmm1",4)
&movdqu ("xmm5",&QWP(0,$key));
&shl ($magic,4);
&pand ("xmm0","xmm6");
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
&xor ($magic,0x30);
&pshufb ("xmm0","xmm1");
&and ($magic,0x30);
&pxor ("xmm2","xmm5");
&movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
&pxor ("xmm0","xmm2");
&add ($key,16);
&lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
&jmp (&label("dec_entry"));
&set_label("dec_loop",16);
##
## Inverse mix columns
##
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
&movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm4","xmm2"); # 4 = sb9u
&pshufb ("xmm1","xmm3"); # 0 = sb9t
&pxor ("xmm0","xmm4");
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt
&pshufb ("xmm4","xmm2"); # 4 = sbdu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbdt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
&pshufb ("xmm4","xmm2"); # 4 = sbbu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbbt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet
&pshufb ("xmm4","xmm2"); # 4 = sbeu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbet
&pxor ("xmm0","xmm4"); # 4 = ch
&add ($key,16); # next round key
&palignr("xmm5","xmm5",12);
&pxor ("xmm0","xmm1"); # 0 = ch
&sub ($round,1); # nr--
&set_label("dec_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&pand ("xmm0","xmm6"); # 0 = k
&psrld ("xmm1",4); # 1 = i
&pshufb ("xmm2","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm0",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("dec_loop"));
# middle of last round
&movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm0"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
&movdqa ("xmm2",&QWP(0,$magic));
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_decrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
&je (&label("schedule_192"));
# 128: fall though
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
&mov ($round,4);
&set_label("loop_schedule_192");
&call ("_vpaes_schedule_round");
&palignr("xmm0","xmm6",8);
&call ("_vpaes_schedule_mangle"); # save key n
&call ("_vpaes_schedule_192_smear");
&call ("_vpaes_schedule_mangle"); # save key n+1
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # save key n+2
&call ("_vpaes_schedule_192_smear");
&jmp (&label("loop_schedule_192"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
&function_begin_B("_vpaes_schedule_192_smear");
&pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
&pxor ("xmm6","xmm1"); # -> c+d c 0 0
&pxor ("xmm1","xmm1");
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
&movdqa ("xmm0","xmm6");
&movhlps("xmm6","xmm1"); # clobber low side with zeros
&ret ();
&function_end_B("_vpaes_schedule_192_smear");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
&function_begin("${PREFIX}_set_encrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");
&function_begin("${PREFIX}_set_decrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&shl ($base,4);
&lea ($key,&DWP(16,$key,$base));
&mov ($out,1);
&mov ($magic,$round);
&shr ($magic,1);
&and ($magic,32);
&xor ($magic,32); # nbist==192?0:32;
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");
&function_begin("${PREFIX}_encrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");
&function_begin("${PREFIX}_decrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_decrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");
&function_begin("${PREFIX}_cbc_encrypt");
&mov ($inp,&wparam(0)); # inp
&mov ($out,&wparam(1)); # out
&mov ($round,&wparam(2)); # len
&mov ($key,&wparam(3)); # key
&sub ($round,16);
&jc (&label("cbc_abort"));
&lea ($base,&DWP(-56,"esp"));
&mov ($const,&wparam(4)); # ivp
&and ($base,-16);
&mov ($magic,&wparam(5)); # enc
&xchg ($base,"esp"); # alloca
&movdqu ("xmm1",&QWP(0,$const)); # load IV
&sub ($out,$inp);
&mov (&DWP(48,"esp"),$base);
&mov (&DWP(0,"esp"),$out); # save out
&mov (&DWP(4,"esp"),$key) # save key
&mov (&DWP(8,"esp"),$const); # save ivp
&mov ($out,$round); # $out works as $len
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&cmp ($magic,0);
&je (&label("cbc_dec_loop"));
&jmp (&label("cbc_enc_loop"));
&set_label("cbc_enc_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&pxor ("xmm0","xmm1"); # inp^=iv
&call ("_vpaes_encrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&movdqa ("xmm1","xmm0");
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_enc_loop"));
&jmp (&label("cbc_done"));
&set_label("cbc_dec_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&movdqa (&QWP(16,"esp"),"xmm1"); # save IV
&movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
&call ("_vpaes_decrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&pxor ("xmm0",&QWP(16,"esp")); # out^=iv
&movdqa ("xmm1",&QWP(32,"esp")); # load next IV
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_dec_loop"));
&set_label("cbc_done");
&mov ($base,&DWP(8,"esp")); # restore ivp
&mov ("esp",&DWP(48,"esp"));
&movdqu (&QWP(0,$base),"xmm1"); # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
&asm_finish();
close STDOUT;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,64 @@
LIBS=../../libcrypto
SOURCE[../../libcrypto]=\
aes_misc.c aes_ecb.c aes_cfb.c aes_ofb.c \
aes_ige.c aes_wrap.c {- $target{aes_asm_src} -}
GENERATE[aes-ia64.s]=asm/aes-ia64.S
GENERATE[aes-586.s]=asm/aes-586.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[aes-586.s]=../perlasm/x86asm.pl
GENERATE[vpaes-x86.s]=asm/vpaes-x86.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[vpaes-586.s]=../perlasm/x86asm.pl
GENERATE[aesni-x86.s]=asm/aesni-x86.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[aesni-586.s]=../perlasm/x86asm.pl
GENERATE[aes-x86_64.s]=asm/aes-x86_64.pl $(PERLASM_SCHEME)
GENERATE[vpaes-x86_64.s]=asm/vpaes-x86_64.pl $(PERLASM_SCHEME)
GENERATE[bsaes-x86_64.s]=asm/bsaes-x86_64.pl $(PERLASM_SCHEME)
GENERATE[aesni-x86_64.s]=asm/aesni-x86_64.pl $(PERLASM_SCHEME)
GENERATE[aesni-sha1-x86_64.s]=asm/aesni-sha1-x86_64.pl $(PERLASM_SCHEME)
GENERATE[aesni-sha256-x86_64.s]=asm/aesni-sha256-x86_64.pl $(PERLASM_SCHEME)
GENERATE[aesni-mb-x86_64.s]=asm/aesni-mb-x86_64.pl $(PERLASM_SCHEME)
GENERATE[aes-sparcv9.S]=asm/aes-sparcv9.pl $(PERLASM_SCHEME)
INCLUDE[aes-sparcv9.o]=..
GENERATE[aest4-sparcv9.S]=asm/aest4-sparcv9.pl $(PERLASM_SCHEME)
INCLUDE[aest4-sparcv9.o]=..
DEPEND[aest4-sparcv9.S]=../perlasm/sparcv9_modes.pl
GENERATE[aesfx-sparcv9.S]=asm/aesfx-sparcv9.pl $(PERLASM_SCHEME)
INCLUDE[aesfx-sparcv9.o]=..
GENERATE[aes-ppc.s]=asm/aes-ppc.pl $(PERLASM_SCHEME)
GENERATE[vpaes-ppc.s]=asm/vpaes-ppc.pl $(PERLASM_SCHEME)
GENERATE[aesp8-ppc.s]=asm/aesp8-ppc.pl $(PERLASM_SCHEME)
GENERATE[aes-parisc.s]=asm/aes-parisc.pl $(PERLASM_SCHEME)
GENERATE[aes-mips.S]=asm/aes-mips.pl $(PERLASM_SCHEME)
INCLUDE[aes-mips.o]=..
GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl $(PERLASM_SCHEME)
INCLUDE[aesv8-armx.o]=..
GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl $(PERLASM_SCHEME)
GENERATE[aes-armv4.S]=asm/aes-armv4.pl $(PERLASM_SCHEME)
INCLUDE[aes-armv4.o]=..
GENERATE[bsaes-armv7.S]=asm/bsaes-armv7.pl $(PERLASM_SCHEME)
INCLUDE[bsaes-armv7.o]=..
GENERATE[aes-s390x.S]=asm/aes-s390x.pl $(PERLASM_SCHEME)
INCLUDE[aes-s390x.o]=..
BEGINRAW[Makefile]
##### AES assembler implementations
# GNU make "catch all"
{- $builddir -}/aes-%.S: {- $sourcedir -}/asm/aes-%.pl
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
{- $builddir -}/bsaes-%.S: {- $sourcedir -}/asm/bsaes-%.pl
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
ENDRAW[Makefile]