mirror of https://github.com/ossrs/srs.git

Upgrade openssl from 1.1.0e to 1.1.1b, with source code. 4.0.78

winlin 2021-03-01 20:47:57 +08:00
parent 8f1c992379
commit 96dbd7bced
1476 changed files with 616554 additions and 4 deletions

File diff suppressed because it is too large

@@ -0,0 +1,880 @@
#!/usr/bin/env perl
# Copyright 2017-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt a SIMD/NEON implementation for the following reason:
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on the
# same processor. Even though it takes more scalar xor's and andn's,
# that is compensated by the availability of rotate. Not to forget that
# most processors achieve a higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
# variant with a register permutation/rotation twist that allows copies
# to temporary registers to be eliminated. If you look closely you'll
# notice that it uses only one lane of the vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?], but the lowest-level core procedure is prepared for
# it. The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide a performance improvement [in serial
# hash] as long as the vector instruction issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
# r=1088(*)
#
# Cortex-A53 13
# Cortex-A57 12
# X-Gene 14
# Mongoose 10
# Kryo 12
# Denver 7.8
# Apple A7 7.2
#
# (*) Corresponds to SHA3-256. No improvement coefficients are listed
# because they vary too much from compiler to compiler. A newer
# compiler does much better, with the improvement varying from 5% on
# Cortex-A57 to 25% on Cortex-A53, while in comparison to an older
# compiler this code is at least 2x faster...
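#
# For orientation, a rough C sketch of the scalar pattern the code below
# follows (names are illustrative only and not part of this module):
# each Chi term A[x] ^ (~A[x+1] & A[x+2]) maps to one "bic" plus one
# "eor", and each Rho rotation folds into a single "ror":
#
#	static inline uint64_t ROL64(uint64_t v, unsigned r)
#	{ return r ? (v << r) | (v >> (64 - r)) : v; }
#
#	uint64_t chi(uint64_t a, uint64_t b, uint64_t c)
#	{ return a ^ (~b & c); }	/* one bic + one eor on AArch64 */
#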
$flavour = shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
my @rhotates = ([ 0, 1, 62, 28, 27 ],
[ 36, 44, 6, 55, 20 ],
[ 3, 10, 43, 25, 39 ],
[ 41, 45, 15, 21, 8 ],
[ 18, 2, 61, 56, 14 ]);
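# Note that a left rotation by $rhotates[y][x] is emitted below as
# "ror #(64-$rhotates[y][x])", since ROL64(v,r) == ROR64(v,64-r) for
# 0 < r < 64; the ARMv8.2 "xar" (xor-and-rotate-right) instruction in
# the second half of this module takes the same 64-r amounts.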
$code.=<<___;
.text
.align 8 // strategic alignment and padding that allows the
// address value to be used as the loop termination condition...
.quad 0,0,0,0,0,0,0,0
.type iotas,%object
iotas:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas,.-iotas
___
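# The ".align 8" (i.e. 256-byte alignment) and the 64 bytes of zero
# padding above are what make the "tst $C[1],#255" test inside
# KeccakF1600_int work: iotas starts 64 bytes into a 256-byte-aligned
# block, so after the 24th 8-byte constant has been fetched the pointer
# sits on the next 256-byte boundary and its low 8 bits are zero. A
# rough C equivalent of that loop control (illustration only;
# keccak_round is a hypothetical helper, not part of this module):
#
#	const uint64_t *p = iotas;
#	do { keccak_round(A, *p++); } while ((uintptr_t)p & 255);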
{{{
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
(0, 5, 10, 15, 20));
$A[3][3] = "x25"; # x18 is reserved
my @C = map("x$_", (26,27,28,30));
$code.=<<___;
.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
adr $C[2],iotas
.inst 0xd503233f // paciasp
stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
b .Loop
.align 4
.Loop:
////////////////////////////////////////// Theta
eor $C[0],$A[0][0],$A[1][0]
stp $A[0][4],$A[1][4],[sp,#0] // offload pair...
eor $C[1],$A[0][1],$A[1][1]
eor $C[2],$A[0][2],$A[1][2]
eor $C[3],$A[0][3],$A[1][3]
___
$C[4]=$A[0][4];
$C[5]=$A[1][4];
$code.=<<___;
eor $C[4],$A[0][4],$A[1][4]
eor $C[0],$C[0],$A[2][0]
eor $C[1],$C[1],$A[2][1]
eor $C[2],$C[2],$A[2][2]
eor $C[3],$C[3],$A[2][3]
eor $C[4],$C[4],$A[2][4]
eor $C[0],$C[0],$A[3][0]
eor $C[1],$C[1],$A[3][1]
eor $C[2],$C[2],$A[3][2]
eor $C[3],$C[3],$A[3][3]
eor $C[4],$C[4],$A[3][4]
eor $C[0],$C[0],$A[4][0]
eor $C[2],$C[2],$A[4][2]
eor $C[1],$C[1],$A[4][1]
eor $C[3],$C[3],$A[4][3]
eor $C[4],$C[4],$A[4][4]
eor $C[5],$C[0],$C[2],ror#63
eor $A[0][1],$A[0][1],$C[5]
eor $A[1][1],$A[1][1],$C[5]
eor $A[2][1],$A[2][1],$C[5]
eor $A[3][1],$A[3][1],$C[5]
eor $A[4][1],$A[4][1],$C[5]
eor $C[5],$C[1],$C[3],ror#63
eor $C[2],$C[2],$C[4],ror#63
eor $C[3],$C[3],$C[0],ror#63
eor $C[4],$C[4],$C[1],ror#63
eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2]
eor $A[1][2],$A[1][2],$C[5]
eor $A[2][2],$A[2][2],$C[5]
eor $A[3][2],$A[3][2],$C[5]
eor $A[4][2],$A[4][2],$C[5]
eor $A[0][0],$A[0][0],$C[4]
eor $A[1][0],$A[1][0],$C[4]
eor $A[2][0],$A[2][0],$C[4]
eor $A[3][0],$A[3][0],$C[4]
eor $A[4][0],$A[4][0],$C[4]
___
$C[4]=undef;
$C[5]=undef;
$code.=<<___;
ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data
eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3]
eor $A[1][3],$A[1][3],$C[2]
eor $A[2][3],$A[2][3],$C[2]
eor $A[3][3],$A[3][3],$C[2]
eor $A[4][3],$A[4][3],$C[2]
eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4]
eor $A[1][4],$A[1][4],$C[3]
eor $A[2][4],$A[2][4],$C[3]
eor $A[3][4],$A[3][4],$C[3]
eor $A[4][4],$A[4][4],$C[3]
////////////////////////////////////////// Rho+Pi
mov $C[3],$A[0][1]
ror $A[0][1],$A[1][1],#64-$rhotates[1][1]
//mov $C[1],$A[0][2]
ror $A[0][2],$A[2][2],#64-$rhotates[2][2]
//mov $C[0],$A[0][3]
ror $A[0][3],$A[3][3],#64-$rhotates[3][3]
//mov $C[2],$A[0][4]
ror $A[0][4],$A[4][4],#64-$rhotates[4][4]
ror $A[1][1],$A[1][4],#64-$rhotates[1][4]
ror $A[2][2],$A[2][3],#64-$rhotates[2][3]
ror $A[3][3],$A[3][2],#64-$rhotates[3][2]
ror $A[4][4],$A[4][1],#64-$rhotates[4][1]
ror $A[1][4],$A[4][2],#64-$rhotates[4][2]
ror $A[2][3],$A[3][4],#64-$rhotates[3][4]
ror $A[3][2],$A[2][1],#64-$rhotates[2][1]
ror $A[4][1],$A[1][3],#64-$rhotates[1][3]
ror $A[4][2],$A[2][4],#64-$rhotates[2][4]
ror $A[3][4],$A[4][3],#64-$rhotates[4][3]
ror $A[2][1],$A[1][2],#64-$rhotates[1][2]
ror $A[1][3],$A[3][1],#64-$rhotates[3][1]
ror $A[2][4],$A[4][0],#64-$rhotates[4][0]
ror $A[4][3],$A[3][0],#64-$rhotates[3][0]
ror $A[1][2],$A[2][0],#64-$rhotates[2][0]
ror $A[3][1],$A[1][0],#64-$rhotates[1][0]
ror $A[1][0],$C[0],#64-$rhotates[0][3]
ror $A[2][0],$C[3],#64-$rhotates[0][1]
ror $A[3][0],$C[2],#64-$rhotates[0][4]
ror $A[4][0],$C[1],#64-$rhotates[0][2]
////////////////////////////////////////// Chi+Iota
bic $C[0],$A[0][2],$A[0][1]
bic $C[1],$A[0][3],$A[0][2]
bic $C[2],$A[0][0],$A[0][4]
bic $C[3],$A[0][1],$A[0][0]
eor $A[0][0],$A[0][0],$C[0]
bic $C[0],$A[0][4],$A[0][3]
eor $A[0][1],$A[0][1],$C[1]
ldr $C[1],[sp,#16]
eor $A[0][3],$A[0][3],$C[2]
eor $A[0][4],$A[0][4],$C[3]
eor $A[0][2],$A[0][2],$C[0]
ldr $C[3],[$C[1]],#8 // Iota[i++]
bic $C[0],$A[1][2],$A[1][1]
tst $C[1],#255 // are we done?
str $C[1],[sp,#16]
bic $C[1],$A[1][3],$A[1][2]
bic $C[2],$A[1][0],$A[1][4]
eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota
bic $C[3],$A[1][1],$A[1][0]
eor $A[1][0],$A[1][0],$C[0]
bic $C[0],$A[1][4],$A[1][3]
eor $A[1][1],$A[1][1],$C[1]
eor $A[1][3],$A[1][3],$C[2]
eor $A[1][4],$A[1][4],$C[3]
eor $A[1][2],$A[1][2],$C[0]
bic $C[0],$A[2][2],$A[2][1]
bic $C[1],$A[2][3],$A[2][2]
bic $C[2],$A[2][0],$A[2][4]
bic $C[3],$A[2][1],$A[2][0]
eor $A[2][0],$A[2][0],$C[0]
bic $C[0],$A[2][4],$A[2][3]
eor $A[2][1],$A[2][1],$C[1]
eor $A[2][3],$A[2][3],$C[2]
eor $A[2][4],$A[2][4],$C[3]
eor $A[2][2],$A[2][2],$C[0]
bic $C[0],$A[3][2],$A[3][1]
bic $C[1],$A[3][3],$A[3][2]
bic $C[2],$A[3][0],$A[3][4]
bic $C[3],$A[3][1],$A[3][0]
eor $A[3][0],$A[3][0],$C[0]
bic $C[0],$A[3][4],$A[3][3]
eor $A[3][1],$A[3][1],$C[1]
eor $A[3][3],$A[3][3],$C[2]
eor $A[3][4],$A[3][4],$C[3]
eor $A[3][2],$A[3][2],$C[0]
bic $C[0],$A[4][2],$A[4][1]
bic $C[1],$A[4][3],$A[4][2]
bic $C[2],$A[4][0],$A[4][4]
bic $C[3],$A[4][1],$A[4][0]
eor $A[4][0],$A[4][0],$C[0]
bic $C[0],$A[4][4],$A[4][3]
eor $A[4][1],$A[4][1],$C[1]
eor $A[4][3],$A[4][3],$C[2]
eor $A[4][4],$A[4][4],$C[3]
eor $A[4][2],$A[4][2],$C[0]
bne .Loop
ldr x30,[sp,#24]
.inst 0xd50323bf // autiasp
ret
.size KeccakF1600_int,.-KeccakF1600_int
.type KeccakF1600,%function
.align 5
KeccakF1600:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#48
str x0,[sp,#32] // offload argument
mov $C[0],x0
ldp $A[0][0],$A[0][1],[x0,#16*0]
ldp $A[0][2],$A[0][3],[$C[0],#16*1]
ldp $A[0][4],$A[1][0],[$C[0],#16*2]
ldp $A[1][1],$A[1][2],[$C[0],#16*3]
ldp $A[1][3],$A[1][4],[$C[0],#16*4]
ldp $A[2][0],$A[2][1],[$C[0],#16*5]
ldp $A[2][2],$A[2][3],[$C[0],#16*6]
ldp $A[2][4],$A[3][0],[$C[0],#16*7]
ldp $A[3][1],$A[3][2],[$C[0],#16*8]
ldp $A[3][3],$A[3][4],[$C[0],#16*9]
ldp $A[4][0],$A[4][1],[$C[0],#16*10]
ldp $A[4][2],$A[4][3],[$C[0],#16*11]
ldr $A[4][4],[$C[0],#16*12]
bl KeccakF1600_int
ldr $C[0],[sp,#32]
stp $A[0][0],$A[0][1],[$C[0],#16*0]
stp $A[0][2],$A[0][3],[$C[0],#16*1]
stp $A[0][4],$A[1][0],[$C[0],#16*2]
stp $A[1][1],$A[1][2],[$C[0],#16*3]
stp $A[1][3],$A[1][4],[$C[0],#16*4]
stp $A[2][0],$A[2][1],[$C[0],#16*5]
stp $A[2][2],$A[2][3],[$C[0],#16*6]
stp $A[2][4],$A[3][0],[$C[0],#16*7]
stp $A[3][1],$A[3][2],[$C[0],#16*8]
stp $A[3][3],$A[3][4],[$C[0],#16*9]
stp $A[4][0],$A[4][1],[$C[0],#16*10]
stp $A[4][2],$A[4][3],[$C[0],#16*11]
str $A[4][4],[$C[0],#16*12]
ldp x19,x20,[x29,#16]
add sp,sp,#48
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
.inst 0xd50323bf // autiasp
ret
.size KeccakF1600,.-KeccakF1600
.globl SHA3_absorb
.type SHA3_absorb,%function
.align 5
SHA3_absorb:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#64
stp x0,x1,[sp,#32] // offload arguments
stp x2,x3,[sp,#48]
mov $C[0],x0 // uint64_t A[5][5]
mov $C[1],x1 // const void *inp
mov $C[2],x2 // size_t len
mov $C[3],x3 // size_t bsz
ldp $A[0][0],$A[0][1],[$C[0],#16*0]
ldp $A[0][2],$A[0][3],[$C[0],#16*1]
ldp $A[0][4],$A[1][0],[$C[0],#16*2]
ldp $A[1][1],$A[1][2],[$C[0],#16*3]
ldp $A[1][3],$A[1][4],[$C[0],#16*4]
ldp $A[2][0],$A[2][1],[$C[0],#16*5]
ldp $A[2][2],$A[2][3],[$C[0],#16*6]
ldp $A[2][4],$A[3][0],[$C[0],#16*7]
ldp $A[3][1],$A[3][2],[$C[0],#16*8]
ldp $A[3][3],$A[3][4],[$C[0],#16*9]
ldp $A[4][0],$A[4][1],[$C[0],#16*10]
ldp $A[4][2],$A[4][3],[$C[0],#16*11]
ldr $A[4][4],[$C[0],#16*12]
b .Loop_absorb
.align 4
.Loop_absorb:
subs $C[0],$C[2],$C[3] // len - bsz
blo .Labsorbed
str $C[0],[sp,#48] // save len - bsz
___
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
ldr $C[0],[$C[1]],#8 // *inp++
#ifdef __AARCH64EB__
rev $C[0],$C[0]
#endif
eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
cmp $C[3],#8*($i+2)
blo .Lprocess_block
ldr $C[0],[$C[1]],#8 // *inp++
#ifdef __AARCH64EB__
rev $C[0],$C[0]
#endif
eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
beq .Lprocess_block
___
}
$code.=<<___;
ldr $C[0],[$C[1]],#8 // *inp++
#ifdef __AARCH64EB__
rev $C[0],$C[0]
#endif
eor $A[4][4],$A[4][4],$C[0]
.Lprocess_block:
str $C[1],[sp,#40] // save inp
bl KeccakF1600_int
ldr $C[1],[sp,#40] // restore arguments
ldp $C[2],$C[3],[sp,#48]
b .Loop_absorb
.align 4
.Labsorbed:
ldr $C[1],[sp,#32]
stp $A[0][0],$A[0][1],[$C[1],#16*0]
stp $A[0][2],$A[0][3],[$C[1],#16*1]
stp $A[0][4],$A[1][0],[$C[1],#16*2]
stp $A[1][1],$A[1][2],[$C[1],#16*3]
stp $A[1][3],$A[1][4],[$C[1],#16*4]
stp $A[2][0],$A[2][1],[$C[1],#16*5]
stp $A[2][2],$A[2][3],[$C[1],#16*6]
stp $A[2][4],$A[3][0],[$C[1],#16*7]
stp $A[3][1],$A[3][2],[$C[1],#16*8]
stp $A[3][3],$A[3][4],[$C[1],#16*9]
stp $A[4][0],$A[4][1],[$C[1],#16*10]
stp $A[4][2],$A[4][3],[$C[1],#16*11]
str $A[4][4],[$C[1],#16*12]
mov x0,$C[2] // return value
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
.inst 0xd50323bf // autiasp
ret
.size SHA3_absorb,.-SHA3_absorb
___
{
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl SHA3_squeeze
.type SHA3_squeeze,%function
.align 5
SHA3_squeeze:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-48]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
mov $A_flat,x0 // put aside arguments
mov $out,x1
mov $len,x2
mov $bsz,x3
.Loop_squeeze:
ldr x4,[x0],#8
cmp $len,#8
blo .Lsqueeze_tail
#ifdef __AARCH64EB__
rev x4,x4
#endif
str x4,[$out],#8
subs $len,$len,#8
beq .Lsqueeze_done
subs x3,x3,#8
bhi .Loop_squeeze
mov x0,$A_flat
bl KeccakF1600
mov x0,$A_flat
mov x3,$bsz
b .Loop_squeeze
.align 4
.Lsqueeze_tail:
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done
strb w4,[$out],#1
.Lsqueeze_done:
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x29,x30,[sp],#48
.inst 0xd50323bf // autiasp
ret
.size SHA3_squeeze,.-SHA3_squeeze
___
} }}}
{{{
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
"v".($_+3).".16b", "v".($_+4).".16b" ],
(0, 5, 10, 15, 20));
my @C = map("v$_.16b", (25..31));
$code.=<<___;
.type KeccakF1600_ce,%function
.align 5
KeccakF1600_ce:
mov x9,#12
adr x10,iotas
b .Loop_ce
.align 4
.Loop_ce:
___
for($i=0; $i<2; $i++) {
$code.=<<___;
////////////////////////////////////////////////// Theta
eor3 $C[0],$A[0][0],$A[1][0],$A[2][0]
eor3 $C[1],$A[0][1],$A[1][1],$A[2][1]
eor3 $C[2],$A[0][2],$A[1][2],$A[2][2]
eor3 $C[3],$A[0][3],$A[1][3],$A[2][3]
eor3 $C[4],$A[0][4],$A[1][4],$A[2][4]
eor3 $C[0],$C[0], $A[3][0],$A[4][0]
eor3 $C[1],$C[1], $A[3][1],$A[4][1]
eor3 $C[2],$C[2], $A[3][2],$A[4][2]
eor3 $C[3],$C[3], $A[3][3],$A[4][3]
eor3 $C[4],$C[4], $A[3][4],$A[4][4]
rax1 $C[5],$C[0],$C[2] // D[1]
rax1 $C[6],$C[1],$C[3] // D[2]
rax1 $C[2],$C[2],$C[4] // D[3]
rax1 $C[3],$C[3],$C[0] // D[4]
rax1 $C[4],$C[4],$C[1] // D[0]
////////////////////////////////////////////////// Theta+Rho+Pi
xar $C[0], $A[1][1],$C[5],#64-$rhotates[1][1] // C[0]=A[0][1]
xar $A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
xar $A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
xar $A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
xar $A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]
xar $A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]
xar $A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
xar $A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
xar $A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
xar $A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
xar $A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]
xar $A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]
eor $A[0][0],$A[0][0],$C[4]
ldr x11,[x10],#8
xar $C[1], $A[3][3],$C[2],#64-$rhotates[3][3] // C[1]=A[0][3]
xar $A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
xar $A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
xar $A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
xar $A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]
xar $A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1] // *
xar $A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
xar $A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
xar $A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
xar $A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
xar $A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]
xar $C[2], $A[0][3],$C[2],#64-$rhotates[0][3] // C[2]=A[1][0]
////////////////////////////////////////////////// Chi+Iota
dup $C[6],x11 // borrow C[6]
bcax $C[3], $A[0][0],$A[0][2],$C[0] // *
bcax $A[0][1],$C[0], $C[1], $A[0][2] // *
bcax $A[0][2],$A[0][2],$A[0][4],$C[1]
bcax $A[0][3],$C[1], $A[0][0],$A[0][4]
bcax $A[0][4],$A[0][4],$C[0], $A[0][0]
bcax $A[1][0],$C[2], $A[1][2],$A[1][1] // *
bcax $C[0], $A[1][1],$A[1][3],$A[1][2] // *
bcax $A[1][2],$A[1][2],$A[1][4],$A[1][3]
bcax $A[1][3],$A[1][3],$C[2], $A[1][4]
bcax $A[1][4],$A[1][4],$A[1][1],$C[2]
eor $A[0][0],$C[3],$C[6] // Iota
bcax $C[1], $A[2][0],$A[2][2],$A[2][1] // *
bcax $C[2], $A[2][1],$A[2][3],$A[2][2] // *
bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3]
bcax $A[2][3],$A[2][3],$A[2][0],$A[2][4]
bcax $A[2][4],$A[2][4],$A[2][1],$A[2][0]
bcax $C[3], $A[3][0],$A[3][2],$A[3][1] // *
bcax $C[4], $A[3][1],$A[3][3],$A[3][2] // *
bcax $A[3][2],$A[3][2],$A[3][4],$A[3][3]
bcax $A[3][3],$A[3][3],$A[3][0],$A[3][4]
bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0]
bcax $C[5], $A[4][0],$A[4][2],$A[4][1] // *
bcax $C[6], $A[4][1],$A[4][3],$A[4][2] // *
bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3]
bcax $A[4][3],$A[4][3],$A[4][0],$A[4][4]
bcax $A[4][4],$A[4][4],$A[4][1],$A[4][0]
___
( $A[1][1], $C[0]) = ( $C[0], $A[1][1]);
($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
}
$code.=<<___;
subs x9,x9,#1
bne .Loop_ce
ret
.size KeccakF1600_ce,.-KeccakF1600_ce
.type KeccakF1600_cext,%function
.align 5
KeccakF1600_cext:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp d8,d9,[sp,#16] // per ABI requirement
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) { # load A[5][5]
my $j=$i+1;
$code.=<<___;
ldp d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
ldr d24,[x0,#8*$i]
bl KeccakF1600_ce
ldr x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) { # store A[5][5]
my $j=$i+1;
$code.=<<___;
stp d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
str d24,[x0,#8*$i]
ldp d8,d9,[sp,#16]
ldp d10,d11,[sp,#32]
ldp d12,d13,[sp,#48]
ldp d14,d15,[sp,#64]
ldr x29,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size KeccakF1600_cext,.-KeccakF1600_cext
___
{
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl SHA3_absorb_cext
.type SHA3_absorb_cext,%function
.align 5
SHA3_absorb_cext:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp d8,d9,[sp,#16] // per ABI requirement
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) { # load A[5][5]
my $j=$i+1;
$code.=<<___;
ldp d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
ldr d24,[x0,#8*$i]
b .Loop_absorb_ce
.align 4
.Loop_absorb_ce:
subs $len,$len,$bsz // len - bsz
blo .Labsorbed_ce
___
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
ldr d31,[$inp],#8 // *inp++
#ifdef __AARCH64EB__
rev64 v31.16b,v31.16b
#endif
eor $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
cmp $bsz,#8*($i+2)
blo .Lprocess_block_ce
ldr d31,[$inp],#8 // *inp++
#ifdef __AARCH64EB__
rev v31.16b,v31.16b
#endif
eor $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
beq .Lprocess_block_ce
___
}
$code.=<<___;
ldr d31,[$inp],#8 // *inp++
#ifdef __AARCH64EB__
rev v31.16b,v31.16b
#endif
eor $A[4][4],$A[4][4],v31.16b
.Lprocess_block_ce:
bl KeccakF1600_ce
b .Loop_absorb_ce
.align 4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) { # store A[5][5]
my $j=$i+1;
$code.=<<___;
stp d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
str d24,[x0,#8*$i]
add x0,$len,$bsz // return value
ldp d8,d9,[sp,#16]
ldp d10,d11,[sp,#32]
ldp d12,d13,[sp,#48]
ldp d14,d15,[sp,#64]
ldp x29,x30,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl SHA3_squeeze_cext
.type SHA3_squeeze_cext,%function
.align 5
SHA3_squeeze_cext:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x9,$ctx
mov x10,$bsz
.Loop_squeeze_ce:
ldr x4,[x9],#8
cmp $len,#8
blo .Lsqueeze_tail_ce
#ifdef __AARCH64EB__
rev x4,x4
#endif
str x4,[$out],#8
beq .Lsqueeze_done_ce
sub $len,$len,#8
subs x10,x10,#8
bhi .Loop_squeeze_ce
bl KeccakF1600_cext
ldr x30,[sp,#8]
mov x9,$ctx
mov x10,$bsz
b .Loop_squeeze_ce
.align 4
.Lsqueeze_tail_ce:
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done_ce
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done_ce
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done_ce
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done_ce
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done_ce
strb w4,[$out],#1
lsr x4,x4,#8
subs $len,$len,#1
beq .Lsqueeze_done_ce
strb w4,[$out],#1
.Lsqueeze_done_ce:
ldr x29,[sp],#16
.inst 0xd50323bf // autiasp
ret
.size SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
} }}}
$code.=<<___;
.asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___
{ my %opcode = (
"rax1" => 0xce608c00, "eor3" => 0xce000000,
"bcax" => 0xce200000, "xar" => 0xce800000 );
sub unsha3 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
m/\bdup\b/ and s/\.16b/.2d/g or
s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
print $_,"\n";
}
close STDOUT;

@@ -0,0 +1,482 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations, organized by quadruples, are:
#
# [4][4] [3][3] [2][2] [1][1]<-+
# [0][4] [0][3] [0][2] [0][1]<-+
# [3][0] [1][0] [4][0] [2][0] |
# [4][3] [3][1] [2][4] [1][2] |
# [3][4] [1][3] [4][2] [2][1] |
# [2][3] [4][1] [1][4] [3][2] |
# [2][2] [4][4] [1][1] [3][3] -+
#
# This however is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
# [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
# [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
# [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
# [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen
# layout. Note that the first step is permutation-free.] A[0][0] is
# loaded into a register of its own, to all lanes. [A[0][0] is not part
# of the Pi permutation or Rho.]
# Digits in variables' names denote right-most coordinates:
my ($A00, # [0][0] [0][0] [0][0] [0][0] # %ymm0
$A01, # [0][4] [0][3] [0][2] [0][1] # %ymm1
$A20, # [3][0] [1][0] [4][0] [2][0] # %ymm2
$A31, # [2][4] [4][3] [1][2] [3][1] # %ymm3
$A21, # [3][4] [1][3] [4][2] [2][1] # %ymm4
$A41, # [1][4] [2][3] [3][2] [4][1] # %ymm5
$A11) = # [4][4] [3][3] [2][2] [1][1] # %ymm6
map("%ymm$_",(0..6));
# We also need to map the magic order into offsets within structure:
my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
[2,2], [6,0], [3,1], [4,2], [5,3], # [1][0..4]
[2,0], [4,0], [6,1], [5,2], [3,3], # [2][0..4]
[2,3], [3,0], [5,1], [6,2], [4,3], # [3][0..4]
[2,1], [5,0], [4,1], [3,2], [6,3]); # [4][0..4]
@A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged); # ... and now linear
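# For example, A[1][2] lives in lane 1 of $A31 (the 4th register), so
# @A_jagged[1*5+2] is [3,1] and the map above turns it into byte offset
# 8*(3*4+1) = 104 within the 7x32-byte image; in C terms (illustration
# only): offset = 8*(4*reg + lane).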
# But on the other hand, Chi is much better off if the y indices are
# aligned column-wise, not x. For this reason we have to shuffle data
# prior to Chi and revert it afterwards. The prior shuffle is naturally
# merged with Pi itself:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
# [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
# [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
# [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
# [3][3] [1][1] [4][4] [2][2]
#
# And reverse post-Chi permutation:
#
# [0][4] [0][3] [0][2] [0][1]
# [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
# [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
# [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
# [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
# [4][4] [3][3] [2][2] [1][1]
#
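# As a reminder of how the vpermq immediates above read (standard AVX2
# semantics; the intrinsic below is an illustration only): successive
# two-bit fields of the immediate select the source lane for destination
# lanes 0..3, low bits first, e.g.
#
#	/* 0b01110010 == 0x72: dst lanes 0,1,2,3 <- src lanes 2,0,3,1 */
#	__m256i y = _mm256_permute4x64_epi64(x, 0x72);
#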
########################################################################
# Numbers are cycles per processed byte out of large message.
#
# r=1088(*)
#
# Haswell 8.7/+10%
# Skylake 7.8/+20%
# Ryzen 17(**)
#
# (*) Corresponds to SHA3-256. Percentage after slash is improvement
# coefficient in comparison to scalar keccak1600-x86_64.pl.
# (**) It's expected that Ryzen performs poorly, because the instruction
# issue rate is limited to two AVX2 instructions per cycle and, in
# addition, vpblendd is reportedly bound to a specific port.
# Obviously this code path should not be executed on Ryzen.
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
$code.=<<___;
.text
.type __KeccakF1600,\@function
.align 32
__KeccakF1600:
lea rhotates_left+96(%rip),%r8
lea rhotates_right+96(%rip),%r9
lea iotas(%rip),%r10
mov \$24,%eax
jmp .Loop_avx2
.align 32
.Loop_avx2:
######################################### Theta
vpshufd \$0b01001110,$A20,$C00
vpxor $A31,$A41,$C14
vpxor $A11,$A21,@T[2]
vpxor $A01,$C14,$C14
vpxor @T[2],$C14,$C14 # C[1..4]
vpermq \$0b10010011,$C14,@T[4]
vpxor $A20,$C00,$C00
vpermq \$0b01001110,$C00,@T[0]
vpsrlq \$63,$C14,@T[1]
vpaddq $C14,$C14,@T[2]
vpor @T[2],@T[1],@T[1] # ROL64(C[1..4],1)
vpermq \$0b00111001,@T[1],$D14
vpxor @T[4],@T[1],$D00
vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
vpxor $A00,$C00,$C00
vpxor @T[0],$C00,$C00 # C[0..0]
vpsrlq \$63,$C00,@T[0]
vpaddq $C00,$C00,@T[1]
vpor @T[0],@T[1],@T[1] # ROL64(C[0..0],1)
vpxor $D00,$A20,$A20 # ^= D[0..0]
vpxor $D00,$A00,$A00 # ^= D[0..0]
vpblendd \$0b11000000,@T[1],$D14,$D14
vpblendd \$0b00000011,$C00,@T[4],@T[4]
vpxor @T[4],$D14,$D14 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
######################################### Rho + Pi + pre-Chi shuffle
vpsllvq 0*32-96(%r8),$A20,@T[3]
vpsrlvq 0*32-96(%r9),$A20,$A20
vpor @T[3],$A20,$A20
vpxor $D14,$A31,$A31 # ^= D[1..4] from Theta
vpsllvq 2*32-96(%r8),$A31,@T[4]
vpsrlvq 2*32-96(%r9),$A31,$A31
vpor @T[4],$A31,$A31
vpxor $D14,$A21,$A21 # ^= D[1..4] from Theta
vpsllvq 3*32-96(%r8),$A21,@T[5]
vpsrlvq 3*32-96(%r9),$A21,$A21
vpor @T[5],$A21,$A21
vpxor $D14,$A41,$A41 # ^= D[1..4] from Theta
vpsllvq 4*32-96(%r8),$A41,@T[6]
vpsrlvq 4*32-96(%r9),$A41,$A41
vpor @T[6],$A41,$A41
vpxor $D14,$A11,$A11 # ^= D[1..4] from Theta
vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
vpsllvq 5*32-96(%r8),$A11,@T[7]
vpsrlvq 5*32-96(%r9),$A11,@T[1]
vpor @T[7],@T[1],@T[1] # $A11 -> future $A01
vpxor $D14,$A01,$A01 # ^= D[1..4] from Theta
vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
vpsllvq 1*32-96(%r8),$A01,@T[8]
vpsrlvq 1*32-96(%r9),$A01,@T[2]
vpor @T[8],@T[2],@T[2] # $A01 -> future $A20
######################################### Chi
vpsrldq \$8,@T[1],@T[7]
vpandn @T[7],@T[1],@T[0] # tgting [0][0] [0][0] [0][0] [0][0]
vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
vpandn @T[8],$A31,$A31 # tgting [3][1] [1][2] [4][3] [2][4]
vpandn @T[7],$A41,$A41 # tgting [3][2] [1][4] [4][1] [2][3]
vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
vpxor @T[3],$A31,$A31
vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
vpxor @T[5],$A41,$A41
vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
vpandn @T[8],$A11,$A11 # tgting [3][3] [1][1] [4][4] [2][2]
vpxor @T[6],$A11,$A11
vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
vpandn @T[8],$A01,$A01 # tgting [0][4] [0][3] [0][2] [0][1]
vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
vpandn @T[7],$A20,$A20 # tgting [3][0] [1][0] [4][0] [2][0]
vpxor @T[2],$A20,$A20
vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
vpermq \$0b10001101,$A41,$A41
vpermq \$0b01110010,$A11,$A11
vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
vpandn @T[7],$A21,$A21 # tgting [3][4] [1][3] [4][2] [2][1]
vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor @T[4],$A21,$A21
######################################### Iota
vpxor (%r10),$A00,$A00
lea 32(%r10),%r10
dec %eax
jnz .Loop_avx2
ret
.size __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp; # in squeeze
$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 32
SHA3_absorb:
mov %rsp,%r11
lea -240(%rsp),%rsp
and \$-32,%rsp
lea 96($A_flat),$A_flat
lea 96($inp),$inp
lea 96(%rsp),%r10
vzeroupper
vpbroadcastq -96($A_flat),$A00 # load A[5][5]
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11
vpxor @T[0],@T[0],@T[0]
vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack
vmovdqa @T[0],32*3-96(%r10)
vmovdqa @T[0],32*4-96(%r10)
vmovdqa @T[0],32*5-96(%r10)
vmovdqa @T[0],32*6-96(%r10)
.Loop_absorb_avx2:
mov $bsz,%rax
sub $bsz,$len
jc .Ldone_absorb_avx2
shr \$3,%eax
vpbroadcastq 0-96($inp),@T[0]
vmovdqu 8-96($inp),@T[1]
sub \$4,%eax
___
for(my $i=5; $i<25; $i++) {
$code.=<<___
dec %eax
jz .Labsorved_avx2
mov 8*$i-96($inp),%r8
mov %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx2:
lea ($inp,$bsz),$inp
vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor 32*2-96(%r10),$A20,$A20
vpxor 32*3-96(%r10),$A31,$A31
vpxor 32*4-96(%r10),$A21,$A21
vpxor 32*5-96(%r10),$A41,$A41
vpxor 32*6-96(%r10),$A11,$A11
call __KeccakF1600
lea 96(%rsp),%r10
jmp .Loop_absorb_avx2
.Ldone_absorb_avx2:
vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)
vzeroupper
lea (%r11),%rsp
lea ($len,$bsz),%rax # return value
ret
.size SHA3_absorb,.-SHA3_absorb
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
mov %rsp,%r11
lea 96($A_flat),$A_flat
shr \$3,$bsz
vzeroupper
vpbroadcastq -96($A_flat),$A00
vpxor @T[0],@T[0],@T[0]
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11
mov $bsz,%rax
.Loop_squeeze_avx2:
mov @A_jagged[$i]-96($A_flat),%r8
___
for (my $i=0; $i<25; $i++) {
$code.=<<___;
sub \$8,$len
jc .Ltail_squeeze_avx2
mov %r8,($out)
lea 8($out),$out
je .Ldone_squeeze_avx2
dec %eax
je .Lextend_output_avx2
mov @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
call __KeccakF1600
vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)
mov $bsz,%rax
jmp .Loop_squeeze_avx2
.Ltail_squeeze_avx2:
add \$8,$len
.Loop_tail_avx2:
mov %r8b,($out)
lea 1($out),$out
shr \$8,%r8
dec $len
jnz .Loop_tail_avx2
.Ldone_squeeze_avx2:
vzeroupper
lea (%r11),%rsp
ret
.size SHA3_squeeze,.-SHA3_squeeze
.align 64
rhotates_left:
.quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
.quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
.quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
.quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
.quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
.quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
.quad 64-3, 64-18, 64-36, 64-41
.quad 64-1, 64-62, 64-28, 64-27
.quad 64-45, 64-6, 64-56, 64-39
.quad 64-10, 64-61, 64-55, 64-8
.quad 64-2, 64-15, 64-25, 64-20
.quad 64-44, 64-43, 64-21, 64-14
iotas:
.quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
.quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
.quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
.quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
.quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
.quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
.quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
.quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
.quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
.quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
.quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
.quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
.quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
.quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
.quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
.quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
.quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
.asciz "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=pop;
open STDOUT,">$output";
print $code;
close STDOUT;

@@ -0,0 +1,551 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# July 2017.
#
# The code below is a KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# Pretty straightforward; the only "magic" is the data layout in registers.
# It's impossible to have one layout that is optimal for every step, hence
# it changes as the algorithm progresses. Data is saved in linear order,
# but the in-register order morphs between rounds. Even rounds take input
# in linear layout, and odd rounds - transposed, or "vertically-shaped"...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
# r=1088(*)
#
# Knights Landing 7.6
# Skylake-X 5.7
#
# (*) Corresponds to SHA3-256.
########################################################################
# The code below is a combination of two ideas. One is taken from the
# Keccak Code Package, hereafter KCP, and the other from the initial
# version of this module. What they have in common is the observation
# that Pi's input and output are "mostly transposed", i.e. if the input
# is aligned by the x coordinate, then the output is [mostly] aligned by
# y. Both versions, KCP and the predecessor, tried to use one of the two
# layouts from round to round, which resulted in some kind of
# transposition in each round. This version still transposes data, but
# only every second round. Another essential factor is that the KCP
# transposition has to be performed with instructions that turned out to
# be rather expensive on Knights Landing, both latency- and
# throughput-wise. Not to mention that some of them have to depend on
# each other. On the other hand, the initial version of this module
# relied heavily on blend instructions. There were lots of them,
# resulting in a higher instruction count, yet it performed better on
# Knights Landing, because the processor can execute a pair of them each
# cycle and they have minimal latency. This module is an attempt to
# bring the best parts together:-)
#
# Coordinates below correspond to those in sha/keccak1600.c. Input
# layout is straight linear:
#
# [0][4] [0][3] [0][2] [0][1] [0][0]
# [1][4] [1][3] [1][2] [1][1] [1][0]
# [2][4] [2][3] [2][2] [2][1] [2][0]
# [3][4] [3][3] [3][2] [3][1] [3][0]
# [4][4] [4][3] [4][2] [4][1] [4][0]
#
# It's perfect for Theta, while Pi is reduced to intra-register
# permutations which yield layout perfect for Chi:
#
# [4][0] [3][0] [2][0] [1][0] [0][0]
# [4][1] [3][1] [2][1] [1][1] [0][1]
# [4][2] [3][2] [2][2] [1][2] [0][2]
# [4][3] [3][3] [2][3] [1][3] [0][3]
# [4][4] [3][4] [2][4] [1][4] [0][4]
#
# Now, instead of performing a full transposition and feeding it to the
# next identical round, we perform a kind of diagonal transposition to
# the layout from the initial version of this module, making it suitable
# for Theta:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# Now intra-register permutations yield initial [almost] straight
# linear layout:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# This means that the odd-round Chi is performed in a less suitable
# layout, with a number of additional permutations. But overall it
# turned out to be a win. The permutations are the fastest possible on
# Knights Landing and they are laid out to be independent of each other.
# In essence I traded 20 blend instructions for 3 permutations. The
# result is 13% faster than KCP on Skylake-X, and >40% on Knights
# Landing.
#
# As implied, data is loaded in straight linear order. Digits in the
# variables' names represent the coordinates of the right-most element
# of the loaded data chunk:
my ($A00, # [0][4] [0][3] [0][2] [0][1] [0][0]
$A10, # [1][4] [1][3] [1][2] [1][1] [1][0]
$A20, # [2][4] [2][3] [2][2] [2][1] [2][0]
$A30, # [3][4] [3][3] [3][2] [3][1] [3][0]
$A40) = # [4][4] [4][3] [4][2] [4][1] [4][0]
map("%zmm$_",(0..4));
# We also need to map the magic order into offsets within structure:
my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
[1,0], [1,1], [1,2], [1,3], [1,4],
[2,0], [2,1], [2,2], [2,3], [2,4],
[3,0], [3,1], [3,2], [3,3], [3,4],
[4,0], [4,1], [4,2], [4,3], [4,4]);
@A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear
my @T = map("%zmm$_",(5..12));
my @Theta = map("%zmm$_",(33,13..16)); # invalid @Theta[0] is not typo
my @Pi0 = map("%zmm$_",(17..21));
my @Rhotate0 = map("%zmm$_",(22..26));
my @Rhotate1 = map("%zmm$_",(27..31));
my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
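# A note on the vpternlogq immediates used below (standard AVX-512
# semantics, spelled out here for the reader): with the operands written
# AT&T-style as "vpternlogq $imm,C,B,A", the result bit of A is bit
# (A*4 + B*2 + C) of the immediate, so
#
#	0x96 = A ^ B ^ C	(three-way XOR, used for Theta)
#	0xD2 = A ^ (~B & C)	(Chi, replacing an andn+xor pair)
#
# which can be checked in C (illustration only):
#
#	for (int i = 0; i < 8; i++)	/* A=(i>>2)&1, B=(i>>1)&1, C=i&1 */
#		assert(((0xD2 >> i) & 1) == (((i>>2) ^ (~(i>>1) & i)) & 1));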
$code.=<<___;
.text
.type __KeccakF1600,\@function
.align 32
__KeccakF1600:
lea iotas(%rip),%r10
mov \$12,%eax
jmp .Loop_avx512
.align 32
.Loop_avx512:
######################################### Theta, even round
vmovdqa64 $A00,@T[0] # put aside original A00
vpternlogq \$0x96,$A20,$A10,$A00 # and use it as "C00"
vpternlogq \$0x96,$A40,$A30,$A00
vprolq \$1,$A00,$D00
vpermq $A00,@Theta[1],$A00
vpermq $D00,@Theta[4],$D00
vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00
vpternlogq \$0x96,$A00,$D00,$A10
vpternlogq \$0x96,$A00,$D00,$A20
vpternlogq \$0x96,$A00,$D00,$A30
vpternlogq \$0x96,$A00,$D00,$A40
######################################### Rho
vprolvq @Rhotate0[0],@T[0],$A00 # T[0] is original A00
vprolvq @Rhotate0[1],$A10,$A10
vprolvq @Rhotate0[2],$A20,$A20
vprolvq @Rhotate0[3],$A30,$A30
vprolvq @Rhotate0[4],$A40,$A40
######################################### Pi
vpermq $A00,@Pi0[0],$A00
vpermq $A10,@Pi0[1],$A10
vpermq $A20,@Pi0[2],$A20
vpermq $A30,@Pi0[3],$A30
vpermq $A40,@Pi0[4],$A40
######################################### Chi
vmovdqa64 $A00,@T[0]
vmovdqa64 $A10,@T[1]
vpternlogq \$0xD2,$A20,$A10,$A00
vpternlogq \$0xD2,$A30,$A20,$A10
vpternlogq \$0xD2,$A40,$A30,$A20
vpternlogq \$0xD2,@T[0],$A40,$A30
vpternlogq \$0xD2,@T[1],@T[0],$A40
######################################### Iota
vpxorq (%r10),$A00,${A00}{$k00001}
lea 16(%r10),%r10
######################################### Harmonize rounds
vpblendmq $A20,$A10,@{T[1]}{$k00010}
vpblendmq $A30,$A20,@{T[2]}{$k00010}
vpblendmq $A40,$A30,@{T[3]}{$k00010}
vpblendmq $A10,$A00,@{T[0]}{$k00010}
vpblendmq $A00,$A40,@{T[4]}{$k00010}
vpblendmq $A30,@T[1],@{T[1]}{$k00100}
vpblendmq $A40,@T[2],@{T[2]}{$k00100}
vpblendmq $A20,@T[0],@{T[0]}{$k00100}
vpblendmq $A00,@T[3],@{T[3]}{$k00100}
vpblendmq $A10,@T[4],@{T[4]}{$k00100}
vpblendmq $A40,@T[1],@{T[1]}{$k01000}
vpblendmq $A30,@T[0],@{T[0]}{$k01000}
vpblendmq $A00,@T[2],@{T[2]}{$k01000}
vpblendmq $A10,@T[3],@{T[3]}{$k01000}
vpblendmq $A20,@T[4],@{T[4]}{$k01000}
vpblendmq $A40,@T[0],@{T[0]}{$k10000}
vpblendmq $A00,@T[1],@{T[1]}{$k10000}
vpblendmq $A10,@T[2],@{T[2]}{$k10000}
vpblendmq $A20,@T[3],@{T[3]}{$k10000}
vpblendmq $A30,@T[4],@{T[4]}{$k10000}
#vpermq @T[0],@Theta[0],$A00 # doesn't actually change order
vpermq @T[1],@Theta[1],$A10
vpermq @T[2],@Theta[2],$A20
vpermq @T[3],@Theta[3],$A30
vpermq @T[4],@Theta[4],$A40
######################################### Theta, odd round
vmovdqa64 $T[0],$A00 # real A00
vpternlogq \$0x96,$A20,$A10,$C00 # C00 is @T[0]'s alias
vpternlogq \$0x96,$A40,$A30,$C00
vprolq \$1,$C00,$D00
vpermq $C00,@Theta[1],$C00
vpermq $D00,@Theta[4],$D00
vpternlogq \$0x96,$C00,$D00,$A00
vpternlogq \$0x96,$C00,$D00,$A30
vpternlogq \$0x96,$C00,$D00,$A10
vpternlogq \$0x96,$C00,$D00,$A40
vpternlogq \$0x96,$C00,$D00,$A20
######################################### Rho
vprolvq @Rhotate1[0],$A00,$A00
vprolvq @Rhotate1[3],$A30,@T[1]
vprolvq @Rhotate1[1],$A10,@T[2]
vprolvq @Rhotate1[4],$A40,@T[3]
vprolvq @Rhotate1[2],$A20,@T[4]
vpermq $A00,@Theta[4],@T[5]
vpermq $A00,@Theta[3],@T[6]
######################################### Iota
vpxorq -8(%r10),$A00,${A00}{$k00001}
######################################### Pi
vpermq @T[1],@Theta[2],$A10
vpermq @T[2],@Theta[4],$A20
vpermq @T[3],@Theta[1],$A30
vpermq @T[4],@Theta[3],$A40
######################################### Chi
vpternlogq \$0xD2,@T[6],@T[5],$A00
vpermq @T[1],@Theta[1],@T[7]
#vpermq @T[1],@Theta[0],@T[1]
vpternlogq \$0xD2,@T[1],@T[7],$A10
vpermq @T[2],@Theta[3],@T[0]
vpermq @T[2],@Theta[2],@T[2]
vpternlogq \$0xD2,@T[2],@T[0],$A20
#vpermq @T[3],@Theta[0],@T[3]
vpermq @T[3],@Theta[4],@T[1]
vpternlogq \$0xD2,@T[1],@T[3],$A30
vpermq @T[4],@Theta[2],@T[0]
vpermq @T[4],@Theta[1],@T[4]
vpternlogq \$0xD2,@T[4],@T[0],$A40
dec %eax
jnz .Loop_avx512
ret
.size __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp; # in squeeze
$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 32
SHA3_absorb:
mov %rsp,%r11
lea -320(%rsp),%rsp
and \$-64,%rsp
lea 96($A_flat),$A_flat
lea 96($inp),$inp
lea 128(%rsp),%r9
lea theta_perm(%rip),%r8
kxnorw $k11111,$k11111,$k11111
kshiftrw \$15,$k11111,$k00001
kshiftrw \$11,$k11111,$k11111
kshiftlw \$1,$k00001,$k00010
kshiftlw \$2,$k00001,$k00100
kshiftlw \$3,$k00001,$k01000
kshiftlw \$4,$k00001,$k10000
#vmovdqa64 64*0(%r8),@Theta[0]
vmovdqa64 64*1(%r8),@Theta[1]
vmovdqa64 64*2(%r8),@Theta[2]
vmovdqa64 64*3(%r8),@Theta[3]
vmovdqa64 64*4(%r8),@Theta[4]
vmovdqa64 64*5(%r8),@Rhotate1[0]
vmovdqa64 64*6(%r8),@Rhotate1[1]
vmovdqa64 64*7(%r8),@Rhotate1[2]
vmovdqa64 64*8(%r8),@Rhotate1[3]
vmovdqa64 64*9(%r8),@Rhotate1[4]
vmovdqa64 64*10(%r8),@Rhotate0[0]
vmovdqa64 64*11(%r8),@Rhotate0[1]
vmovdqa64 64*12(%r8),@Rhotate0[2]
vmovdqa64 64*13(%r8),@Rhotate0[3]
vmovdqa64 64*14(%r8),@Rhotate0[4]
vmovdqa64 64*15(%r8),@Pi0[0]
vmovdqa64 64*16(%r8),@Pi0[1]
vmovdqa64 64*17(%r8),@Pi0[2]
vmovdqa64 64*18(%r8),@Pi0[3]
vmovdqa64 64*19(%r8),@Pi0[4]
vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
vpxorq @T[0],@T[0],@T[0]
vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack
vmovdqa64 @T[0],1*64-128(%r9)
vmovdqa64 @T[0],2*64-128(%r9)
vmovdqa64 @T[0],3*64-128(%r9)
vmovdqa64 @T[0],4*64-128(%r9)
jmp .Loop_absorb_avx512
.align 32
.Loop_absorb_avx512:
mov $bsz,%rax
sub $bsz,$len
jc .Ldone_absorb_avx512
shr \$3,%eax
___
for(my $i=0; $i<25; $i++) {
$code.=<<___
mov 8*$i-96($inp),%r8
mov %r8,$A_jagged[$i]-128(%r9)
dec %eax
jz .Labsorved_avx512
___
}
$code.=<<___;
.Labsorved_avx512:
lea ($inp,$bsz),$inp
vpxorq 64*0-128(%r9),$A00,$A00
vpxorq 64*1-128(%r9),$A10,$A10
vpxorq 64*2-128(%r9),$A20,$A20
vpxorq 64*3-128(%r9),$A30,$A30
vpxorq 64*4-128(%r9),$A40,$A40
call __KeccakF1600
jmp .Loop_absorb_avx512
.align 32
.Ldone_absorb_avx512:
vmovdqu64 $A00,40*0-96($A_flat){$k11111}
vmovdqu64 $A10,40*1-96($A_flat){$k11111}
vmovdqu64 $A20,40*2-96($A_flat){$k11111}
vmovdqu64 $A30,40*3-96($A_flat){$k11111}
vmovdqu64 $A40,40*4-96($A_flat){$k11111}
vzeroupper
lea (%r11),%rsp
lea ($len,$bsz),%rax # return value
ret
.size SHA3_absorb,.-SHA3_absorb
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
mov %rsp,%r11
lea 96($A_flat),$A_flat
cmp $bsz,$len
jbe .Lno_output_extension_avx512
lea theta_perm(%rip),%r8
kxnorw $k11111,$k11111,$k11111
kshiftrw \$15,$k11111,$k00001
kshiftrw \$11,$k11111,$k11111
kshiftlw \$1,$k00001,$k00010
kshiftlw \$2,$k00001,$k00100
kshiftlw \$3,$k00001,$k01000
kshiftlw \$4,$k00001,$k10000
#vmovdqa64 64*0(%r8),@Theta[0]
vmovdqa64 64*1(%r8),@Theta[1]
vmovdqa64 64*2(%r8),@Theta[2]
vmovdqa64 64*3(%r8),@Theta[3]
vmovdqa64 64*4(%r8),@Theta[4]
vmovdqa64 64*5(%r8),@Rhotate1[0]
vmovdqa64 64*6(%r8),@Rhotate1[1]
vmovdqa64 64*7(%r8),@Rhotate1[2]
vmovdqa64 64*8(%r8),@Rhotate1[3]
vmovdqa64 64*9(%r8),@Rhotate1[4]
vmovdqa64 64*10(%r8),@Rhotate0[0]
vmovdqa64 64*11(%r8),@Rhotate0[1]
vmovdqa64 64*12(%r8),@Rhotate0[2]
vmovdqa64 64*13(%r8),@Rhotate0[3]
vmovdqa64 64*14(%r8),@Rhotate0[4]
vmovdqa64 64*15(%r8),@Pi0[0]
vmovdqa64 64*16(%r8),@Pi0[1]
vmovdqa64 64*17(%r8),@Pi0[2]
vmovdqa64 64*18(%r8),@Pi0[3]
vmovdqa64 64*19(%r8),@Pi0[4]
vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
vmovdqu64 40*1-96($A_flat),${A10}{$k11111}{z}
vmovdqu64 40*2-96($A_flat),${A20}{$k11111}{z}
vmovdqu64 40*3-96($A_flat),${A30}{$k11111}{z}
vmovdqu64 40*4-96($A_flat),${A40}{$k11111}{z}
.Lno_output_extension_avx512:
shr \$3,$bsz
lea -96($A_flat),%r9
mov $bsz,%rax
jmp .Loop_squeeze_avx512
.align 32
.Loop_squeeze_avx512:
cmp \$8,$len
jb .Ltail_squeeze_avx512
mov (%r9),%r8
lea 8(%r9),%r9
mov %r8,($out)
lea 8($out),$out
sub \$8,$len # len -= 8
jz .Ldone_squeeze_avx512
sub \$1,%rax # bsz--
jnz .Loop_squeeze_avx512
#vpermq @Theta[4],@Theta[4],@Theta[3]
#vpermq @Theta[3],@Theta[4],@Theta[2]
#vpermq @Theta[3],@Theta[3],@Theta[1]
call __KeccakF1600
vmovdqu64 $A00,40*0-96($A_flat){$k11111}
vmovdqu64 $A10,40*1-96($A_flat){$k11111}
vmovdqu64 $A20,40*2-96($A_flat){$k11111}
vmovdqu64 $A30,40*3-96($A_flat){$k11111}
vmovdqu64 $A40,40*4-96($A_flat){$k11111}
lea -96($A_flat),%r9
mov $bsz,%rax
jmp .Loop_squeeze_avx512
.Ltail_squeeze_avx512:
mov $out,%rdi
mov %r9,%rsi
mov $len,%rcx
.byte 0xf3,0xa4 # rep movsb
.Ldone_squeeze_avx512:
vzeroupper
lea (%r11),%rsp
ret
.size SHA3_squeeze,.-SHA3_squeeze
.align 64
theta_perm:
.quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used]
.quad 4, 0, 1, 2, 3, 5, 6, 7
.quad 3, 4, 0, 1, 2, 5, 6, 7
.quad 2, 3, 4, 0, 1, 5, 6, 7
.quad 1, 2, 3, 4, 0, 5, 6, 7
rhotates1:
.quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4]
.quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4]
.quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4]
.quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4]
.quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4]
rhotates0:
.quad 0, 1, 62, 28, 27, 0, 0, 0
.quad 36, 44, 6, 55, 20, 0, 0, 0
.quad 3, 10, 43, 25, 39, 0, 0, 0
.quad 41, 45, 15, 21, 8, 0, 0, 0
.quad 18, 2, 61, 56, 14, 0, 0, 0
pi0_perm:
.quad 0, 3, 1, 4, 2, 5, 6, 7
.quad 1, 4, 2, 0, 3, 5, 6, 7
.quad 2, 0, 3, 1, 4, 5, 6, 7
.quad 3, 1, 4, 2, 0, 5, 6, 7
.quad 4, 2, 0, 3, 1, 5, 6, 7
iotas:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=pop;
open STDOUT,">$output";
print $code;
close STDOUT;

@@ -0,0 +1,392 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of the AVX2 module that reuses the register data
# layout but utilizes the new 256-bit AVX512VL instructions. See the
# AVX2 module for further information on the layout.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
# r=1088(*)
#
# Skylake-X 6.4/+47%
#
# (*) Corresponds to SHA3-256. Percentage after slash is improvement
# coefficient in comparison to scalar keccak1600-x86_64.pl.
# Digits in variables' names denote right-most coordinates:
my ($A00, # [0][0] [0][0] [0][0] [0][0] # %ymm0
$A01, # [0][4] [0][3] [0][2] [0][1] # %ymm1
$A20, # [3][0] [1][0] [4][0] [2][0] # %ymm2
$A31, # [2][4] [4][3] [1][2] [3][1] # %ymm3
$A21, # [3][4] [1][3] [4][2] [2][1] # %ymm4
$A41, # [1][4] [2][3] [3][2] [4][1] # %ymm5
$A11) = # [4][4] [3][3] [2][2] [1][1] # %ymm6
map("%ymm$_",(0..6));
# We also need to map the magic order into offsets within structure:
my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3], # [0][0..4]
[2,2], [6,0], [3,1], [4,2], [5,3], # [1][0..4]
[2,0], [4,0], [6,1], [5,2], [3,3], # [2][0..4]
[2,3], [3,0], [5,1], [6,2], [4,3], # [3][0..4]
[2,1], [5,0], [4,1], [3,2], [6,3]); # [4][0..4]
@A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged); # ... and now linear
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));
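# The vpternlogq $0xC6 operations below fold the vpandn+vpxor pairs of
# the AVX2 module into single instructions: with the operands written
# AT&T-style as "vpternlogq $imm,C,B,A", immediate 0xC6 computes
# B ^ (~A & C), while $0x96 remains the three-way XOR used for Theta
# (standard AVX-512 semantics, stated here for the reader rather than
# taken from this module).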
$code.=<<___;
.text
.type __KeccakF1600,\@function
.align 32
__KeccakF1600:
lea iotas(%rip),%r10
mov \$24,%eax
jmp .Loop_avx512vl
.align 32
.Loop_avx512vl:
######################################### Theta
vpshufd \$0b01001110,$A20,$C00
vpxor $A31,$A41,$C14
vpxor $A11,$A21,@T[2]
vpternlogq \$0x96,$A01,$T[2],$C14 # C[1..4]
vpxor $A20,$C00,$C00
vpermq \$0b01001110,$C00,@T[0]
vpermq \$0b10010011,$C14,@T[4]
vprolq \$1,$C14,@T[1] # ROL64(C[1..4],1)
vpermq \$0b00111001,@T[1],$D14
vpxor @T[4],@T[1],$D00
vpermq \$0b00000000,$D00,$D00 # D[0..0] = ROL64(C[1],1) ^ C[4]
vpternlogq \$0x96,@T[0],$A00,$C00 # C[0..0]
vprolq \$1,$C00,@T[1] # ROL64(C[0..0],1)
vpxor $D00,$A00,$A00 # ^= D[0..0]
vpblendd \$0b11000000,@T[1],$D14,$D14
vpblendd \$0b00000011,$C00,@T[4],@T[0]
######################################### Rho + Pi + pre-Chi shuffle
vpxor $D00,$A20,$A20 # ^= D[0..0] from Theta
vprolvq $R20,$A20,$A20
vpternlogq \$0x96,@T[0],$D14,$A31 # ^= D[1..4] from Theta
vprolvq $R31,$A31,$A31
vpternlogq \$0x96,@T[0],$D14,$A21 # ^= D[1..4] from Theta
vprolvq $R21,$A21,$A21
vpternlogq \$0x96,@T[0],$D14,$A41 # ^= D[1..4] from Theta
vprolvq $R41,$A41,$A41
vpermq \$0b10001101,$A20,@T[3] # $A20 -> future $A31
vpermq \$0b10001101,$A31,@T[4] # $A31 -> future $A21
vpternlogq \$0x96,@T[0],$D14,$A11 # ^= D[1..4] from Theta
vprolvq $R11,$A11,@T[1] # $A11 -> future $A01
vpermq \$0b00011011,$A21,@T[5] # $A21 -> future $A41
vpermq \$0b01110010,$A41,@T[6] # $A41 -> future $A11
vpternlogq \$0x96,@T[0],$D14,$A01 # ^= D[1..4] from Theta
vprolvq $R01,$A01,@T[2] # $A01 -> future $A20
######################################### Chi
vpblendd \$0b00001100,@T[6],@T[2],$A31 # [4][4] [2][0]
vpblendd \$0b00001100,@T[2],@T[4],@T[8] # [4][0] [2][1]
vpblendd \$0b00001100,@T[4],@T[3],$A41 # [4][2] [2][4]
vpblendd \$0b00001100,@T[3],@T[2],@T[7] # [4][3] [2][0]
vpblendd \$0b00110000,@T[4],$A31,$A31 # [1][3] [4][4] [2][0]
vpblendd \$0b00110000,@T[5],@T[8],@T[8] # [1][4] [4][0] [2][1]
vpblendd \$0b00110000,@T[2],$A41,$A41 # [1][0] [4][2] [2][4]
vpblendd \$0b00110000,@T[6],@T[7],@T[7] # [1][1] [4][3] [2][0]
vpblendd \$0b11000000,@T[5],$A31,$A31 # [3][2] [1][3] [4][4] [2][0]
vpblendd \$0b11000000,@T[6],@T[8],@T[8] # [3][3] [1][4] [4][0] [2][1]
vpblendd \$0b11000000,@T[6],$A41,$A41 # [3][3] [1][0] [4][2] [2][4]
vpblendd \$0b11000000,@T[4],@T[7],@T[7] # [3][4] [1][1] [4][3] [2][0]
vpternlogq \$0xC6,@T[8],@T[3],$A31 # [3][1] [1][2] [4][3] [2][4]
vpternlogq \$0xC6,@T[7],@T[5],$A41 # [3][2] [1][4] [4][1] [2][3]
vpsrldq \$8,@T[1],@T[0]
vpandn @T[0],@T[1],@T[0] # targeting [0][0] [0][0] [0][0] [0][0]
vpblendd \$0b00001100,@T[2],@T[5],$A11 # [4][0] [2][3]
vpblendd \$0b00001100,@T[5],@T[3],@T[8] # [4][1] [2][4]
vpblendd \$0b00110000,@T[3],$A11,$A11 # [1][2] [4][0] [2][3]
vpblendd \$0b00110000,@T[4],@T[8],@T[8] # [1][3] [4][1] [2][4]
vpblendd \$0b11000000,@T[4],$A11,$A11 # [3][4] [1][2] [4][0] [2][3]
vpblendd \$0b11000000,@T[2],@T[8],@T[8] # [3][0] [1][3] [4][1] [2][4]
vpternlogq \$0xC6,@T[8],@T[6],$A11 # [3][3] [1][1] [4][4] [2][2]
vpermq \$0b00011110,@T[1],$A21 # [0][1] [0][2] [0][4] [0][3]
vpblendd \$0b00110000,$A00,$A21,@T[8] # [0][1] [0][0] [0][4] [0][3]
vpermq \$0b00111001,@T[1],$A01 # [0][1] [0][4] [0][3] [0][2]
vpblendd \$0b11000000,$A00,$A01,$A01 # [0][0] [0][4] [0][3] [0][2]
vpblendd \$0b00001100,@T[5],@T[4],$A20 # [4][1] [2][1]
vpblendd \$0b00001100,@T[4],@T[6],@T[7] # [4][2] [2][2]
vpblendd \$0b00110000,@T[6],$A20,$A20 # [1][1] [4][1] [2][1]
vpblendd \$0b00110000,@T[3],@T[7],@T[7] # [1][2] [4][2] [2][2]
vpblendd \$0b11000000,@T[3],$A20,$A20 # [3][1] [1][1] [4][1] [2][1]
vpblendd \$0b11000000,@T[5],@T[7],@T[7] # [3][2] [1][2] [4][2] [2][2]
vpternlogq \$0xC6,@T[7],@T[2],$A20 # [3][0] [1][0] [4][0] [2][0]
vpermq \$0b00000000,@T[0],@T[0] # [0][0] [0][0] [0][0] [0][0]
vpermq \$0b00011011,$A31,$A31 # post-Chi shuffle
vpermq \$0b10001101,$A41,$A41
vpermq \$0b01110010,$A11,$A11
vpblendd \$0b00001100,@T[3],@T[6],$A21 # [4][3] [2][2]
vpblendd \$0b00001100,@T[6],@T[5],@T[7] # [4][4] [2][3]
vpblendd \$0b00110000,@T[5],$A21,$A21 # [1][4] [4][3] [2][2]
vpblendd \$0b00110000,@T[2],@T[7],@T[7] # [1][0] [4][4] [2][3]
vpblendd \$0b11000000,@T[2],$A21,$A21 # [3][0] [1][4] [4][3] [2][2]
vpblendd \$0b11000000,@T[3],@T[7],@T[7] # [3][1] [1][0] [4][4] [2][3]
vpternlogq \$0xC6,@T[8],@T[1],$A01 # [0][4] [0][3] [0][2] [0][1]
vpternlogq \$0xC6,@T[7],@T[4],$A21 # [3][4] [1][3] [4][2] [2][1]
######################################### Iota
vpternlogq \$0x96,(%r10),@T[0],$A00
lea 32(%r10),%r10
dec %eax
jnz .Loop_avx512vl
ret
.size __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp; # in squeeze
$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 32
SHA3_absorb:
mov %rsp,%r11
lea -240(%rsp),%rsp
and \$-32,%rsp
lea 96($A_flat),$A_flat
lea 96($inp),$inp
lea 96(%rsp),%r10
lea rhotates_left(%rip),%r8
vzeroupper
vpbroadcastq -96($A_flat),$A00 # load A[5][5]
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11
vmovdqa64 0*32(%r8),$R20 # load "rhotate" indices
vmovdqa64 1*32(%r8),$R01
vmovdqa64 2*32(%r8),$R31
vmovdqa64 3*32(%r8),$R21
vmovdqa64 4*32(%r8),$R41
vmovdqa64 5*32(%r8),$R11
vpxor @T[0],@T[0],@T[0]
vmovdqa @T[0],32*2-96(%r10) # zero transfer area on stack
vmovdqa @T[0],32*3-96(%r10)
vmovdqa @T[0],32*4-96(%r10)
vmovdqa @T[0],32*5-96(%r10)
vmovdqa @T[0],32*6-96(%r10)
.Loop_absorb_avx512vl:
mov $bsz,%rax
sub $bsz,$len
jc .Ldone_absorb_avx512vl
shr \$3,%eax
vpbroadcastq 0-96($inp),@T[0]
vmovdqu 8-96($inp),@T[1]
sub \$4,%eax
___
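# Lanes 0 and 1..4 of the block were picked up into @T above; the loop below
# scatters the remaining input qwords into the on-stack transfer area
# (pre-zeroed above) at their "jagged" offsets, stopping once the block is
# exhausted, so that everything can be xor-ed into the state registers in one
# go at .Labsorved_avx512vl.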
for(my $i=5; $i<25; $i++) {
$code.=<<___
dec %eax
jz .Labsorved_avx512vl
mov 8*$i-96($inp),%r8
mov %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx512vl:
lea ($inp,$bsz),$inp
vpxor @T[0],$A00,$A00
vpxor @T[1],$A01,$A01
vpxor 32*2-96(%r10),$A20,$A20
vpxor 32*3-96(%r10),$A31,$A31
vpxor 32*4-96(%r10),$A21,$A21
vpxor 32*5-96(%r10),$A41,$A41
vpxor 32*6-96(%r10),$A11,$A11
call __KeccakF1600
lea 96(%rsp),%r10
jmp .Loop_absorb_avx512vl
.Ldone_absorb_avx512vl:
vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)
vzeroupper
lea (%r11),%rsp
lea ($len,$bsz),%rax # return value
ret
.size SHA3_absorb,.-SHA3_absorb
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
mov %rsp,%r11
lea 96($A_flat),$A_flat
lea rhotates_left(%rip),%r8
shr \$3,$bsz
vzeroupper
vpbroadcastq -96($A_flat),$A00
vpxor @T[0],@T[0],@T[0]
vmovdqu 8+32*0-96($A_flat),$A01
vmovdqu 8+32*1-96($A_flat),$A20
vmovdqu 8+32*2-96($A_flat),$A31
vmovdqu 8+32*3-96($A_flat),$A21
vmovdqu 8+32*4-96($A_flat),$A41
vmovdqu 8+32*5-96($A_flat),$A11
vmovdqa64 0*32(%r8),$R20 # load "rhotate" indices
vmovdqa64 1*32(%r8),$R01
vmovdqa64 2*32(%r8),$R31
vmovdqa64 3*32(%r8),$R21
vmovdqa64 4*32(%r8),$R41
vmovdqa64 5*32(%r8),$R11
mov $bsz,%rax
.Loop_squeeze_avx512vl:
mov @A_jagged[$i]-96($A_flat),%r8
___
for (my $i=0; $i<25; $i++) {
$code.=<<___;
sub \$8,$len
jc .Ltail_squeeze_avx512vl
mov %r8,($out)
lea 8($out),$out
je .Ldone_squeeze_avx512vl
dec %eax
je .Lextend_output_avx512vl
mov @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512vl:
call __KeccakF1600
vmovq %xmm0,-96($A_flat)
vmovdqu $A01,8+32*0-96($A_flat)
vmovdqu $A20,8+32*1-96($A_flat)
vmovdqu $A31,8+32*2-96($A_flat)
vmovdqu $A21,8+32*3-96($A_flat)
vmovdqu $A41,8+32*4-96($A_flat)
vmovdqu $A11,8+32*5-96($A_flat)
mov $bsz,%rax
jmp .Loop_squeeze_avx512vl
.Ltail_squeeze_avx512vl:
add \$8,$len
.Loop_tail_avx512vl:
mov %r8b,($out)
lea 1($out),$out
shr \$8,%r8
dec $len
jnz .Loop_tail_avx512vl
.Ldone_squeeze_avx512vl:
vzeroupper
lea (%r11),%rsp
ret
.size SHA3_squeeze,.-SHA3_squeeze
.align 64
rhotates_left:
.quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0]
.quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4]
.quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4]
.quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4]
.quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4]
.quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4]
iotas:
.quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
.quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
.quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
.quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
.quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
.quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
.quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
.quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
.quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
.quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
.quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
.quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
.quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
.quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
.quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
.quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
.quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
.quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
.quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
.quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
.quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
.asciz "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=pop;
open STDOUT,">$output";
print $code;
close STDOUT;


@ -0,0 +1,885 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# [ABI- and endian-neutral] Keccak-1600 for C64x.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
# with bit interleaving. 64-bit values are simply split between the A- and
# B-files, with the A-file holding the least significant halves. This works
# out perfectly, because all operations including cross-communications
# [in rotate operations] are always complementary. Performance is
# [incredible for a 32-bit processor] 10.9 cycles per processed byte
# for r=1088, which corresponds to SHA3-256. This is >15x faster than
# compiler-generated KECCAK_1X_ALT code, and >10x faster than other variants.
# On average the processor ends up issuing ~4.5 instructions per cycle...
my @A = map([ $_, ($_+1), ($_+2), ($_+3), ($_+4) ], (5,10,16,21,26));
$A[1][4] = 31; # B14 is reserved, A14 is used as iota[]
($A[3][0],$A[4][1]) = ($A[4][1],$A[3][0]);
my @C = (0..4,$A[3][0],$A[4][0]);
my $iotas = "A14";
my @rhotates = ([ 0, 1, 62, 28, 27 ],
[ 36, 44, 6, 55, 20 ],
[ 3, 10, 43, 25, 39 ],
[ 41, 45, 15, 21, 8 ],
[ 18, 2, 61, 56, 14 ]);
sub ROL64 {
my ($src,$rot,$dst,$p) = @_;
if ($rot&1) {
$code.=<<___;
$p ROTL B$src,$rot/2+1,A$dst
|| ROTL A$src,$rot/2, B$dst
___
} else {
$code.=<<___;
$p ROTL A$src,$rot/2,A$dst
|| ROTL B$src,$rot/2,B$dst
___
}
}
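# In the interleaved representation the A-file word ends up carrying the
# even-numbered bits of a lane and the B-file word the odd-numbered bits
# (cf. the DEAL/PACK2 sequence in SHA3_absorb), which is exactly what lets
# ROL64 above decompose into two 32-bit rotates, swapping the halves for odd
# amounts. The following self-contained sanity sketch is never referenced by
# the generated code; it runs once at generation time on plain bit arrays to
# stay independent of the host perl's integer width.
sub rol_bits {                       # rotate a little-endian bit array left by $n
    my ($bits, $n) = @_;
    my @w = @$bits;
    return [ map { $w[($_ - $n) % @w] } 0 .. $#w ];
}
sub check_interleaved_rol64 {
    my ($rot) = @_;
    my @w = map { int(rand(2)) } 0 .. 63;       # a random 64-bit lane
    my @a = map { $w[2*$_] }     0 .. 31;       # even bits -> "A-file" word
    my @b = map { $w[2*$_ + 1] } 0 .. 31;       # odd bits  -> "B-file" word
    my $r = rol_bits(\@w, $rot);                # reference ROL64
    my ($ra, $rb) = ($rot & 1)
        ? (rol_bits(\@b, ($rot >> 1) + 1), rol_bits(\@a, $rot >> 1))
        : (rol_bits(\@a,  $rot >> 1),      rol_bits(\@b, $rot >> 1));
    for my $j (0 .. 31) {
        die "interleaved rotate mismatch at rot=$rot"
            if $r->[2*$j] != $ra->[$j] or $r->[2*$j + 1] != $rb->[$j];
    }
}
check_interleaved_rol64($_) for 0 .. 63;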
########################################################################
# Stack frame layout
#
# SP--->+------+------+
# | | |
# +1--->+------+------+<- -9 below 4 slots are used by KeccakF1600_int
# | | |
# +2--->+------+------+<- -8
# | | |
# +3--->+------+------+<- -7
# | A2 | A3 | A3:A2 are preserved by KeccakF1600_int
# +4--->+------+------+<- -6
# | B2 | B3 | B3:B2 are preserved by KeccakF1600_int
# +5--->+------+------+<- -5 below is ABI-compliant layout
# | A10 | A11 |
# +6--->+------+------+<- -4
# | A12 | A13 |
# +7--->+------+------+<- -3
# | A14 | B3 |
# +8--->+------+------+<- -2
# | B10 | B11 |
# +9--->+------+------+<- -1
# | B12 | B13 |
# +------+------+<---FP
# | A15 |
# +------+--
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.nocmp
.asg KeccakF1600,_KeccakF1600
.asg SHA3_absorb,_SHA3_absorb
.asg SHA3_squeeze,_SHA3_squeeze
.endif
.asg B3,RA
.asg A15,FP
.asg B15,SP
.align 32
_KeccakF1600_int:
.asmfunc
STDW A3:A2,*FP[-7]
|| STDW B3:B2,*SP[4]
_KeccakF1600_cheat:
.if __TI_EABI__
ADDKPC _KeccakF1600_int,B0
|| MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
.else
ADDKPC _KeccakF1600_int,B0
|| MVKL (iotas-_KeccakF1600_int),$iotas
MVKH (iotas-_KeccakF1600_int),$iotas
.endif
ADD B0,$iotas,$iotas
loop?:
XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta
|| XOR B$A[0][2],B$A[1][2],B$C[2]
|| XOR A$A[0][3],A$A[1][3],A$C[3]
|| XOR B$A[0][3],B$A[1][3],B$C[3]
|| XOR A$A[0][0],A$A[1][0],A$C[0]
|| XOR B$A[0][0],B$A[1][0],B$C[0]
XOR A$A[2][2],A$C[2],A$C[2]
|| XOR B$A[2][2],B$C[2],B$C[2]
|| XOR A$A[2][3],A$C[3],A$C[3]
|| XOR B$A[2][3],B$C[3],B$C[3]
|| XOR A$A[2][0],A$C[0],A$C[0]
|| XOR B$A[2][0],B$C[0],B$C[0]
XOR A$A[3][2],A$C[2],A$C[2]
|| XOR B$A[3][2],B$C[2],B$C[2]
|| XOR A$A[3][3],A$C[3],A$C[3]
|| XOR B$A[3][3],B$C[3],B$C[3]
|| XOR A$A[3][0],A$C[0],A$C[0]
|| XOR B$A[3][0],B$C[0],B$C[0]
XOR A$A[4][2],A$C[2],A$C[2]
|| XOR B$A[4][2],B$C[2],B$C[2]
|| XOR A$A[4][3],A$C[3],A$C[3]
|| XOR B$A[4][3],B$C[3],B$C[3]
|| XOR A$A[4][0],A$C[0],A$C[0]
|| XOR B$A[4][0],B$C[0],B$C[0]
XOR A$A[0][4],A$A[1][4],A$C[4]
|| XOR B$A[0][4],B$A[1][4],B$C[4]
|| XOR A$A[0][1],A$A[1][1],A$C[1]
|| XOR B$A[0][1],B$A[1][1],B$C[1]
|| STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data
STDW B$A[3][0]:B$A[4][0],*SP[2]
|| XOR A$A[2][4],A$C[4],A$C[4]
|| XOR B$A[2][4],B$C[4],B$C[4]
|| XOR A$A[2][1],A$C[1],A$C[1]
|| XOR B$A[2][1],B$C[1],B$C[1]
|| ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1)
|| ROTL A$C[2],0,B$C[5]
XOR A$A[3][4],A$C[4],A$C[4]
|| XOR B$A[3][4],B$C[4],B$C[4]
|| XOR A$A[3][1],A$C[1],A$C[1]
|| XOR B$A[3][1],B$C[1],B$C[1]
|| ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1)
|| ROTL A$C[3],0,B$C[6]
XOR A$A[4][4],A$C[4],A$C[4]
|| XOR B$A[4][4],B$C[4],B$C[4]
|| XOR A$A[4][1],A$C[1],A$C[1]
|| XOR B$A[4][1],B$C[1],B$C[1]
|| XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1)
|| XOR B$C[0],B$C[5],B$C[5]
XOR A$C[5],A$A[0][1],A$A[0][1]
|| XOR B$C[5],B$A[0][1],B$A[0][1]
|| XOR A$C[5],A$A[1][1],A$A[1][1]
|| XOR B$C[5],B$A[1][1],B$A[1][1]
|| XOR A$C[5],A$A[2][1],A$A[2][1]
|| XOR B$C[5],B$A[2][1],B$A[2][1]
XOR A$C[5],A$A[3][1],A$A[3][1]
|| XOR B$C[5],B$A[3][1],B$A[3][1]
|| XOR A$C[5],A$A[4][1],A$A[4][1]
|| XOR B$C[5],B$A[4][1],B$A[4][1]
|| ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1)
|| ROTL A$C[4],0,B$C[5]
|| XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1)
|| XOR B$C[1],B$C[6],B$C[6]
XOR A$C[6],A$A[0][2],A$A[0][2]
|| XOR B$C[6],B$A[0][2],B$A[0][2]
|| XOR A$C[6],A$A[1][2],A$A[1][2]
|| XOR B$C[6],B$A[1][2],B$A[1][2]
|| XOR A$C[6],A$A[2][2],A$A[2][2]
|| XOR B$C[6],B$A[2][2],B$A[2][2]
|| ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1)
|| ROTL A$C[1],0,B$C[1]
XOR A$C[6],A$A[3][2],A$A[3][2]
|| XOR B$C[6],B$A[3][2],B$A[3][2]
|| XOR A$C[6],A$A[4][2],A$A[4][2]
|| XOR B$C[6],B$A[4][2],B$A[4][2]
|| ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1)
|| ROTL A$C[0],0,B$C[6]
|| XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1)
|| XOR B$C[5],B$C[2],B$C[2]
XOR A$C[2],A$A[0][3],A$A[0][3]
|| XOR B$C[2],B$A[0][3],B$A[0][3]
|| XOR A$C[2],A$A[1][3],A$A[1][3]
|| XOR B$C[2],B$A[1][3],B$A[1][3]
|| XOR A$C[2],A$A[2][3],A$A[2][3]
|| XOR B$C[2],B$A[2][3],B$A[2][3]
XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1)
|| XOR B$C[6],B$C[3],B$C[3]
|| LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data
|| LDDW *SP[2],B$A[3][0]:B$A[4][0]
|| XOR A$C[2],A$A[3][3],A$A[3][3]
|| XOR B$C[2],B$A[3][3],B$A[3][3]
XOR A$C[2],A$A[4][3],A$A[4][3]
|| XOR B$C[2],B$A[4][3],B$A[4][3]
|| XOR A$C[3],A$A[0][4],A$A[0][4]
|| XOR B$C[3],B$A[0][4],B$A[0][4]
|| XOR A$C[3],A$A[1][4],A$A[1][4]
|| XOR B$C[3],B$A[1][4],B$A[1][4]
XOR A$C[3],A$A[2][4],A$A[2][4]
|| XOR B$C[3],B$A[2][4],B$A[2][4]
|| XOR A$C[3],A$A[3][4],A$A[3][4]
|| XOR B$C[3],B$A[3][4],B$A[3][4]
|| XOR A$C[3],A$A[4][4],A$A[4][4]
|| XOR B$C[3],B$A[4][4],B$A[4][4]
XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1)
|| XOR B$C[1],B$C[4],B$C[4]
|| MV A$A[0][1],A$C[1] ; Rho+Pi, "early start"
|| MV B$A[0][1],B$C[1]
___
&ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||");
$code.=<<___;
XOR A$C[4],A$A[0][0],A$A[0][0]
|| XOR B$C[4],B$A[0][0],B$A[0][0]
|| XOR A$C[4],A$A[1][0],A$A[1][0]
|| XOR B$C[4],B$A[1][0],B$A[1][0]
|| MV A$A[0][3],A$C[3]
|| MV B$A[0][3],B$C[3]
___
&ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||");
$code.=<<___;
XOR A$C[4],A$A[2][0],A$A[2][0]
|| XOR B$C[4],B$A[2][0],B$A[2][0]
|| XOR A$C[4],A$A[3][0],A$A[3][0]
|| XOR B$C[4],B$A[3][0],B$A[3][0]
|| MV A$A[0][2],A$C[2]
|| MV B$A[0][2],B$C[2]
___
&ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||");
$code.=<<___;
XOR A$C[4],A$A[4][0],A$A[4][0]
|| XOR B$C[4],B$A[4][0],B$A[4][0]
|| MV A$A[0][4],A$C[4]
|| MV B$A[0][4],B$C[4]
___
&ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||");
&ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]);
$code.=<<___;
|| LDW *${iotas}++[2],A$C[0]
___
&ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]);
$code.=<<___;
|| LDW *${iotas}[-1],B$C[0]
___
&ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]);
&ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]);
&ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]);
&ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]);
&ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]);
&ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]);
&ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]);
&ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]);
&ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]);
&ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]);
&ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]);
&ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]);
&ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]);
&ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]);
#&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below
&ROL64 ($C[1], $rhotates[0][1],$A[2][0]);
&ROL64 ($C[4], $rhotates[0][4],$A[3][0]);
&ROL64 ($C[2], $rhotates[0][2],$A[4][0]);
$code.=<<___;
|| ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota
|| ANDN B$A[0][2],B$A[0][1],B$C[4]
|| ANDN A$A[0][3],A$A[0][2],A$C[1]
|| ANDN B$A[0][3],B$A[0][2],B$C[1]
|| ANDN A$A[0][4],A$A[0][3],A$C[2]
|| ANDN B$A[0][4],B$A[0][3],B$C[2]
___
&ROL64 ($C[3], $rhotates[0][3],$A[1][0]);
$code.=<<___;
|| ANDN A$A[0][0],A$A[0][4],A$C[3]
|| ANDN B$A[0][0],B$A[0][4],B$C[3]
|| XOR A$C[4],A$A[0][0],A$A[0][0]
|| XOR B$C[4],B$A[0][0],B$A[0][0]
|| ANDN A$A[0][1],A$A[0][0],A$C[4]
|| ANDN B$A[0][1],B$A[0][0],B$C[4]
XOR A$C[1],A$A[0][1],A$A[0][1]
|| XOR B$C[1],B$A[0][1],B$A[0][1]
|| XOR A$C[2],A$A[0][2],A$A[0][2]
|| XOR B$C[2],B$A[0][2],B$A[0][2]
|| XOR A$C[3],A$A[0][3],A$A[0][3]
|| XOR B$C[3],B$A[0][3],B$A[0][3]
XOR A$C[4],A$A[0][4],A$A[0][4]
|| XOR B$C[4],B$A[0][4],B$A[0][4]
|| XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++];
|| XOR B$C[0],B$A[0][0],B$A[0][0]
|| EXTU $iotas,24,24,A0 ; A0 is A$C[0], are we done?
ANDN A$A[1][2],A$A[1][1],A$C[4]
|| ANDN B$A[1][2],B$A[1][1],B$C[4]
|| ANDN A$A[1][3],A$A[1][2],A$C[1]
|| ANDN B$A[1][3],B$A[1][2],B$C[1]
|| ANDN A$A[1][4],A$A[1][3],A$C[2]
|| ANDN B$A[1][4],B$A[1][3],B$C[2]
ANDN A$A[1][0],A$A[1][4],A$C[3]
|| ANDN B$A[1][0],B$A[1][4],B$C[3]
|| XOR A$C[4],A$A[1][0],A$A[1][0]
|| XOR B$C[4],B$A[1][0],B$A[1][0]
|| ANDN A$A[1][1],A$A[1][0],A$C[4]
|| ANDN B$A[1][1],B$A[1][0],B$C[4]
XOR A$C[1],A$A[1][1],A$A[1][1]
|| XOR B$C[1],B$A[1][1],B$A[1][1]
|| XOR A$C[2],A$A[1][2],A$A[1][2]
|| XOR B$C[2],B$A[1][2],B$A[1][2]
|| XOR A$C[3],A$A[1][3],A$A[1][3]
|| XOR B$C[3],B$A[1][3],B$A[1][3]
XOR A$C[4],A$A[1][4],A$A[1][4]
|| XOR B$C[4],B$A[1][4],B$A[1][4]
|| ANDN A$A[2][2],A$A[2][1],A$C[4]
|| ANDN B$A[2][2],B$A[2][1],B$C[4]
|| ANDN A$A[2][3],A$A[2][2],A$C[1]
|| ANDN B$A[2][3],B$A[2][2],B$C[1]
ANDN A$A[2][4],A$A[2][3],A$C[2]
|| ANDN B$A[2][4],B$A[2][3],B$C[2]
|| ANDN A$A[2][0],A$A[2][4],A$C[3]
|| ANDN B$A[2][0],B$A[2][4],B$C[3]
|| XOR A$C[4],A$A[2][0],A$A[2][0]
|| XOR B$C[4],B$A[2][0],B$A[2][0]
ANDN A$A[2][1],A$A[2][0],A$C[4]
|| ANDN B$A[2][1],B$A[2][0],B$C[4]
|| XOR A$C[1],A$A[2][1],A$A[2][1]
|| XOR B$C[1],B$A[2][1],B$A[2][1]
|| XOR A$C[2],A$A[2][2],A$A[2][2]
|| XOR B$C[2],B$A[2][2],B$A[2][2]
XOR A$C[3],A$A[2][3],A$A[2][3]
|| XOR B$C[3],B$A[2][3],B$A[2][3]
|| XOR A$C[4],A$A[2][4],A$A[2][4]
|| XOR B$C[4],B$A[2][4],B$A[2][4]
ANDN A$A[3][2],A$A[3][1],A$C[4]
|| ANDN B$A[3][2],B$A[3][1],B$C[4]
|| ANDN A$A[3][3],A$A[3][2],A$C[1]
|| ANDN B$A[3][3],B$A[3][2],B$C[1]
|| ANDN A$A[3][4],A$A[3][3],A$C[2]
|| ANDN B$A[3][4],B$A[3][3],B$C[2]
ANDN A$A[3][0],A$A[3][4],A$C[3]
|| ANDN B$A[3][0],B$A[3][4],B$C[3]
|| XOR A$C[4],A$A[3][0],A$A[3][0]
|| XOR B$C[4],B$A[3][0],B$A[3][0]
|| ANDN A$A[3][1],A$A[3][0],A$C[4]
|| ANDN B$A[3][1],B$A[3][0],B$C[4]
XOR A$C[1],A$A[3][1],A$A[3][1]
|| XOR B$C[1],B$A[3][1],B$A[3][1]
|| XOR A$C[2],A$A[3][2],A$A[3][2]
|| XOR B$C[2],B$A[3][2],B$A[3][2]
|| XOR A$C[3],A$A[3][3],A$A[3][3]
||[A0] BNOP loop?
XOR B$C[3],B$A[3][3],B$A[3][3]
|| XOR A$C[4],A$A[3][4],A$A[3][4]
|| XOR B$C[4],B$A[3][4],B$A[3][4]
||[!A0] LDDW *FP[-7],A3:A2
||[!A0] LDDW *SP[4], RA:B2
ANDN A$A[4][2],A$A[4][1],A$C[4]
|| ANDN B$A[4][2],B$A[4][1],B$C[4]
|| ANDN A$A[4][3],A$A[4][2],A$C[1]
|| ANDN B$A[4][3],B$A[4][2],B$C[1]
|| ANDN A$A[4][4],A$A[4][3],A$C[2]
|| ANDN B$A[4][4],B$A[4][3],B$C[2]
ANDN A$A[4][0],A$A[4][4],A$C[3]
|| ANDN B$A[4][0],B$A[4][4],B$C[3]
|| XOR A$C[4],A$A[4][0],A$A[4][0]
|| XOR B$C[4],B$A[4][0],B$A[4][0]
|| ANDN A$A[4][1],A$A[4][0],A$C[4]
|| ANDN B$A[4][1],B$A[4][0],B$C[4]
XOR A$C[1],A$A[4][1],A$A[4][1]
|| XOR B$C[1],B$A[4][1],B$A[4][1]
|| XOR A$C[2],A$A[4][2],A$A[4][2]
|| XOR B$C[2],B$A[4][2],B$A[4][2]
|| XOR A$C[3],A$A[4][3],A$A[4][3]
|| XOR B$C[3],B$A[4][3],B$A[4][3]
XOR A$C[4],A$A[4][4],A$A[4][4]
|| XOR B$C[4],B$A[4][4],B$A[4][4]
;;===== branch to loop? is taken here
BNOP RA,5
.endasmfunc
.newblock
.global _KeccakF1600
.align 32
_KeccakF1600:
.asmfunc stack_usage(80)
STW FP,*SP--(80) ; save frame pointer
|| MV SP,FP
STDW B13:B12,*SP[9]
|| STDW A13:A12,*FP[-4]
STDW B11:B10,*SP[8]
|| STDW A11:A10,*FP[-5]
STW RA, *SP[15]
|| STW A14,*FP[-6]
|| MV A4,A2
|| ADD 4,A4,B2
LDW *A2++[2],A$A[0][0] ; load A[5][5]
|| LDW *B2++[2],B$A[0][0]
LDW *A2++[2],A$A[0][1]
|| LDW *B2++[2],B$A[0][1]
LDW *A2++[2],A$A[0][2]
|| LDW *B2++[2],B$A[0][2]
LDW *A2++[2],A$A[0][3]
|| LDW *B2++[2],B$A[0][3]
LDW *A2++[2],A$A[0][4]
|| LDW *B2++[2],B$A[0][4]
LDW *A2++[2],A$A[1][0]
|| LDW *B2++[2],B$A[1][0]
LDW *A2++[2],A$A[1][1]
|| LDW *B2++[2],B$A[1][1]
LDW *A2++[2],A$A[1][2]
|| LDW *B2++[2],B$A[1][2]
LDW *A2++[2],A$A[1][3]
|| LDW *B2++[2],B$A[1][3]
LDW *A2++[2],A$A[1][4]
|| LDW *B2++[2],B$A[1][4]
LDW *A2++[2],A$A[2][0]
|| LDW *B2++[2],B$A[2][0]
LDW *A2++[2],A$A[2][1]
|| LDW *B2++[2],B$A[2][1]
LDW *A2++[2],A$A[2][2]
|| LDW *B2++[2],B$A[2][2]
LDW *A2++[2],A$A[2][3]
|| LDW *B2++[2],B$A[2][3]
LDW *A2++[2],A$A[2][4]
|| LDW *B2++[2],B$A[2][4]
LDW *A2++[2],A$A[3][0]
|| LDW *B2++[2],B$A[3][0]
LDW *A2++[2],A$A[3][1]
|| LDW *B2++[2],B$A[3][1]
LDW *A2++[2],A$A[3][2]
|| LDW *B2++[2],B$A[3][2]
LDW *A2++[2],A$A[3][3]
|| LDW *B2++[2],B$A[3][3]
LDW *A2++[2],A$A[3][4]
|| LDW *B2++[2],B$A[3][4]
|| BNOP _KeccakF1600_int
ADDKPC ret?,RA
|| LDW *A2++[2],A$A[4][0]
|| LDW *B2++[2],B$A[4][0]
LDW *A2++[2],A$A[4][1]
|| LDW *B2++[2],B$A[4][1]
LDW *A2++[2],A$A[4][2]
|| LDW *B2++[2],B$A[4][2]
LDW *A2++[2],A$A[4][3]
|| LDW *B2++[2],B$A[4][3]
LDW *A2,A$A[4][4]
|| LDW *B2,B$A[4][4]
|| ADDK -192,A2 ; rewind
|| ADDK -192,B2
.align 16
ret?:
STW A$A[0][0],*A2++[2] ; store A[5][5]
|| STW B$A[0][0],*B2++[2]
STW A$A[0][1],*A2++[2]
|| STW B$A[0][1],*B2++[2]
STW A$A[0][2],*A2++[2]
|| STW B$A[0][2],*B2++[2]
STW A$A[0][3],*A2++[2]
|| STW B$A[0][3],*B2++[2]
STW A$A[0][4],*A2++[2]
|| STW B$A[0][4],*B2++[2]
STW A$A[1][0],*A2++[2]
|| STW B$A[1][0],*B2++[2]
STW A$A[1][1],*A2++[2]
|| STW B$A[1][1],*B2++[2]
STW A$A[1][2],*A2++[2]
|| STW B$A[1][2],*B2++[2]
STW A$A[1][3],*A2++[2]
|| STW B$A[1][3],*B2++[2]
STW A$A[1][4],*A2++[2]
|| STW B$A[1][4],*B2++[2]
STW A$A[2][0],*A2++[2]
|| STW B$A[2][0],*B2++[2]
STW A$A[2][1],*A2++[2]
|| STW B$A[2][1],*B2++[2]
STW A$A[2][2],*A2++[2]
|| STW B$A[2][2],*B2++[2]
STW A$A[2][3],*A2++[2]
|| STW B$A[2][3],*B2++[2]
STW A$A[2][4],*A2++[2]
|| STW B$A[2][4],*B2++[2]
STW A$A[3][0],*A2++[2]
|| STW B$A[3][0],*B2++[2]
STW A$A[3][1],*A2++[2]
|| STW B$A[3][1],*B2++[2]
STW A$A[3][2],*A2++[2]
|| STW B$A[3][2],*B2++[2]
STW A$A[3][3],*A2++[2]
|| STW B$A[3][3],*B2++[2]
STW A$A[3][4],*A2++[2]
|| STW B$A[3][4],*B2++[2]
LDW *SP[15],RA
|| LDW *FP[-6],A14
STW A$A[4][0],*A2++[2]
|| STW B$A[4][0],*B2++[2]
STW A$A[4][1],*A2++[2]
|| STW B$A[4][1],*B2++[2]
STW A$A[4][2],*A2++[2]
|| STW B$A[4][2],*B2++[2]
STW A$A[4][3],*A2++[2]
|| STW B$A[4][3],*B2++[2]
STW A$A[4][4],*A2
|| STW B$A[4][4],*B2
|| ADDK -192,A2 ; rewind
MV A2,A4 ; return original A4
|| LDDW *SP[8], B11:B10
|| LDDW *FP[-5],A11:A10
LDDW *SP[9], B13:B12
|| LDDW *FP[-4],A13:A12
|| BNOP RA
LDW *++SP(80),FP ; restore frame pointer
NOP 4 ; wait till FP is committed
.endasmfunc
.newblock
.asg B2,BSZ
.asg A2,INP
.asg A3,LEN
.global _SHA3_absorb
.align 32
_SHA3_absorb:
.asmfunc stack_usage(80)
STW FP,*SP--(80) ; save frame pointer
|| MV SP,FP
STDW B13:B12,*SP[9]
|| STDW A13:A12,*FP[-4]
STDW B11:B10,*SP[8]
|| STDW A11:A10,*FP[-5]
STW RA, *SP[15]
|| STW A14,*FP[-6]
STW A4,*SP[1] ; save A[][]
|| MV B4,INP ; reassign arguments
|| MV A6,LEN
|| MV B6,BSZ
|| ADD 4,A4,B4
LDW *A4++[2],A$A[0][0] ; load A[5][5]
|| LDW *B4++[2],B$A[0][0]
LDW *A4++[2],A$A[0][1]
|| LDW *B4++[2],B$A[0][1]
LDW *A4++[2],A$A[0][2]
|| LDW *B4++[2],B$A[0][2]
LDW *A4++[2],A$A[0][3]
|| LDW *B4++[2],B$A[0][3]
LDW *A4++[2],A$A[0][4]
|| LDW *B4++[2],B$A[0][4]
LDW *A4++[2],A$A[1][0]
|| LDW *B4++[2],B$A[1][0]
LDW *A4++[2],A$A[1][1]
|| LDW *B4++[2],B$A[1][1]
LDW *A4++[2],A$A[1][2]
|| LDW *B4++[2],B$A[1][2]
LDW *A4++[2],A$A[1][3]
|| LDW *B4++[2],B$A[1][3]
LDW *A4++[2],A$A[1][4]
|| LDW *B4++[2],B$A[1][4]
LDW *A4++[2],A$A[2][0]
|| LDW *B4++[2],B$A[2][0]
LDW *A4++[2],A$A[2][1]
|| LDW *B4++[2],B$A[2][1]
LDW *A4++[2],A$A[2][2]
|| LDW *B4++[2],B$A[2][2]
LDW *A4++[2],A$A[2][3]
|| LDW *B4++[2],B$A[2][3]
LDW *A4++[2],A$A[2][4]
|| LDW *B4++[2],B$A[2][4]
LDW *A4++[2],A$A[3][0]
|| LDW *B4++[2],B$A[3][0]
LDW *A4++[2],A$A[3][1]
|| LDW *B4++[2],B$A[3][1]
LDW *A4++[2],A$A[3][2]
|| LDW *B4++[2],B$A[3][2]
LDW *A4++[2],A$A[3][3]
|| LDW *B4++[2],B$A[3][3]
LDW *A4++[2],A$A[3][4]
|| LDW *B4++[2],B$A[3][4]
LDW *A4++[2],A$A[4][0]
|| LDW *B4++[2],B$A[4][0]
LDW *A4++[2],A$A[4][1]
|| LDW *B4++[2],B$A[4][1]
LDW *A4++[2],A$A[4][2]
|| LDW *B4++[2],B$A[4][2]
LDW *A4++[2],A$A[4][3]
|| LDW *B4++[2],B$A[4][3]
LDW *A4,A$A[4][4]
|| LDW *B4,B$A[4][4]
|| ADDKPC loop?,RA
STDW RA:BSZ,*SP[4]
loop?:
CMPLTU LEN,BSZ,A0 ; len < bsz?
|| SHRU BSZ,3,BSZ
[A0] BNOP ret?
||[A0] ZERO BSZ
||[A0] LDW *SP[1],A2 ; pull A[][]
[BSZ] LDNDW *INP++,A1:A0
||[BSZ] SUB LEN,8,LEN
||[BSZ] SUB BSZ,1,BSZ
NOP 4
___
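# Each 64-bit input lane is converted to the split representation on the fly:
# after the optional big-endian byte swaps, DEAL de-interleaves the even- and
# odd-numbered bits of each 32-bit half, and PACK2/PACKH2 then gather the
# even bits into the A-file word and the odd bits into the B-file word before
# they are xor-ed into the state.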
for ($y = 0; $y < 5; $y++) {
for ($x = 0; $x < ($y<4 ? 5 : 4); $x++) {
$code.=<<___;
.if .BIG_ENDIAN
SWAP2 A0,A1
|| SWAP2 A1,A0
SWAP4 A0,A0
SWAP4 A1,A1
||[!BSZ]BNOP _KeccakF1600_cheat
||[!BSZ]STDW LEN:INP,*SP[3]
|| DEAL A0,A0
.else
[!BSZ]BNOP _KeccakF1600_cheat
||[!BSZ]STDW LEN:INP,*SP[3]
|| DEAL A0,A0
.endif
[BSZ] LDNDW *INP++,A1:A0
|| DEAL A1,A1
[BSZ] SUB LEN,8,LEN
||[BSZ] SUB BSZ,1,BSZ
PACK2 A1,A0,A0
|| PACKH2 A1,A0,A1
XOR A0,A$A[$y][$x],A$A[$y][$x]
XOR A1,B$A[$y][$x],B$A[$y][$x]
___
}
}
$code.=<<___;
.if .BIG_ENDIAN
SWAP2 A0,A1
|| SWAP2 A1,A0
SWAP4 A0,A0
SWAP4 A1,A1
.endif
BNOP _KeccakF1600_cheat
|| STDW LEN:INP,*SP[3]
|| DEAL A0,A0
DEAL A1,A1
NOP
PACK2 A1,A0,A0
|| PACKH2 A1,A0,A1
XOR A0,A$A[4][4],A$A[4][4]
XOR A1,B$A[4][4],B$A[4][4]
.align 16
ret?:
MV LEN,A4 ; return value
|| ADD 4,A2,B2
STW A$A[0][0],*A2++[2] ; store A[5][5]
|| STW B$A[0][0],*B2++[2]
STW A$A[0][1],*A2++[2]
|| STW B$A[0][1],*B2++[2]
STW A$A[0][2],*A2++[2]
|| STW B$A[0][2],*B2++[2]
STW A$A[0][3],*A2++[2]
|| STW B$A[0][3],*B2++[2]
STW A$A[0][4],*A2++[2]
|| STW B$A[0][4],*B2++[2]
STW A$A[1][0],*A2++[2]
|| STW B$A[1][0],*B2++[2]
STW A$A[1][1],*A2++[2]
|| STW B$A[1][1],*B2++[2]
STW A$A[1][2],*A2++[2]
|| STW B$A[1][2],*B2++[2]
STW A$A[1][3],*A2++[2]
|| STW B$A[1][3],*B2++[2]
STW A$A[1][4],*A2++[2]
|| STW B$A[1][4],*B2++[2]
STW A$A[2][0],*A2++[2]
|| STW B$A[2][0],*B2++[2]
STW A$A[2][1],*A2++[2]
|| STW B$A[2][1],*B2++[2]
STW A$A[2][2],*A2++[2]
|| STW B$A[2][2],*B2++[2]
STW A$A[2][3],*A2++[2]
|| STW B$A[2][3],*B2++[2]
STW A$A[2][4],*A2++[2]
|| STW B$A[2][4],*B2++[2]
LDW *SP[15],RA
|| LDW *FP[-6],A14
STW A$A[3][0],*A2++[2]
|| STW B$A[3][0],*B2++[2]
STW A$A[3][1],*A2++[2]
|| STW B$A[3][1],*B2++[2]
STW A$A[3][2],*A2++[2]
|| STW B$A[3][2],*B2++[2]
STW A$A[3][3],*A2++[2]
|| STW B$A[3][3],*B2++[2]
STW A$A[3][4],*A2++[2]
|| STW B$A[3][4],*B2++[2]
LDDW *SP[8], B11:B10
|| LDDW *FP[-5],A11:A10
LDDW *SP[9], B13:B12
|| LDDW *FP[-4],A13:A12
BNOP RA
|| LDW *++SP(80),FP ; restore frame pointer
STW A$A[4][0],*A2++[2]
|| STW B$A[4][0],*B2++[2]
STW A$A[4][1],*A2++[2]
|| STW B$A[4][1],*B2++[2]
STW A$A[4][2],*A2++[2]
|| STW B$A[4][2],*B2++[2]
STW A$A[4][3],*A2++[2]
|| STW B$A[4][3],*B2++[2]
STW A$A[4][4],*A2++[2]
|| STW B$A[4][4],*B2++[2]
.endasmfunc
.newblock
.global _SHA3_squeeze
.asg A12,OUT
.asg A13,LEN
.asg A14,BSZ
.align 32
_SHA3_squeeze:
.asmfunc stack_usage(24)
STW FP,*SP--(24) ; save frame pointer
|| MV SP,FP
STW RA, *SP[5]
|| STW A14,*FP[-2]
STDW A13:A12,*FP[-2]
|| MV B4,OUT ; reassign arguments
MV A6,LEN
|| MV B6,BSZ
loop?:
LDW *SP[5],RA ; reload RA
|| SHRU BSZ,3,A1
|| MV A4,A8
|| ADD 4,A4,B8
block?:
CMPLTU LEN,8,A0 ; len < 8?
[A0] BNOP tail?
LDW *A8++[2],A9
|| LDW *B8++[2],B9
|| SUB LEN,8,LEN ; len -= 8
MV LEN,A0
|| SUB A1,1,A1 ; bsz--
|| NOP 4
.if .BIG_ENDIAN
SWAP4 A9,A9
|| SWAP4 B9,B9
SWAP2 A9,A9
|| SWAP2 B9,B9
.endif
[!A0] BNOP ret?
||[!A0] ZERO A1
PACK2 B9,A9,B7
||[A1] BNOP block?
PACKH2 B9,A9,B9
|| SHFL B7,B7
SHFL B9,B9
STNW B7,*OUT++
STNW B9,*OUT++
NOP
BNOP _KeccakF1600,4
ADDKPC loop?,RA
.align 16
tail?:
.if .BIG_ENDIAN
SWAP4 A9,A9
|| SWAP4 B9,B9
SWAP2 A9,A9
|| SWAP2 B9,B9
.endif
PACK2 B9,A9,B7
PACKH2 B9,A9,B9
|| SHFL B7,B7
SHFL B9,B9
STB B7,*OUT++
|| SHRU B7,8,B7
|| ADD LEN,7,A0
[A0] STB B7,*OUT++
||[A0] SHRU B7,8,B7
||[A0] SUB A0,1,A0
[A0] STB B7,*OUT++
||[A0] SHRU B7,8,B7
||[A0] SUB A0,1,A0
[A0] STB B7,*OUT++
||[A0] SUB A0,1,A0
[A0] STB B9,*OUT++
||[A0] SHRU B9,8,B9
||[A0] SUB A0,1,A0
[A0] STB B9,*OUT++
||[A0] SHRU B9,8,B9
||[A0] SUB A0,1,A0
[A0] STB B9,*OUT++
ret?:
LDDW *FP[-2],A13:A12
BNOP RA
|| LDW *FP[-2],A14
LDW *++SP(24),FP ; restore frame pointer
NOP 4 ; wait till FP is committed
.endasmfunc
.if __TI_EABI__
.sect ".text:sha_asm.const"
.else
.sect ".const:sha_asm"
.endif
.align 256
.uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
iotas:
.uword 0x00000001, 0x00000000
.uword 0x00000000, 0x00000089
.uword 0x00000000, 0x8000008b
.uword 0x00000000, 0x80008080
.uword 0x00000001, 0x0000008b
.uword 0x00000001, 0x00008000
.uword 0x00000001, 0x80008088
.uword 0x00000001, 0x80000082
.uword 0x00000000, 0x0000000b
.uword 0x00000000, 0x0000000a
.uword 0x00000001, 0x00008082
.uword 0x00000000, 0x00008003
.uword 0x00000001, 0x0000808b
.uword 0x00000001, 0x8000000b
.uword 0x00000001, 0x8000008a
.uword 0x00000001, 0x80000081
.uword 0x00000000, 0x80000081
.uword 0x00000000, 0x80000008
.uword 0x00000000, 0x00000083
.uword 0x00000000, 0x80008003
.uword 0x00000001, 0x80008088
.uword 0x00000000, 0x80000088
.uword 0x00000001, 0x00008000
.uword 0x00000000, 0x80008082
.cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$output=pop;
open STDOUT,">$output";
print $code;
close STDOUT;


@ -0,0 +1,440 @@
#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86 MMX.
#
# June 2017.
#
# The code below is a KECCAK_2X implementation (see sha/keccak1600.c) with
# C[5] held in the register bank and D[5] offloaded to memory. Though
# instead of actually unrolling the loop pair-wise, I simply flip the
# pointers to T[][] and A[][] at the end of each round. Since the number of
# rounds is even, the last round writes to A[][] and everything works out.
# It's argued that MMX is the only code path meaningful to implement
# for x86. This is because non-MMX-capable processors are an extinct
# breed, and they can just as well lurk executing compiler-generated code.
# For reference, gcc-5.x-generated KECCAK_2X code takes 89 cycles per
# processed byte on Pentium, which is a fair result. But older compilers
# produce worse code. On the other hand one can wonder why not 128-bit
# SSE2? Well, SSE2 won't provide double improvement, rather far from
# that, if any at all on some processors, because it will take extra
# permutations and inter-bank data transfers. Besides, contemporary
# CPUs are better off executing 64-bit code, and it makes less sense
# to invest in fancy 32-bit code. And the decision doesn't seem
# inadequate, if one compares the results below to "64-bit platforms in
# 32-bit mode" SIMD data points available at
# http://keccak.noekeon.org/sw_performance.html.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
# r=1088(i)
#
# PIII 30/+150%
# Pentium M 27/+150%
# P4 40/+85%
# Core 2 19/+170%
# Sandy Bridge(ii) 18/+140%
# Atom 33/+180%
# Silvermont(ii) 30/+180%
# VIA Nano(ii) 43/+60%
# Sledgehammer(ii)(iii) 24/+130%
#
# (i) Corresponds to SHA3-256. Numbers after slash are improvement
# coefficients over KECCAK_2X [with bit interleave and lane
# complementing] position-independent *scalar* code generated
# by gcc-5.x. It's not exactly fair comparison, but it's a
# datapoint...
# (ii) 64-bit processor executing 32-bit code.
# (iii) Result is considered to be representative even for older AMD
# processors.
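# The pointer flip described above can be pictured with a few lines of plain
# Perl. This sketch is not used by the generator; the one-round callback
# ($round) is hypothetical and stands in for the Theta/Rho/Pi/Chi/Iota code
# emitted below, which reads the state via "esi" and writes via "edi".
sub keccak_2x_sketch {
    my ($round) = @_;                 # $round->($src, $dst): one Keccak round
    my @A = (0) x 25;                 # the caller-visible A[][] buffer
    my @T = (0) x 25;                 # scratch T[][] buffer
    my ($src, $dst) = (\@A, \@T);
    for (1 .. 24) {
        $round->($src, $dst);         # read *$src, write permuted state to *$dst
        ($src, $dst) = ($dst, $src);  # flip pointers instead of copying
    }
    return \@A;                       # 24 rounds is even, so the result is in @A
}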
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output=pop;
open STDOUT,">$output";
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
my @C = map("mm$_",(0..4));
my @T = map("mm$_",(5..7));
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
my @D = map(8*$_+4, (0..4));
my @rhotates = ([ 0, 1, 62, 28, 27 ],
[ 36, 44, 6, 55, 20 ],
[ 3, 10, 43, 25, 39 ],
[ 41, 45, 15, 21, 8 ],
[ 18, 2, 61, 56, 14 ]);
&static_label("iotas");
&function_begin_B("_KeccakF1600");
&movq (@C[0],&QWP($A[4][0],"esi"));
&movq (@C[1],&QWP($A[4][1],"esi"));
&movq (@C[2],&QWP($A[4][2],"esi"));
&movq (@C[3],&QWP($A[4][3],"esi"));
&movq (@C[4],&QWP($A[4][4],"esi"));
&mov ("ecx",24); # loop counter
&jmp (&label("loop"));
&set_label("loop",16);
######################################### Theta
&pxor (@C[0],&QWP($A[0][0],"esi"));
&pxor (@C[1],&QWP($A[0][1],"esi"));
&pxor (@C[2],&QWP($A[0][2],"esi"));
&pxor (@C[3],&QWP($A[0][3],"esi"));
&pxor (@C[4],&QWP($A[0][4],"esi"));
&pxor (@C[0],&QWP($A[1][0],"esi"));
&pxor (@C[1],&QWP($A[1][1],"esi"));
&pxor (@C[2],&QWP($A[1][2],"esi"));
&pxor (@C[3],&QWP($A[1][3],"esi"));
&pxor (@C[4],&QWP($A[1][4],"esi"));
&pxor (@C[0],&QWP($A[2][0],"esi"));
&pxor (@C[1],&QWP($A[2][1],"esi"));
&pxor (@C[2],&QWP($A[2][2],"esi"));
&pxor (@C[3],&QWP($A[2][3],"esi"));
&pxor (@C[4],&QWP($A[2][4],"esi"));
&pxor (@C[2],&QWP($A[3][2],"esi"));
&pxor (@C[0],&QWP($A[3][0],"esi"));
&pxor (@C[1],&QWP($A[3][1],"esi"));
&pxor (@C[3],&QWP($A[3][3],"esi"));
&movq (@T[0],@C[2]);
&pxor (@C[4],&QWP($A[3][4],"esi"));
&movq (@T[2],@C[2]);
&psrlq (@T[0],63);
&movq (@T[1],@C[0]);
&psllq (@T[2],1);
&pxor (@T[0],@C[0]);
&psrlq (@C[0],63);
&pxor (@T[0],@T[2]);
&psllq (@T[1],1);
&movq (@T[2],@C[1]);
&movq (&QWP(@D[1],"esp"),@T[0]); # D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
&pxor (@T[1],@C[0]);
&psrlq (@T[2],63);
&pxor (@T[1],@C[3]);
&movq (@C[0],@C[1]);
&movq (&QWP(@D[4],"esp"),@T[1]); # D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
&psllq (@C[0],1);
&pxor (@T[2],@C[4]);
&pxor (@C[0],@T[2]);
&movq (@T[2],@C[3]);
&psrlq (@C[3],63);
&movq (&QWP(@D[0],"esp"),@C[0]); # D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
&psllq (@T[2],1);
&movq (@T[0],@C[4]);
&psrlq (@C[4],63);
&pxor (@C[1],@C[3]);
&psllq (@T[0],1);
&pxor (@C[1],@T[2]);
&pxor (@C[2],@C[4]);
&movq (&QWP(@D[2],"esp"),@C[1]); # D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
&pxor (@C[2],@T[0]);
######################################### first Rho(0) is special
&movq (@C[3],&QWP($A[3][3],"esi"));
&movq (&QWP(@D[3],"esp"),@C[2]); # D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
&pxor (@C[3],@C[2]);
&movq (@C[4],&QWP($A[4][4],"esi"));
&movq (@T[2],@C[3]);
&psrlq (@C[3],64-$rhotates[3][3]);
&pxor (@C[4],@T[1]);
&psllq (@T[2],$rhotates[3][3]);
&movq (@T[1],@C[4]);
&psrlq (@C[4],64-$rhotates[4][4]);
&por (@C[3],@T[2]); # C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
&psllq (@T[1],$rhotates[4][4]);
&movq (@C[2],&QWP($A[2][2],"esi"));
&por (@C[4],@T[1]); # C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
&pxor (@C[2],@C[1]);
&movq (@C[1],&QWP($A[1][1],"esi"));
&movq (@T[1],@C[2]);
&psrlq (@C[2],64-$rhotates[2][2]);
&pxor (@C[1],&QWP(@D[1],"esp"));
&psllq (@T[1],$rhotates[2][2]);
&movq (@T[2],@C[1]);
&psrlq (@C[1],64-$rhotates[1][1]);
&por (@C[2],@T[1]); # C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
&psllq (@T[2],$rhotates[1][1]);
&pxor (@C[0],&QWP($A[0][0],"esi")); # /* rotate by 0 */ /* D[0] */
&por (@C[1],@T[2]); # C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
sub Chi() { ######### regular Chi step
my ($y,$xrho) = @_;
&movq (@T[0],@C[1]);
&movq (@T[1],@C[2]);
&pandn (@T[0],@C[2]);
&pandn (@C[2],@C[3]);
&pxor (@T[0],@C[0]);
&pxor (@C[2],@C[1]);
&pxor (@T[0],&QWP(0,"ebx")) if ($y == 0);
&lea ("ebx",&DWP(8,"ebx")) if ($y == 0);
&movq (@T[2],@C[3]);
&movq (&QWP($A[$y][0],"edi"),@T[0]); # R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
&movq (@T[0],@C[4]);
&pandn (@C[3],@C[4]);
&pandn (@C[4],@C[0]);
&pxor (@C[3],@T[1]);
&movq (&QWP($A[$y][1],"edi"),@C[2]); # R[0][1] = C[1] ^ (~C[2] & C[3]);
&pxor (@C[4],@T[2]);
&movq (@T[2],&QWP($A[0][$xrho],"esi")) if (defined($xrho));
&movq (&QWP($A[$y][2],"edi"),@C[3]); # R[0][2] = C[2] ^ (~C[3] & C[4]);
&pandn (@C[0],@C[1]);
&movq (&QWP($A[$y][3],"edi"),@C[4]); # R[0][3] = C[3] ^ (~C[4] & C[0]);
&pxor (@C[0],@T[0]);
&pxor (@T[2],&QWP(@D[$xrho],"esp")) if (defined($xrho));
&movq (&QWP($A[$y][4],"edi"),@C[0]); # R[0][4] = C[4] ^ (~C[0] & C[1]);
}
&Chi (0, 3);
sub Rho() { ######### regular Rho step
my $x = shift;
#&movq (@T[2],&QWP($A[0][$x],"esi")); # moved to Chi
#&pxor (@T[2],&QWP(@D[$x],"esp")); # moved to Chi
&movq (@C[0],@T[2]);
&psrlq (@T[2],64-$rhotates[0][$x]);
&movq (@C[1],&QWP($A[1][($x+1)%5],"esi"));
&psllq (@C[0],$rhotates[0][$x]);
&pxor (@C[1],&QWP(@D[($x+1)%5],"esp"));
&por (@C[0],@T[2]); # C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
&movq (@T[1],@C[1]);
&psrlq (@C[1],64-$rhotates[1][($x+1)%5]);
&movq (@C[2],&QWP($A[2][($x+2)%5],"esi"));
&psllq (@T[1],$rhotates[1][($x+1)%5]);
&pxor (@C[2],&QWP(@D[($x+2)%5],"esp"));
&por (@C[1],@T[1]); # C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
&movq (@T[2],@C[2]);
&psrlq (@C[2],64-$rhotates[2][($x+2)%5]);
&movq (@C[3],&QWP($A[3][($x+3)%5],"esi"));
&psllq (@T[2],$rhotates[2][($x+2)%5]);
&pxor (@C[3],&QWP(@D[($x+3)%5],"esp"));
&por (@C[2],@T[2]); # C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
&movq (@T[0],@C[3]);
&psrlq (@C[3],64-$rhotates[3][($x+3)%5]);
&movq (@C[4],&QWP($A[4][($x+4)%5],"esi"));
&psllq (@T[0],$rhotates[3][($x+3)%5]);
&pxor (@C[4],&QWP(@D[($x+4)%5],"esp"));
&por (@C[3],@T[0]); # C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
&movq (@T[1],@C[4]);
&psrlq (@C[4],64-$rhotates[4][($x+4)%5]);
&psllq (@T[1],$rhotates[4][($x+4)%5]);
&por (@C[4],@T[1]); # C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
}
&Rho (3); &Chi (1, 1);
&Rho (1); &Chi (2, 4);
&Rho (4); &Chi (3, 2);
&Rho (2); ###&Chi (4);
&movq (@T[0],@C[0]); ######### last Chi(4) is special
&xor ("edi","esi"); # &xchg ("esi","edi");
&movq (&QWP(@D[1],"esp"),@C[1]);
&xor ("esi","edi");
&xor ("edi","esi");
&movq (@T[1],@C[1]);
&movq (@T[2],@C[2]);
&pandn (@T[1],@C[2]);
&pandn (@T[2],@C[3]);
&pxor (@C[0],@T[1]);
&pxor (@C[1],@T[2]);
&movq (@T[1],@C[3]);
&movq (&QWP($A[4][0],"esi"),@C[0]); # R[4][0] = C[0] ^= (~C[1] & C[2]);
&pandn (@T[1],@C[4]);
&movq (&QWP($A[4][1],"esi"),@C[1]); # R[4][1] = C[1] ^= (~C[2] & C[3]);
&pxor (@C[2],@T[1]);
&movq (@T[2],@C[4]);
&movq (&QWP($A[4][2],"esi"),@C[2]); # R[4][2] = C[2] ^= (~C[3] & C[4]);
&pandn (@T[2],@T[0]);
&pandn (@T[0],&QWP(@D[1],"esp"));
&pxor (@C[3],@T[2]);
&pxor (@C[4],@T[0]);
&movq (&QWP($A[4][3],"esi"),@C[3]); # R[4][3] = C[3] ^= (~C[4] & D[0]);
&sub ("ecx",1);
&movq (&QWP($A[4][4],"esi"),@C[4]); # R[4][4] = C[4] ^= (~D[0] & D[1]);
&jnz (&label("loop"));
&lea ("ebx",&DWP(-192,"ebx")); # rewind iotas
&ret ();
&function_end_B("_KeccakF1600");
&function_begin("KeccakF1600");
&mov ("esi",&wparam(0));
&mov ("ebp","esp");
&sub ("esp",240);
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
&and ("esp",-8);
&lea ("esi",&DWP(100,"esi")); # size optimization
&lea ("edi",&DWP(8*5+100,"esp")); # size optimization
&call ("_KeccakF1600");
&mov ("esp","ebp");
&emms ();
&function_end("KeccakF1600");
&function_begin("SHA3_absorb");
&mov ("esi",&wparam(0)); # A[][]
&mov ("eax",&wparam(1)); # inp
&mov ("ecx",&wparam(2)); # len
&mov ("edx",&wparam(3)); # bsz
&mov ("ebp","esp");
&sub ("esp",240+8);
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
&and ("esp",-8);
&mov ("edi","esi");
&lea ("esi",&DWP(100,"esi")); # size optimization
&mov (&DWP(-4,"ebp"),"edx"); # save bsz
&jmp (&label("loop"));
&set_label("loop",16);
&cmp ("ecx","edx"); # len < bsz?
&jc (&label("absorbed"));
&shr ("edx",3); # bsz /= 8
&set_label("block");
&movq ("mm0",&QWP(0,"eax"));
&lea ("eax",&DWP(8,"eax"));
&pxor ("mm0",&QWP(0,"edi"));
&lea ("edi",&DWP(8,"edi"));
&sub ("ecx",8); # len -= 8
&movq (&QWP(-8,"edi"),"mm0");
&dec ("edx"); # bsz--
&jnz (&label("block"));
&lea ("edi",&DWP(8*5+100,"esp")); # size optimization
&mov (&DWP(-8,"ebp"),"ecx"); # save len
&call ("_KeccakF1600");
&mov ("ecx",&DWP(-8,"ebp")); # pull len
&mov ("edx",&DWP(-4,"ebp")); # pull bsz
&lea ("edi",&DWP(-100,"esi"));
&jmp (&label("loop"));
&set_label("absorbed",16);
&mov ("eax","ecx"); # return value
&mov ("esp","ebp");
&emms ();
&function_end("SHA3_absorb");
&function_begin("SHA3_squeeze");
&mov ("esi",&wparam(0)); # A[][]
&mov ("eax",&wparam(1)); # out
&mov ("ecx",&wparam(2)); # len
&mov ("edx",&wparam(3)); # bsz
&mov ("ebp","esp");
&sub ("esp",240+8);
&call (&label("pic_point"));
&set_label("pic_point");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
&and ("esp",-8);
&shr ("edx",3); # bsz /= 8
&mov ("edi","esi");
&lea ("esi",&DWP(100,"esi")); # size optimization
&mov (&DWP(-4,"ebp"),"edx"); # save bsz
&jmp (&label("loop"));
&set_label("loop",16);
&cmp ("ecx",8); # len < 8?
&jc (&label("tail"));
&movq ("mm0",&QWP(0,"edi"));
&lea ("edi",&DWP(8,"edi"));
&movq (&QWP(0,"eax"),"mm0");
&lea ("eax",&DWP(8,"eax"));
&sub ("ecx",8); # len -= 8
&jz (&label("done"));
&dec ("edx"); # bsz--
&jnz (&label("loop"));
&lea ("edi",&DWP(8*5+100,"esp")); # size optimization
&mov (&DWP(-8,"ebp"),"ecx"); # save len
&call ("_KeccakF1600");
&mov ("ecx",&DWP(-8,"ebp")); # pull len
&mov ("edx",&DWP(-4,"ebp")); # pull bsz
&lea ("edi",&DWP(-100,"esi"));
&jmp (&label("loop"));
&set_label("tail",16);
&mov ("esi","edi");
&mov ("edi","eax");
&data_word("0xA4F39066"); # rep movsb
&set_label("done");
&mov ("esp","ebp");
&emms ();
&function_end("SHA3_squeeze");
&set_label("iotas",32);
&data_word(0x00000001,0x00000000);
&data_word(0x00008082,0x00000000);
&data_word(0x0000808a,0x80000000);
&data_word(0x80008000,0x80000000);
&data_word(0x0000808b,0x00000000);
&data_word(0x80000001,0x00000000);
&data_word(0x80008081,0x80000000);
&data_word(0x00008009,0x80000000);
&data_word(0x0000008a,0x00000000);
&data_word(0x00000088,0x00000000);
&data_word(0x80008009,0x00000000);
&data_word(0x8000000a,0x00000000);
&data_word(0x8000808b,0x00000000);
&data_word(0x0000008b,0x80000000);
&data_word(0x00008089,0x80000000);
&data_word(0x00008003,0x80000000);
&data_word(0x00008002,0x80000000);
&data_word(0x00000080,0x80000000);
&data_word(0x0000800a,0x00000000);
&data_word(0x8000000a,0x80000000);
&data_word(0x80008081,0x80000000);
&data_word(0x00008080,0x80000000);
&data_word(0x80000001,0x00000000);
&data_word(0x80008008,0x80000000);
&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;


@ -0,0 +1,758 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PPC64.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT implementation that works on
# *any* PPC64. PowerISA 2.07 then adds a 2x64-bit vector rotate, and
# it's possible to achieve performance better than below, but that is
# naturally an option only for POWER8 and successors...
#
######################################################################
# Numbers are cycles per processed byte.
#
# r=1088(*)
#
# PPC970/G5 14.6/+120%
# POWER7 10.3/+100%
# POWER8 11.5/+85%
# POWER9 9.4/+45%
#
# (*) Corresponds to SHA3-256. Percentage after slash is improvement
# over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
# much better (but watch out for them generating code specific
# to processor they execute on).
$flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} else { die "nonsense $flavour"; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=24*$SIZE_T+6*$SIZE_T+32;
$LOCALS=6*$SIZE_T;
$TEMP=$LOCALS+6*$SIZE_T;
my $sp ="r1";
my @A = map([ "r$_", "r".($_+1), "r".($_+2), "r".($_+3), "r".($_+4) ],
(7, 12, 17, 22, 27));
$A[1][1] = "r6"; # r13 is reserved
my @C = map("r$_", (0,3,4,5));
my @rhotates = ([ 0, 1, 62, 28, 27 ],
[ 36, 44, 6, 55, 20 ],
[ 3, 10, 43, 25, 39 ],
[ 41, 45, 15, 21, 8 ],
[ 18, 2, 61, 56, 14 ]);
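# A compact scalar reference for the Theta step that the assembly below
# implements; it is not called anywhere and assumes a perl built with 64-bit
# integers. It is included only to document the C[]/D[] data flow that the
# register allocation and the $TEMP spills are serving.
sub theta_ref {
    my ($A) = @_;                     # $A: ref to a 5x5 array of 64-bit lanes
    my @C;
    for my $x (0 .. 4) {
        $C[$x] = $A->[0][$x] ^ $A->[1][$x] ^ $A->[2][$x]
               ^ $A->[3][$x] ^ $A->[4][$x];
    }
    for my $x (0 .. 4) {
        my $c1 = $C[($x + 1) % 5];
        my $rol1 = (($c1 << 1) | ($c1 >> 63)) & 0xFFFFFFFFFFFFFFFF;
        my $d = $C[($x + 4) % 5] ^ $rol1;      # D[x] = C[x-1] ^ ROL64(C[x+1],1)
        $A->[$_][$x] ^= $d for 0 .. 4;
    }
    return $A;
}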
$code.=<<___;
.text
.type KeccakF1600_int,\@function
.align 5
KeccakF1600_int:
li r0,24
mtctr r0
b .Loop
.align 4
.Loop:
xor $C[0],$A[0][0],$A[1][0] ; Theta
std $A[0][4],`$TEMP+0`($sp)
xor $C[1],$A[0][1],$A[1][1]
std $A[1][4],`$TEMP+8`($sp)
xor $C[2],$A[0][2],$A[1][2]
std $A[2][4],`$TEMP+16`($sp)
xor $C[3],$A[0][3],$A[1][3]
std $A[3][4],`$TEMP+24`($sp)
___
$C[4]=$A[0][4];
$C[5]=$A[1][4];
$C[6]=$A[2][4];
$C[7]=$A[3][4];
$code.=<<___;
xor $C[4],$A[0][4],$A[1][4]
xor $C[0],$C[0],$A[2][0]
xor $C[1],$C[1],$A[2][1]
xor $C[2],$C[2],$A[2][2]
xor $C[3],$C[3],$A[2][3]
xor $C[4],$C[4],$A[2][4]
xor $C[0],$C[0],$A[3][0]
xor $C[1],$C[1],$A[3][1]
xor $C[2],$C[2],$A[3][2]
xor $C[3],$C[3],$A[3][3]
xor $C[4],$C[4],$A[3][4]
xor $C[0],$C[0],$A[4][0]
xor $C[2],$C[2],$A[4][2]
xor $C[1],$C[1],$A[4][1]
xor $C[3],$C[3],$A[4][3]
rotldi $C[5],$C[2],1
xor $C[4],$C[4],$A[4][4]
rotldi $C[6],$C[3],1
xor $C[5],$C[5],$C[0]
rotldi $C[7],$C[4],1
xor $A[0][1],$A[0][1],$C[5]
xor $A[1][1],$A[1][1],$C[5]
xor $A[2][1],$A[2][1],$C[5]
xor $A[3][1],$A[3][1],$C[5]
xor $A[4][1],$A[4][1],$C[5]
rotldi $C[5],$C[0],1
xor $C[6],$C[6],$C[1]
xor $C[2],$C[2],$C[7]
rotldi $C[7],$C[1],1
xor $C[3],$C[3],$C[5]
xor $C[4],$C[4],$C[7]
xor $C[1], $A[0][2],$C[6] ;mr $C[1],$A[0][2]
xor $A[1][2],$A[1][2],$C[6]
xor $A[2][2],$A[2][2],$C[6]
xor $A[3][2],$A[3][2],$C[6]
xor $A[4][2],$A[4][2],$C[6]
xor $A[0][0],$A[0][0],$C[4]
xor $A[1][0],$A[1][0],$C[4]
xor $A[2][0],$A[2][0],$C[4]
xor $A[3][0],$A[3][0],$C[4]
xor $A[4][0],$A[4][0],$C[4]
___
$C[4]=undef;
$C[5]=undef;
$C[6]=undef;
$C[7]=undef;
$code.=<<___;
ld $A[0][4],`$TEMP+0`($sp)
xor $C[0], $A[0][3],$C[2] ;mr $C[0],$A[0][3]
ld $A[1][4],`$TEMP+8`($sp)
xor $A[1][3],$A[1][3],$C[2]
ld $A[2][4],`$TEMP+16`($sp)
xor $A[2][3],$A[2][3],$C[2]
ld $A[3][4],`$TEMP+24`($sp)
xor $A[3][3],$A[3][3],$C[2]
xor $A[4][3],$A[4][3],$C[2]
xor $C[2], $A[0][4],$C[3] ;mr $C[2],$A[0][4]
xor $A[1][4],$A[1][4],$C[3]
xor $A[2][4],$A[2][4],$C[3]
xor $A[3][4],$A[3][4],$C[3]
xor $A[4][4],$A[4][4],$C[3]
mr $C[3],$A[0][1] ; Rho+Pi
rotldi $A[0][1],$A[1][1],$rhotates[1][1]
;mr $C[1],$A[0][2]
rotldi $A[0][2],$A[2][2],$rhotates[2][2]
;mr $C[0],$A[0][3]
rotldi $A[0][3],$A[3][3],$rhotates[3][3]
;mr $C[2],$A[0][4]
rotldi $A[0][4],$A[4][4],$rhotates[4][4]
rotldi $A[1][1],$A[1][4],$rhotates[1][4]
rotldi $A[2][2],$A[2][3],$rhotates[2][3]
rotldi $A[3][3],$A[3][2],$rhotates[3][2]
rotldi $A[4][4],$A[4][1],$rhotates[4][1]
rotldi $A[1][4],$A[4][2],$rhotates[4][2]
rotldi $A[2][3],$A[3][4],$rhotates[3][4]
rotldi $A[3][2],$A[2][1],$rhotates[2][1]
rotldi $A[4][1],$A[1][3],$rhotates[1][3]
rotldi $A[4][2],$A[2][4],$rhotates[2][4]
rotldi $A[3][4],$A[4][3],$rhotates[4][3]
rotldi $A[2][1],$A[1][2],$rhotates[1][2]
rotldi $A[1][3],$A[3][1],$rhotates[3][1]
rotldi $A[2][4],$A[4][0],$rhotates[4][0]
rotldi $A[4][3],$A[3][0],$rhotates[3][0]
rotldi $A[1][2],$A[2][0],$rhotates[2][0]
rotldi $A[3][1],$A[1][0],$rhotates[1][0]
rotldi $A[1][0],$C[0],$rhotates[0][3]
rotldi $A[2][0],$C[3],$rhotates[0][1]
rotldi $A[3][0],$C[2],$rhotates[0][4]
rotldi $A[4][0],$C[1],$rhotates[0][2]
andc $C[0],$A[0][2],$A[0][1] ; Chi+Iota
andc $C[1],$A[0][3],$A[0][2]
andc $C[2],$A[0][0],$A[0][4]
andc $C[3],$A[0][1],$A[0][0]
xor $A[0][0],$A[0][0],$C[0]
andc $C[0],$A[0][4],$A[0][3]
xor $A[0][1],$A[0][1],$C[1]
ld $C[1],`$LOCALS+4*$SIZE_T`($sp)
xor $A[0][3],$A[0][3],$C[2]
xor $A[0][4],$A[0][4],$C[3]
xor $A[0][2],$A[0][2],$C[0]
ldu $C[3],8($C[1]) ; Iota[i++]
andc $C[0],$A[1][2],$A[1][1]
std $C[1],`$LOCALS+4*$SIZE_T`($sp)
andc $C[1],$A[1][3],$A[1][2]
andc $C[2],$A[1][0],$A[1][4]
xor $A[0][0],$A[0][0],$C[3] ; A[0][0] ^= Iota
andc $C[3],$A[1][1],$A[1][0]
xor $A[1][0],$A[1][0],$C[0]
andc $C[0],$A[1][4],$A[1][3]
xor $A[1][1],$A[1][1],$C[1]
xor $A[1][3],$A[1][3],$C[2]
xor $A[1][4],$A[1][4],$C[3]
xor $A[1][2],$A[1][2],$C[0]
andc $C[0],$A[2][2],$A[2][1]
andc $C[1],$A[2][3],$A[2][2]
andc $C[2],$A[2][0],$A[2][4]
andc $C[3],$A[2][1],$A[2][0]
xor $A[2][0],$A[2][0],$C[0]
andc $C[0],$A[2][4],$A[2][3]
xor $A[2][1],$A[2][1],$C[1]
xor $A[2][3],$A[2][3],$C[2]
xor $A[2][4],$A[2][4],$C[3]
xor $A[2][2],$A[2][2],$C[0]
andc $C[0],$A[3][2],$A[3][1]
andc $C[1],$A[3][3],$A[3][2]
andc $C[2],$A[3][0],$A[3][4]
andc $C[3],$A[3][1],$A[3][0]
xor $A[3][0],$A[3][0],$C[0]
andc $C[0],$A[3][4],$A[3][3]
xor $A[3][1],$A[3][1],$C[1]
xor $A[3][3],$A[3][3],$C[2]
xor $A[3][4],$A[3][4],$C[3]
xor $A[3][2],$A[3][2],$C[0]
andc $C[0],$A[4][2],$A[4][1]
andc $C[1],$A[4][3],$A[4][2]
andc $C[2],$A[4][0],$A[4][4]
andc $C[3],$A[4][1],$A[4][0]
xor $A[4][0],$A[4][0],$C[0]
andc $C[0],$A[4][4],$A[4][3]
xor $A[4][1],$A[4][1],$C[1]
xor $A[4][3],$A[4][3],$C[2]
xor $A[4][4],$A[4][4],$C[3]
xor $A[4][2],$A[4][2],$C[0]
bdnz .Loop
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size KeccakF1600_int,.-KeccakF1600_int
.type KeccakF1600,\@function
.align 5
KeccakF1600:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
bl PICmeup
subi r12,r12,8 ; prepare for ldu
$PUSH r3,`$LOCALS+0*$SIZE_T`($sp)
;$PUSH r4,`$LOCALS+1*$SIZE_T`($sp)
;$PUSH r5,`$LOCALS+2*$SIZE_T`($sp)
;$PUSH r6,`$LOCALS+3*$SIZE_T`($sp)
$PUSH r12,`$LOCALS+4*$SIZE_T`($sp)
ld $A[0][0],`8*0`(r3) ; load A[5][5]
ld $A[0][1],`8*1`(r3)
ld $A[0][2],`8*2`(r3)
ld $A[0][3],`8*3`(r3)
ld $A[0][4],`8*4`(r3)
ld $A[1][0],`8*5`(r3)
ld $A[1][1],`8*6`(r3)
ld $A[1][2],`8*7`(r3)
ld $A[1][3],`8*8`(r3)
ld $A[1][4],`8*9`(r3)
ld $A[2][0],`8*10`(r3)
ld $A[2][1],`8*11`(r3)
ld $A[2][2],`8*12`(r3)
ld $A[2][3],`8*13`(r3)
ld $A[2][4],`8*14`(r3)
ld $A[3][0],`8*15`(r3)
ld $A[3][1],`8*16`(r3)
ld $A[3][2],`8*17`(r3)
ld $A[3][3],`8*18`(r3)
ld $A[3][4],`8*19`(r3)
ld $A[4][0],`8*20`(r3)
ld $A[4][1],`8*21`(r3)
ld $A[4][2],`8*22`(r3)
ld $A[4][3],`8*23`(r3)
ld $A[4][4],`8*24`(r3)
bl KeccakF1600_int
$POP r3,`$LOCALS+0*$SIZE_T`($sp)
std $A[0][0],`8*0`(r3) ; return A[5][5]
std $A[0][1],`8*1`(r3)
std $A[0][2],`8*2`(r3)
std $A[0][3],`8*3`(r3)
std $A[0][4],`8*4`(r3)
std $A[1][0],`8*5`(r3)
std $A[1][1],`8*6`(r3)
std $A[1][2],`8*7`(r3)
std $A[1][3],`8*8`(r3)
std $A[1][4],`8*9`(r3)
std $A[2][0],`8*10`(r3)
std $A[2][1],`8*11`(r3)
std $A[2][2],`8*12`(r3)
std $A[2][3],`8*13`(r3)
std $A[2][4],`8*14`(r3)
std $A[3][0],`8*15`(r3)
std $A[3][1],`8*16`(r3)
std $A[3][2],`8*17`(r3)
std $A[3][3],`8*18`(r3)
std $A[3][4],`8*19`(r3)
std $A[4][0],`8*20`(r3)
std $A[4][1],`8*21`(r3)
std $A[4][2],`8*22`(r3)
std $A[4][3],`8*23`(r3)
std $A[4][4],`8*24`(r3)
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,1,0
.long 0
.size KeccakF1600,.-KeccakF1600
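; dword_le_load assembles a 64-bit little-endian word one byte at a time,
; so the input may be arbitrarily aligned and the result does not depend
; on the endianness of the processor.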
.type dword_le_load,\@function
.align 5
dword_le_load:
lbzu r0,1(r3)
lbzu r4,1(r3)
lbzu r5,1(r3)
insrdi r0,r4,8,48
lbzu r4,1(r3)
insrdi r0,r5,8,40
lbzu r5,1(r3)
insrdi r0,r4,8,32
lbzu r4,1(r3)
insrdi r0,r5,8,24
lbzu r5,1(r3)
insrdi r0,r4,8,16
lbzu r4,1(r3)
insrdi r0,r5,8,8
insrdi r0,r4,8,0
blr
.long 0
.byte 0,12,0x14,0,0,0,1,0
.long 0
.size dword_le_load,.-dword_le_load
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 5
SHA3_absorb:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
bl PICmeup
subi r4,r4,1 ; prepare for lbzu
subi r12,r12,8 ; prepare for ldu
$PUSH r3,`$LOCALS+0*$SIZE_T`($sp) ; save A[][]
$PUSH r4,`$LOCALS+1*$SIZE_T`($sp) ; save inp
$PUSH r5,`$LOCALS+2*$SIZE_T`($sp) ; save len
$PUSH r6,`$LOCALS+3*$SIZE_T`($sp) ; save bsz
mr r0,r6
$PUSH r12,`$LOCALS+4*$SIZE_T`($sp)
ld $A[0][0],`8*0`(r3) ; load A[5][5]
ld $A[0][1],`8*1`(r3)
ld $A[0][2],`8*2`(r3)
ld $A[0][3],`8*3`(r3)
ld $A[0][4],`8*4`(r3)
ld $A[1][0],`8*5`(r3)
ld $A[1][1],`8*6`(r3)
ld $A[1][2],`8*7`(r3)
ld $A[1][3],`8*8`(r3)
ld $A[1][4],`8*9`(r3)
ld $A[2][0],`8*10`(r3)
ld $A[2][1],`8*11`(r3)
ld $A[2][2],`8*12`(r3)
ld $A[2][3],`8*13`(r3)
ld $A[2][4],`8*14`(r3)
ld $A[3][0],`8*15`(r3)
ld $A[3][1],`8*16`(r3)
ld $A[3][2],`8*17`(r3)
ld $A[3][3],`8*18`(r3)
ld $A[3][4],`8*19`(r3)
ld $A[4][0],`8*20`(r3)
ld $A[4][1],`8*21`(r3)
ld $A[4][2],`8*22`(r3)
ld $A[4][3],`8*23`(r3)
ld $A[4][4],`8*24`(r3)
mr r3,r4
mr r4,r5
mr r5,r0
b .Loop_absorb
.align 4
.Loop_absorb:
$UCMP r4,r5 ; len < bsz?
blt .Labsorbed
sub r4,r4,r5 ; len -= bsz
srwi r5,r5,3
$PUSH r4,`$LOCALS+2*$SIZE_T`($sp) ; save len
mtctr r5
bl dword_le_load ; *inp++
xor $A[0][0],$A[0][0],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[0][1],$A[0][1],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[0][2],$A[0][2],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[0][3],$A[0][3],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[0][4],$A[0][4],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[1][0],$A[1][0],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[1][1],$A[1][1],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[1][2],$A[1][2],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[1][3],$A[1][3],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[1][4],$A[1][4],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[2][0],$A[2][0],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[2][1],$A[2][1],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[2][2],$A[2][2],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[2][3],$A[2][3],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[2][4],$A[2][4],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[3][0],$A[3][0],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[3][1],$A[3][1],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[3][2],$A[3][2],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[3][3],$A[3][3],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[3][4],$A[3][4],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[4][0],$A[4][0],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[4][1],$A[4][1],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[4][2],$A[4][2],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[4][3],$A[4][3],r0
bdz .Lprocess_block
bl dword_le_load ; *inp++
xor $A[4][4],$A[4][4],r0
.Lprocess_block:
$PUSH r3,`$LOCALS+1*$SIZE_T`($sp) ; save inp
bl KeccakF1600_int
$POP r0,`$LOCALS+4*$SIZE_T`($sp) ; pull iotas[24]
$POP r5,`$LOCALS+3*$SIZE_T`($sp) ; restore bsz
$POP r4,`$LOCALS+2*$SIZE_T`($sp) ; restore len
$POP r3,`$LOCALS+1*$SIZE_T`($sp) ; restore inp
addic r0,r0,`-8*24` ; rewind iotas
$PUSH r0,`$LOCALS+4*$SIZE_T`($sp)
b .Loop_absorb
.align 4
.Labsorbed:
$POP r3,`$LOCALS+0*$SIZE_T`($sp)
std $A[0][0],`8*0`(r3) ; return A[5][5]
std $A[0][1],`8*1`(r3)
std $A[0][2],`8*2`(r3)
std $A[0][3],`8*3`(r3)
std $A[0][4],`8*4`(r3)
std $A[1][0],`8*5`(r3)
std $A[1][1],`8*6`(r3)
std $A[1][2],`8*7`(r3)
std $A[1][3],`8*8`(r3)
std $A[1][4],`8*9`(r3)
std $A[2][0],`8*10`(r3)
std $A[2][1],`8*11`(r3)
std $A[2][2],`8*12`(r3)
std $A[2][3],`8*13`(r3)
std $A[2][4],`8*14`(r3)
std $A[3][0],`8*15`(r3)
std $A[3][1],`8*16`(r3)
std $A[3][2],`8*17`(r3)
std $A[3][3],`8*18`(r3)
std $A[3][4],`8*19`(r3)
std $A[4][0],`8*20`(r3)
std $A[4][1],`8*21`(r3)
std $A[4][2],`8*22`(r3)
std $A[4][3],`8*23`(r3)
std $A[4][4],`8*24`(r3)
mr r3,r4 ; return value
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,4,0
.long 0
.size SHA3_absorb,.-SHA3_absorb
___
{
my ($A_flat,$out,$len,$bsz) = map("r$_",(28..31));
$code.=<<___;
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 5
SHA3_squeeze:
$STU $sp,`-10*$SIZE_T`($sp)
mflr r0
$PUSH r28,`6*$SIZE_T`($sp)
$PUSH r29,`7*$SIZE_T`($sp)
$PUSH r30,`8*$SIZE_T`($sp)
$PUSH r31,`9*$SIZE_T`($sp)
$PUSH r0,`10*$SIZE_T+$LRSAVE`($sp)
mr $A_flat,r3
subi r3,r3,8 ; prepare for ldu
subi $out,r4,1 ; prepare for stbu
mr $len,r5
mr $bsz,r6
b .Loop_squeeze
.align 4
.Loop_squeeze:
ldu r0,8(r3)
${UCMP}i $len,8
blt .Lsqueeze_tail
stbu r0,1($out)
srdi r0,r0,8
stbu r0,1($out)
srdi r0,r0,8
stbu r0,1($out)
srdi r0,r0,8
stbu r0,1($out)
srdi r0,r0,8
stbu r0,1($out)
srdi r0,r0,8
stbu r0,1($out)
srdi r0,r0,8
stbu r0,1($out)
srdi r0,r0,8
stbu r0,1($out)
subic. $len,$len,8
beq .Lsqueeze_done
subic. r6,r6,8
bgt .Loop_squeeze
mr r3,$A_flat
bl KeccakF1600
subi r3,$A_flat,8 ; prepare for ldu
mr r6,$bsz
b .Loop_squeeze
.align 4
.Lsqueeze_tail:
mtctr $len
.Loop_tail:
stbu r0,1($out)
srdi r0,r0,8
bdnz .Loop_tail
.Lsqueeze_done:
$POP r0,`10*$SIZE_T+$LRSAVE`($sp)
$POP r28,`6*$SIZE_T`($sp)
$POP r29,`7*$SIZE_T`($sp)
$POP r30,`8*$SIZE_T`($sp)
$POP r31,`9*$SIZE_T`($sp)
mtlr r0
addi $sp,$sp,`10*$SIZE_T`
blr
.long 0
.byte 0,12,4,1,0x80,4,4,0
.long 0
.size SHA3_squeeze,.-SHA3_squeeze
___
}
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
$code.=<<___;
.align 6
PICmeup:
mflr r0
bcl 20,31,\$+4
mflr r12 ; vvvvvv "distance" between . and 1st data entry
addi r12,r12,`64-8`
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
.type iotas,\@object
iotas:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas,.-iotas
.asciz "Keccak-1600 absorb and squeeze for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
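For orientation, the absorb loop above (dword_le_load plus the XOR chain into A[][]) follows the generic pattern of sha/keccak1600.c: XOR one rate-sized block into the state lane by lane, permute, and hand back the number of unprocessed bytes. Below is a minimal C sketch of that control flow, with illustrative names only; le64_load stands in for what dword_le_load does byte by byte, and it is not part of this module.

#include <stddef.h>
#include <stdint.h>

void KeccakF1600(uint64_t A[25]);               /* the permutation above */

/* Byte-by-byte little-endian lane load, the portable equivalent of
   dword_le_load (same result on big- and little-endian hosts). */
static uint64_t le64_load(const unsigned char *p)
{
    uint64_t v = 0;
    for (size_t i = 0; i < 8; i++)
        v |= (uint64_t)p[i] << (8 * i);
    return v;
}

/* Sketch of the absorb control flow: consume whole r-byte blocks and
   return how many bytes are left for the caller to buffer. */
size_t sha3_absorb_sketch(uint64_t A[25], const unsigned char *inp,
                          size_t len, size_t r)
{
    while (len >= r) {
        for (size_t i = 0; i < r / 8; i++)
            A[i] ^= le64_load(inp + 8 * i);
        KeccakF1600(A);
        inp += r;
        len -= r;
    }
    return len;
}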


@ -0,0 +1,560 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for s390x.
#
# June 2017.
#
# Below code is [lane complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
# instead of actually unrolling the loop pair-wise I simply flip
# pointers to T[][] and A[][] at the end of each round. Since the number of
# rounds is even, the last round writes to A[][] and everything works out.
# In a nutshell it's a transliteration of the x86_64 module, because both
# architectures have similar capabilities/limitations. Performance
# measurement is problematic as I don't have access to an idle system.
# It looks like z13 processes one byte [out of long message] in ~14
# cycles. At least the result is consistent with an estimate based on
# the number of instructions and the assumed instruction issue rate. It's ~2.5x
# faster than compiler-generated code.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20));
my @C = map("%r$_",(0,1,5..7));
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my ($src,$dst,$iotas) = map("%r$_",(2..4));
my $sp = "%r15";
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+25*8;
my @rhotates = ([ 0, 1, 62, 28, 27 ],
[ 36, 44, 6, 55, 20 ],
[ 3, 10, 43, 25, 39 ],
[ 41, 45, 15, 21, 8 ],
[ 18, 2, 61, 56, 14 ]);
{ my @C = @C; # copy, because we mess them up...
my @D = @D;
$code.=<<___;
.text
.type __KeccakF1600,\@function
.align 32
__KeccakF1600:
st${g} %r14,$SIZE_T*14($sp)
lg @C[0],$A[4][0]($src)
lg @C[1],$A[4][1]($src)
lg @C[2],$A[4][2]($src)
lg @C[3],$A[4][3]($src)
lg @C[4],$A[4][4]($src)
larl $iotas,iotas
j .Loop
.align 16
.Loop:
lg @D[0],$A[0][0]($src)
lg @D[1],$A[1][1]($src)
lg @D[2],$A[2][2]($src)
lg @D[3],$A[3][3]($src)
xgr @C[0],@D[0]
xg @C[1],$A[0][1]($src)
xg @C[2],$A[0][2]($src)
xg @C[3],$A[0][3]($src)
lgr @D[4],@C[4]
xg @C[4],$A[0][4]($src)
xg @C[0],$A[1][0]($src)
xgr @C[1],@D[1]
xg @C[2],$A[1][2]($src)
xg @C[3],$A[1][3]($src)
xg @C[4],$A[1][4]($src)
xg @C[0],$A[2][0]($src)
xg @C[1],$A[2][1]($src)
xgr @C[2],@D[2]
xg @C[3],$A[2][3]($src)
xg @C[4],$A[2][4]($src)
xg @C[0],$A[3][0]($src)
xg @C[1],$A[3][1]($src)
xg @C[2],$A[3][2]($src)
xgr @C[3],@D[3]
xg @C[4],$A[3][4]($src)
lgr @T[0],@C[2]
rllg @C[2],@C[2],1
xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0]
rllg @C[0],@C[0],1
xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3]
rllg @C[3],@C[3],1
xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1]
rllg @C[1],@C[1],1
xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4]
rllg @C[4],@C[4],1
xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2]
___
(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
xgr @C[1],@D[1]
xgr @C[2],@D[2]
xgr @C[3],@D[3]
rllg @C[1],@C[1],$rhotates[1][1]
xgr @C[4],@D[4]
rllg @C[2],@C[2],$rhotates[2][2]
xgr @C[0],@D[0]
lgr @T[0],@C[1]
ogr @C[1],@C[2]
rllg @C[3],@C[3],$rhotates[3][3]
xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2])
rllg @C[4],@C[4],$rhotates[4][4]
xg @C[1],0($iotas)
la $iotas,8($iotas)
stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
lgr @T[1],@C[4]
ngr @C[4],@C[3]
lghi @C[1],-1 # no 'not' instruction :-(
xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3])
xgr @C[2],@C[1] # not @C[2]
stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3])
ogr @C[2],@C[3]
xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3])
ngr @T[0],@C[0]
stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3])
xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0])
ogr @T[1],@C[0]
stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0])
xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0])
stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0])
lg @C[0],$A[0][3]($src)
lg @C[4],$A[4][2]($src)
lg @C[3],$A[3][1]($src)
lg @C[1],$A[1][4]($src)
lg @C[2],$A[2][0]($src)
xgr @C[0],@D[3]
xgr @C[4],@D[2]
rllg @C[0],@C[0],$rhotates[0][3]
xgr @C[3],@D[1]
rllg @C[4],@C[4],$rhotates[4][2]
xgr @C[1],@D[4]
rllg @C[3],@C[3],$rhotates[3][1]
xgr @C[2],@D[0]
lgr @T[0],@C[0]
ogr @C[0],@C[4]
rllg @C[1],@C[1],$rhotates[1][4]
xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4])
rllg @C[2],@C[2],$rhotates[2][0]
stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4])
lgr @T[1],@C[1]
ngr @C[1],@T[0]
lghi @C[0],-1 # no 'not' instruction :-(
xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0])
xgr @C[4],@C[0] # not @C[4]
stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0])
ogr @C[4],@C[3]
xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3])
ngr @C[3],@C[2]
stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3])
xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2])
ogr @T[1],@C[2]
stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2])
xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2])
stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2])
lg @C[2],$A[2][3]($src)
lg @C[3],$A[3][4]($src)
lg @C[1],$A[1][2]($src)
lg @C[4],$A[4][0]($src)
lg @C[0],$A[0][1]($src)
xgr @C[2],@D[3]
xgr @C[3],@D[4]
rllg @C[2],@C[2],$rhotates[2][3]
xgr @C[1],@D[2]
rllg @C[3],@C[3],$rhotates[3][4]
xgr @C[4],@D[0]
rllg @C[1],@C[1],$rhotates[1][2]
xgr @C[0],@D[1]
lgr @T[0],@C[2]
ngr @C[2],@C[3]
rllg @C[4],@C[4],$rhotates[4][0]
xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3])
lghi @T[1],-1 # no 'not' instruction :-(
stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3])
xgr @C[3],@T[1] # not @C[3]
lgr @T[1],@C[4]
ngr @C[4],@C[3]
rllg @C[0],@C[0],$rhotates[0][1]
xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3])
ogr @T[0],@C[1]
stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1])
ngr @C[1],@C[0]
stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1])
xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0])
ogr @C[0],@T[1]
stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0])
xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4])
stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
lg @C[2],$A[2][1]($src)
lg @C[3],$A[3][2]($src)
lg @C[1],$A[1][0]($src)
lg @C[4],$A[4][3]($src)
lg @C[0],$A[0][4]($src)
xgr @C[2],@D[1]
xgr @C[3],@D[2]
rllg @C[2],@C[2],$rhotates[2][1]
xgr @C[1],@D[0]
rllg @C[3],@C[3],$rhotates[3][2]
xgr @C[4],@D[3]
rllg @C[1],@C[1],$rhotates[1][0]
xgr @C[0],@D[4]
rllg @C[4],@C[4],$rhotates[4][3]
lgr @T[0],@C[2]
ogr @C[2],@C[3]
lghi @T[1],-1 # no 'not' instruction :-(
xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3])
xgr @C[3],@T[1] # not @C[3]
stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3])
lgr @T[1],@C[4]
ogr @C[4],@C[3]
rllg @C[0],@C[0],$rhotates[0][4]
xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3])
ngr @T[0],@C[1]
stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1])
ogr @C[1],@C[0]
stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1])
xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0])
ngr @C[0],@T[1]
stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0])
xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4])
stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
xg @D[2],$A[0][2]($src)
xg @D[3],$A[1][3]($src)
xg @D[1],$A[4][1]($src)
xg @D[4],$A[2][4]($src)
xgr $dst,$src # xchg $dst,$src
rllg @D[2],@D[2],$rhotates[0][2]
xg @D[0],$A[3][0]($src)
rllg @D[3],@D[3],$rhotates[1][3]
xgr $src,$dst
rllg @D[1],@D[1],$rhotates[4][1]
xgr $dst,$src
rllg @D[4],@D[4],$rhotates[2][4]
___
@C = @D[2..4,0,1];
$code.=<<___;
lgr @T[0],@C[0]
ngr @C[0],@C[1]
lghi @T[1],-1 # no 'not' instruction :-(
xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1])
xgr @C[1],@T[1] # not @C[1]
stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1])
lgr @T[1],@C[2]
ngr @C[2],@C[1]
rllg @D[0],@D[0],$rhotates[3][0]
xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1])
ogr @T[0],@C[4]
stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4])
ngr @C[4],@C[3]
stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4])
xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3])
ogr @C[3],@T[1]
stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3])
xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3])
lgr @C[1],@C[0] # harmonize with the loop top
lgr @C[0],@T[0]
stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
tmll $iotas,255
jnz .Loop
l${g} %r14,$SIZE_T*14($sp)
br %r14
.size __KeccakF1600,.-__KeccakF1600
___
}
{
$code.=<<___;
.type KeccakF1600,\@function
.align 32
KeccakF1600:
.LKeccakF1600:
lghi %r1,-$frame
stm${g} %r6,%r15,$SIZE_T*6($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
lghi @D[0],-1 # no 'not' instruction :-(
lghi @D[1],-1
lghi @D[2],-1
lghi @D[3],-1
lghi @D[4],-1
lghi @T[0],-1
xg @D[0],$A[0][1]($src)
xg @D[1],$A[0][2]($src)
xg @D[2],$A[1][3]($src)
xg @D[3],$A[2][2]($src)
xg @D[4],$A[3][2]($src)
xg @T[0],$A[4][0]($src)
stmg @D[0],@D[1],$A[0][1]($src)
stg @D[2],$A[1][3]($src)
stg @D[3],$A[2][2]($src)
stg @D[4],$A[3][2]($src)
stg @T[0],$A[4][0]($src)
la $dst,$stdframe($sp)
bras %r14,__KeccakF1600
lghi @D[0],-1 # no 'not' instruction :-(
lghi @D[1],-1
lghi @D[2],-1
lghi @D[3],-1
lghi @D[4],-1
lghi @T[0],-1
xg @D[0],$A[0][1]($src)
xg @D[1],$A[0][2]($src)
xg @D[2],$A[1][3]($src)
xg @D[3],$A[2][2]($src)
xg @D[4],$A[3][2]($src)
xg @T[0],$A[4][0]($src)
stmg @D[0],@D[1],$A[0][1]($src)
stg @D[2],$A[1][3]($src)
stg @D[3],$A[2][2]($src)
stg @D[4],$A[3][2]($src)
stg @T[0],$A[4][0]($src)
lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
br %r14
.size KeccakF1600,.-KeccakF1600
___
}
{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5));
$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 32
SHA3_absorb:
lghi %r1,-$frame
stm${g} %r5,%r15,$SIZE_T*5($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
lghi @D[0],-1 # no 'not' instruction :-(
lghi @D[1],-1
lghi @D[2],-1
lghi @D[3],-1
lghi @D[4],-1
lghi @T[0],-1
xg @D[0],$A[0][1]($src)
xg @D[1],$A[0][2]($src)
xg @D[2],$A[1][3]($src)
xg @D[3],$A[2][2]($src)
xg @D[4],$A[3][2]($src)
xg @T[0],$A[4][0]($src)
stmg @D[0],@D[1],$A[0][1]($src)
stg @D[2],$A[1][3]($src)
stg @D[3],$A[2][2]($src)
stg @D[4],$A[3][2]($src)
stg @T[0],$A[4][0]($src)
.Loop_absorb:
cl${g}r $len,$bsz
jl .Ldone_absorb
srl${g} $bsz,3
la %r1,0($A_flat)
.Lblock_absorb:
lrvg %r0,0($inp)
la $inp,8($inp)
xg %r0,0(%r1)
a${g}hi $len,-8
stg %r0,0(%r1)
la %r1,8(%r1)
brct $bsz,.Lblock_absorb
stm${g} $inp,$len,$frame+3*$SIZE_T($sp)
la $dst,$stdframe($sp)
bras %r14,__KeccakF1600
lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp)
j .Loop_absorb
.align 16
.Ldone_absorb:
lghi @D[0],-1 # no 'not' instruction :-(
lghi @D[1],-1
lghi @D[2],-1
lghi @D[3],-1
lghi @D[4],-1
lghi @T[0],-1
xg @D[0],$A[0][1]($src)
xg @D[1],$A[0][2]($src)
xg @D[2],$A[1][3]($src)
xg @D[3],$A[2][2]($src)
xg @D[4],$A[3][2]($src)
xg @T[0],$A[4][0]($src)
stmg @D[0],@D[1],$A[0][1]($src)
stg @D[2],$A[1][3]($src)
stg @D[3],$A[2][2]($src)
stg @D[4],$A[3][2]($src)
stg @T[0],$A[4][0]($src)
lgr %r2,$len # return value
lm${g} %r6,%r15,$frame+6*$SIZE_T($sp)
br %r14
.size SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5));
$code.=<<___;
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 32
SHA3_squeeze:
srl${g} $bsz,3
st${g} %r14,2*$SIZE_T($sp)
lghi %r14,8
st${g} $bsz,5*$SIZE_T($sp)
la %r1,0($A_flat)
j .Loop_squeeze
.align 16
.Loop_squeeze:
cl${g}r $len,%r14
jl .Ltail_squeeze
lrvg %r0,0(%r1)
la %r1,8(%r1)
stg %r0,0($out)
la $out,8($out)
a${g}hi $len,-8 # len -= 8
jz .Ldone_squeeze
brct $bsz,.Loop_squeeze # bsz--
stm${g} $out,$len,3*$SIZE_T($sp)
bras %r14,.LKeccakF1600
lm${g} $out,$bsz,3*$SIZE_T($sp)
lghi %r14,8
la %r1,0($A_flat)
j .Loop_squeeze
.Ltail_squeeze:
lg %r0,0(%r1)
.Loop_tail_squeeze:
stc %r0,0($out)
la $out,1($out)
srlg %r0,8
brct $len,.Loop_tail_squeeze
.Ldone_squeeze:
l${g} %r14,2*$SIZE_T($sp)
br %r14
.size SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align 256
.quad 0,0,0,0,0,0,0,0
.type iotas,\@object
iotas:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas,.-iotas
.asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
# unlike the 32-bit shift, the 64-bit one takes three arguments,
# e.g. the tail loop's "srlg %r0,8" becomes "srlg %r0,%r0,8"
$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
print $code;
close STDOUT;
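The KECCAK_2X remark at the top of this module, flipping the T[][] and A[][] pointers at the end of each round instead of copying, can be pictured in C roughly as follows. keccak_round is a hypothetical helper that computes one full round from src into dst; it is not a function of this module.

#include <stdint.h>

/* Hypothetical single-round helper: reads src, writes dst, applies iota. */
void keccak_round(uint64_t dst[25], const uint64_t src[25], uint64_t iota);

/* Sketch of the pointer-flipping idea: each round writes the "other"
   buffer, and because 24 is even the final round lands back in A. */
void keccak_2x_sketch(uint64_t A[25], const uint64_t iotas[24])
{
    uint64_t T[25];
    uint64_t *src = A, *dst = T;

    for (int i = 0; i < 24; i++) {
        keccak_round(dst, src, iotas[i]);
        uint64_t *tmp = src;            /* flip roles for the next round */
        src = dst;
        dst = tmp;
    }
    /* the odd-numbered rounds wrote A, so no copy back is needed */
}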


@ -0,0 +1,607 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# Below code is [lane complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
# instead of actually unrolling the loop pair-wise I simply flip
# pointers to T[][] and A[][] at the end of each round. Since the number of
# rounds is even, the last round writes to A[][] and everything works out.
# How does it compare to the x86_64 assembly module in the Keccak Code Package?
# Depending on processor it's either as fast or faster by up to 15%...
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
# r=1088(*)
#
# P4 25.8
# Core 2 12.9
# Westmere 13.7
# Sandy Bridge 12.9(**)
# Haswell 9.6
# Skylake 9.4
# Silvermont 22.8
# Goldmont 15.8
# VIA Nano 17.3
# Sledgehammer 13.3
# Bulldozer 16.5
# Ryzen 8.8
#
# (*) Corresponds to SHA3-256. Improvement over compiler-generated code
# varies a lot; the most common coefficient is 15% in comparison to
# gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**) Sandy Bridge has a broken rotate instruction. Performance can be
# improved by 14% by replacing rotates with a double-precision
# shift with the same register as source and destination.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";
my @rhotates = ([ 0, 1, 62, 28, 27 ],
[ 36, 44, 6, 55, 20 ],
[ 3, 10, 43, 25, 39 ],
[ 41, 45, 15, 21, 8 ],
[ 18, 2, 61, 56, 14 ]);
$code.=<<___;
.text
.type __KeccakF1600,\@abi-omnipotent
.align 32
__KeccakF1600:
mov $A[4][0](%rdi),@C[0]
mov $A[4][1](%rdi),@C[1]
mov $A[4][2](%rdi),@C[2]
mov $A[4][3](%rdi),@C[3]
mov $A[4][4](%rdi),@C[4]
jmp .Loop
.align 32
.Loop:
mov $A[0][0](%rdi),@D[0]
mov $A[1][1](%rdi),@D[1]
mov $A[2][2](%rdi),@D[2]
mov $A[3][3](%rdi),@D[3]
xor $A[0][2](%rdi),@C[2]
xor $A[0][3](%rdi),@C[3]
xor @D[0], @C[0]
xor $A[0][1](%rdi),@C[1]
xor $A[1][2](%rdi),@C[2]
xor $A[1][0](%rdi),@C[0]
mov @C[4],@D[4]
xor $A[0][4](%rdi),@C[4]
xor @D[2], @C[2]
xor $A[2][0](%rdi),@C[0]
xor $A[1][3](%rdi),@C[3]
xor @D[1], @C[1]
xor $A[1][4](%rdi),@C[4]
xor $A[3][2](%rdi),@C[2]
xor $A[3][0](%rdi),@C[0]
xor $A[2][3](%rdi),@C[3]
xor $A[2][1](%rdi),@C[1]
xor $A[2][4](%rdi),@C[4]
mov @C[2],@T[0]
rol \$1,@C[2]
xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
xor @D[3], @C[3]
rol \$1,@C[0]
xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
xor $A[3][1](%rdi),@C[1]
rol \$1,@C[3]
xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
xor $A[3][4](%rdi),@C[4]
rol \$1,@C[1]
xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
rol \$1,@C[4]
xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
___
(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
xor @D[1],@C[1]
xor @D[2],@C[2]
rol \$$rhotates[1][1],@C[1]
xor @D[3],@C[3]
xor @D[4],@C[4]
rol \$$rhotates[2][2],@C[2]
xor @D[0],@C[0]
mov @C[1],@T[0]
rol \$$rhotates[3][3],@C[3]
or @C[2],@C[1]
xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
rol \$$rhotates[4][4],@C[4]
xor ($iotas),@C[1]
lea 8($iotas),$iotas
mov @C[4],@T[1]
and @C[3],@C[4]
mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
not @C[2]
mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
or @C[3],@C[2]
mov $A[4][2](%rdi),@C[4]
xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
and @C[0],@T[0]
mov $A[1][4](%rdi),@C[1]
xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
mov $A[2][0](%rdi),@C[2]
mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
or @C[0],@T[1]
mov $A[0][3](%rdi),@C[0]
xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
mov $A[3][1](%rdi),@C[3]
mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
xor @D[3],@C[0]
xor @D[2],@C[4]
rol \$$rhotates[0][3],@C[0]
xor @D[1],@C[3]
xor @D[4],@C[1]
rol \$$rhotates[4][2],@C[4]
rol \$$rhotates[3][1],@C[3]
xor @D[0],@C[2]
rol \$$rhotates[1][4],@C[1]
mov @C[0],@T[0]
or @C[4],@C[0]
rol \$$rhotates[2][0],@C[2]
xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
mov @C[1],@T[1]
and @T[0],@C[1]
mov $A[0][1](%rdi),@C[0]
xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
not @C[4]
mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
or @C[3],@C[4]
mov $A[1][2](%rdi),@C[1]
xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
and @C[2],@C[3]
mov $A[4][0](%rdi),@C[4]
xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
or @C[2],@T[1]
mov $A[2][3](%rdi),@C[2]
xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
mov $A[3][4](%rdi),@C[3]
mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
xor @D[3],@C[2]
xor @D[4],@C[3]
rol \$$rhotates[2][3],@C[2]
xor @D[2],@C[1]
rol \$$rhotates[3][4],@C[3]
xor @D[0],@C[4]
rol \$$rhotates[1][2],@C[1]
xor @D[1],@C[0]
rol \$$rhotates[4][0],@C[4]
mov @C[2],@T[0]
and @C[3],@C[2]
rol \$$rhotates[0][1],@C[0]
not @C[3]
xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
mov @C[4],@T[1]
and @C[3],@C[4]
mov $A[2][1](%rdi),@C[2]
xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
or @C[1],@T[0]
mov $A[4][3](%rdi),@C[4]
xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
and @C[0],@C[1]
xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
or @C[0],@T[1]
mov $A[1][0](%rdi),@C[1]
xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
mov $A[3][2](%rdi),@C[3]
mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
mov $A[0][4](%rdi),@C[0]
xor @D[1],@C[2]
xor @D[2],@C[3]
rol \$$rhotates[2][1],@C[2]
xor @D[0],@C[1]
rol \$$rhotates[3][2],@C[3]
xor @D[3],@C[4]
rol \$$rhotates[1][0],@C[1]
xor @D[4],@C[0]
rol \$$rhotates[4][3],@C[4]
mov @C[2],@T[0]
or @C[3],@C[2]
rol \$$rhotates[0][4],@C[0]
not @C[3]
xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
mov @C[4],@T[1]
or @C[3],@C[4]
xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
and @C[1],@T[0]
xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
or @C[0],@C[1]
xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
and @T[1],@C[0]
xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
xor $A[0][2](%rdi),@D[2]
xor $A[1][3](%rdi),@D[3]
rol \$$rhotates[0][2],@D[2]
xor $A[4][1](%rdi),@D[1]
rol \$$rhotates[1][3],@D[3]
xor $A[2][4](%rdi),@D[4]
rol \$$rhotates[4][1],@D[1]
xor $A[3][0](%rdi),@D[0]
xchg %rsi,%rdi
rol \$$rhotates[2][4],@D[4]
rol \$$rhotates[3][0],@D[0]
___
@C = @D[2..4,0,1];
$code.=<<___;
mov @C[0],@T[0]
and @C[1],@C[0]
not @C[1]
xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
mov @C[2],@T[1]
and @C[1],@C[2]
xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
or @C[4],@T[0]
xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
and @C[3],@C[4]
xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
or @T[1],@C[3]
xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
mov @C[0],@C[1] # harmonize with the loop top
mov @T[0],@C[0]
test \$255,$iotas
jnz .Loop
lea -192($iotas),$iotas # rewind iotas
ret
.size __KeccakF1600,.-__KeccakF1600
.type KeccakF1600,\@abi-omnipotent
.align 32
KeccakF1600:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
lea 100(%rdi),%rdi # size optimization
sub \$200,%rsp
.cfi_adjust_cfa_offset 200
notq $A[0][1](%rdi)
notq $A[0][2](%rdi)
notq $A[1][3](%rdi)
notq $A[2][2](%rdi)
notq $A[3][2](%rdi)
notq $A[4][0](%rdi)
lea iotas(%rip),$iotas
lea 100(%rsp),%rsi # size optimization
call __KeccakF1600
notq $A[0][1](%rdi)
notq $A[0][2](%rdi)
notq $A[1][3](%rdi)
notq $A[2][2](%rdi)
notq $A[3][2](%rdi)
notq $A[4][0](%rdi)
lea -100(%rdi),%rdi # preserve A[][]
add \$200,%rsp
.cfi_adjust_cfa_offset -200
pop %r15
.cfi_pop %r15
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
pop %rbp
.cfi_pop %rbp
pop %rbx
.cfi_pop %rbx
ret
.cfi_endproc
.size KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function,4
.align 32
SHA3_absorb:
.cfi_startproc
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
lea 100(%rdi),%rdi # size optimization
sub \$232,%rsp
.cfi_adjust_cfa_offset 232
mov %rsi,$inp
lea 100(%rsp),%rsi # size optimization
notq $A[0][1](%rdi)
notq $A[0][2](%rdi)
notq $A[1][3](%rdi)
notq $A[2][2](%rdi)
notq $A[3][2](%rdi)
notq $A[4][0](%rdi)
lea iotas(%rip),$iotas
mov $bsz,216-100(%rsi) # save bsz
.Loop_absorb:
cmp $bsz,$len
jc .Ldone_absorb
shr \$3,$bsz
lea -100(%rdi),$A_flat
.Lblock_absorb:
mov ($inp),%rax
lea 8($inp),$inp
xor ($A_flat),%rax
lea 8($A_flat),$A_flat
sub \$8,$len
mov %rax,-8($A_flat)
sub \$1,$bsz
jnz .Lblock_absorb
mov $inp,200-100(%rsi) # save inp
mov $len,208-100(%rsi) # save len
call __KeccakF1600
mov 200-100(%rsi),$inp # pull inp
mov 208-100(%rsi),$len # pull len
mov 216-100(%rsi),$bsz # pull bsz
jmp .Loop_absorb
.align 32
.Ldone_absorb:
mov $len,%rax # return value
notq $A[0][1](%rdi)
notq $A[0][2](%rdi)
notq $A[1][3](%rdi)
notq $A[2][2](%rdi)
notq $A[3][2](%rdi)
notq $A[4][0](%rdi)
add \$232,%rsp
.cfi_adjust_cfa_offset -232
pop %r15
.cfi_pop %r15
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
pop %rbp
.cfi_pop %rbp
pop %rbx
.cfi_pop %rbx
ret
.cfi_endproc
.size SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
($out,$len,$bsz) = ("%r12","%r13","%r14");
$code.=<<___;
.globl SHA3_squeeze
.type SHA3_squeeze,\@function,4
.align 32
SHA3_squeeze:
.cfi_startproc
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
shr \$3,%rcx
mov $A_flat,%r8
mov %rsi,$out
mov %rdx,$len
mov %rcx,$bsz
jmp .Loop_squeeze
.align 32
.Loop_squeeze:
cmp \$8,$len
jb .Ltail_squeeze
mov (%r8),%rax
lea 8(%r8),%r8
mov %rax,($out)
lea 8($out),$out
sub \$8,$len # len -= 8
jz .Ldone_squeeze
sub \$1,%rcx # bsz--
jnz .Loop_squeeze
call KeccakF1600
mov $A_flat,%r8
mov $bsz,%rcx
jmp .Loop_squeeze
.Ltail_squeeze:
mov %r8, %rsi
mov $out,%rdi
mov $len,%rcx
.byte 0xf3,0xa4 # rep movsb
.Ldone_squeeze:
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
ret
.cfi_endproc
.size SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align 256
.quad 0,0,0,0,0,0,0,0
.type iotas,\@object
iotas:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808a
.quad 0x8000000080008000
.quad 0x000000000000808b
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008a
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000a
.quad 0x000000008000808b
.quad 0x800000000000008b
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800a
.quad 0x800000008000000a
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
.size iotas,.-iotas
.asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
# Below replacement results in 11.2 on Sandy Bridge, 9.4 on
# Haswell, but it hurts other processors by up to 2-3-4x...
#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
# Below replacement results in 9.3 on Haswell [as well as
# on Ryzen, i.e. it *hurts* Ryzen]...
#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
print $_, "\n";
}
close STDOUT;
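SHA3_squeeze above interleaves byte stores with the bsz/len bookkeeping; stripped of the unrolling and the rep-movsb tail, its control flow is roughly the sketch below. It assumes a little-endian host, where the state lanes can be copied out directly, and the names are illustrative rather than the module's.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

void KeccakF1600(uint64_t A[25]);          /* the permutation above */

/* Sketch of the squeeze control flow: emit up to bsz bytes of the state,
   permute again only if more output is still required. */
void sha3_squeeze_sketch(uint64_t A[25], unsigned char *out,
                         size_t len, size_t bsz)
{
    while (len != 0) {
        size_t n = len < bsz ? len : bsz;
        memcpy(out, A, n);                 /* valid on little-endian hosts */
        out += n;
        len -= n;
        if (len != 0)
            KeccakF1600(A);
    }
}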


@ -0,0 +1,850 @@
#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PowerISA 2.07.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT SIMD implementation, but with
# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
# A POWER8 processor spends 9.8 cycles to process a byte out of a large
# buffer for r=1088, which matches SHA3-256. This is 17% better than the
# scalar PPC64 code. It should be noted that if POWER8's successor can
# achieve a higher scalar instruction issue rate, then this module will
# lose... And it does on POWER9, with 12.0 vs. 9.4.
$flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
$UCMP ="cmplw";
} else { die "nonsense $flavour"; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
my $sp ="r1";
my $iotas = "r12";
########################################################################
# Register layout:
#
# v0 A[0][0] A[1][0]
# v1 A[0][1] A[1][1]
# v2 A[0][2] A[1][2]
# v3 A[0][3] A[1][3]
# v4 A[0][4] A[1][4]
#
# v5 A[2][0] A[3][0]
# v6 A[2][1] A[3][1]
# v7 A[2][2] A[3][2]
# v8 A[2][3] A[3][3]
# v9 A[2][4] A[3][4]
#
# v10 A[4][0] A[4][1]
# v11 A[4][2] A[4][3]
# v12 A[4][4] A[4][4]
#
# v13..25 rhotates[][]
# v26..31 volatile
#
$code.=<<___;
.machine "any"
.text
.type KeccakF1600_int,\@function
.align 5
KeccakF1600_int:
li r0,24
mtctr r0
li r0,0
b .Loop
.align 4
.Loop:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
vxor v26,v0, v5 ; A[0..1][0]^A[2..3][0]
vxor v27,v1, v6 ; A[0..1][1]^A[2..3][1]
vxor v28,v2, v7 ; A[0..1][2]^A[2..3][2]
vxor v29,v3, v8 ; A[0..1][3]^A[2..3][3]
vxor v30,v4, v9 ; A[0..1][4]^A[2..3][4]
vpermdi v31,v26,v27,0b00 ; A[0][0..1]^A[2][0..1]
vpermdi v26,v26,v27,0b11 ; A[1][0..1]^A[3][0..1]
vpermdi v27,v28,v29,0b00 ; A[0][2..3]^A[2][2..3]
vpermdi v28,v28,v29,0b11 ; A[1][2..3]^A[3][2..3]
vpermdi v29,v30,v30,0b10 ; A[1..0][4]^A[3..2][4]
vxor v26,v26,v31 ; C[0..1]
vxor v27,v27,v28 ; C[2..3]
vxor v28,v29,v30 ; C[4..4]
vspltisb v31,1
vxor v26,v26,v10 ; C[0..1] ^= A[4][0..1]
vxor v27,v27,v11 ; C[2..3] ^= A[4][2..3]
vxor v28,v28,v12 ; C[4..4] ^= A[4][4..4], low!
vrld v29,v26,v31 ; ROL64(C[0..1],1)
vrld v30,v27,v31 ; ROL64(C[2..3],1)
vrld v31,v28,v31 ; ROL64(C[4..4],1)
vpermdi v31,v31,v29,0b10
vxor v26,v26,v30 ; C[0..1] ^= ROL64(C[2..3],1)
vxor v27,v27,v31 ; C[2..3] ^= ROL64(C[4..0],1)
vxor v28,v28,v29 ; C[4..4] ^= ROL64(C[0..1],1), low!
vpermdi v29,v26,v26,0b00 ; C[0..0]
vpermdi v30,v28,v26,0b10 ; C[4..0]
vpermdi v31,v28,v28,0b11 ; C[4..4]
vxor v1, v1, v29 ; A[0..1][1] ^= C[0..0]
vxor v6, v6, v29 ; A[2..3][1] ^= C[0..0]
vxor v10,v10,v30 ; A[4][0..1] ^= C[4..0]
vxor v0, v0, v31 ; A[0..1][0] ^= C[4..4]
vxor v5, v5, v31 ; A[2..3][0] ^= C[4..4]
vpermdi v29,v27,v27,0b00 ; C[2..2]
vpermdi v30,v26,v26,0b11 ; C[1..1]
vpermdi v31,v26,v27,0b10 ; C[1..2]
vxor v3, v3, v29 ; A[0..1][3] ^= C[2..2]
vxor v8, v8, v29 ; A[2..3][3] ^= C[2..2]
vxor v2, v2, v30 ; A[0..1][2] ^= C[1..1]
vxor v7, v7, v30 ; A[2..3][2] ^= C[1..1]
vxor v11,v11,v31 ; A[4][2..3] ^= C[1..2]
vpermdi v29,v27,v27,0b11 ; C[3..3]
vxor v4, v4, v29 ; A[0..1][4] ^= C[3..3]
vxor v9, v9, v29 ; A[2..3][4] ^= C[3..3]
vxor v12,v12,v29 ; A[4..4][4] ^= C[3..3]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
vrld v26,v0, v13 ; v0
vrld v1, v1, v14
vrld v27,v2, v15 ; v2
vrld v28,v3, v16 ; v3
vrld v4, v4, v17
vrld v5, v5, v18
vrld v6, v6, v19
vrld v29,v7, v20 ; v7
vrld v8, v8, v21
vrld v9, v9, v22
vrld v10,v10,v23
vrld v30,v11,v24 ; v11
vrld v12,v12,v25
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
vpermdi v0, v26,v28,0b00 ; [0][0] [1][0] < [0][0] [0][3]
vpermdi v2, v29,v5, 0b00 ; [0][2] [1][2] < [2][2] [2][0]
vpermdi v11,v9, v5, 0b01 ; [4][2] [4][3] < [2][4] [3][0]
vpermdi v5, v1, v4, 0b00 ; [2][0] [3][0] < [0][1] [0][4]
vpermdi v1, v1, v4, 0b11 ; [0][1] [1][1] < [1][1] [1][4]
vpermdi v3, v8, v6, 0b11 ; [0][3] [1][3] < [3][3] [3][1]
vpermdi v4, v12,v30,0b10 ; [0][4] [1][4] < [4][4] [4][2]
vpermdi v7, v8, v6, 0b00 ; [2][2] [3][2] < [2][3] [2][1]
vpermdi v6, v27,v26,0b11 ; [2][1] [3][1] < [1][2] [1][0]
vpermdi v8, v9, v29,0b11 ; [2][3] [3][3] < [3][4] [3][2]
vpermdi v12,v10,v10,0b11 ; [4][4] [4][4] < [4][1] [4][1]
vpermdi v9, v10,v30,0b01 ; [2][4] [3][4] < [4][0] [4][3]
vpermdi v10,v27,v28,0b01 ; [4][0] [4][1] < [0][2] [1][3]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
lvx_u v31,$iotas,r0 ; iotas[index]
addic r0,r0,16 ; index++
vandc v26,v2, v1 ; (~A[0..1][1] & A[0..1][2])
vandc v27,v3, v2 ; (~A[0..1][2] & A[0..1][3])
vandc v28,v4, v3 ; (~A[0..1][3] & A[0..1][4])
vandc v29,v0, v4 ; (~A[0..1][4] & A[0..1][0])
vandc v30,v1, v0 ; (~A[0..1][0] & A[0..1][1])
vxor v0, v0, v26 ; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
vxor v1, v1, v27 ; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
vxor v2, v2, v28 ; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
vxor v3, v3, v29 ; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
vxor v4, v4, v30 ; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
vandc v26,v7, v6 ; (~A[2..3][1] & A[2..3][2])
vandc v27,v8, v7 ; (~A[2..3][2] & A[2..3][3])
vandc v28,v9, v8 ; (~A[2..3][3] & A[2..3][4])
vandc v29,v5, v9 ; (~A[2..3][4] & A[2..3][0])
vandc v30,v6, v5 ; (~A[2..3][0] & A[2..3][1])
vxor v5, v5, v26 ; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
vxor v6, v6, v27 ; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
vxor v7, v7, v28 ; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
vxor v8, v8, v29 ; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
vxor v9, v9, v30 ; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
vxor v0, v0, v31 ; A[0][0] ^= iotas[index++]
vpermdi v26,v10,v11,0b10 ; A[4][1..2]
vpermdi v27,v12,v10,0b00 ; A[4][4..0]
vpermdi v28,v11,v12,0b10 ; A[4][3..4]
vpermdi v29,v10,v10,0b10 ; A[4][1..0]
vandc v26,v11,v26 ; (~A[4][1..2] & A[4][2..3])
vandc v27,v27,v28 ; (~A[4][3..4] & A[4][4..0])
vandc v28,v10,v29 ; (~A[4][1..0] & A[4][0..1])
vxor v10,v10,v26 ; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
vxor v11,v11,v27 ; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
vxor v12,v12,v28 ; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])
bdnz .Loop
vpermdi v12,v12,v12,0b11 ; broadcast A[4][4]
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size KeccakF1600_int,.-KeccakF1600_int
.type KeccakF1600,\@function
.align 5
KeccakF1600:
$STU $sp,-$FRAME($sp)
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mflr r8
mfspr r7, 256 ; save vrsave
stvx v20,r10,$sp
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
stw r7,`$FRAME-4`($sp) ; save vrsave
li r0, -1
$PUSH r8,`$FRAME+$LRSAVE`($sp)
mtspr 256, r0 ; preserve all AltiVec registers
li r11,16
lvx_4w v0,0,r3 ; load A[5][5]
li r10,32
lvx_4w v1,r11,r3
addi r11,r11,32
lvx_4w v2,r10,r3
addi r10,r10,32
lvx_4w v3,r11,r3
addi r11,r11,32
lvx_4w v4,r10,r3
addi r10,r10,32
lvx_4w v5,r11,r3
addi r11,r11,32
lvx_4w v6,r10,r3
addi r10,r10,32
lvx_4w v7,r11,r3
addi r11,r11,32
lvx_4w v8,r10,r3
addi r10,r10,32
lvx_4w v9,r11,r3
addi r11,r11,32
lvx_4w v10,r10,r3
addi r10,r10,32
lvx_4w v11,r11,r3
lvx_splt v12,r10,r3
bl PICmeup
li r11,16
lvx_u v13,0,r12 ; load rhotates
li r10,32
lvx_u v14,r11,r12
addi r11,r11,32
lvx_u v15,r10,r12
addi r10,r10,32
lvx_u v16,r11,r12
addi r11,r11,32
lvx_u v17,r10,r12
addi r10,r10,32
lvx_u v18,r11,r12
addi r11,r11,32
lvx_u v19,r10,r12
addi r10,r10,32
lvx_u v20,r11,r12
addi r11,r11,32
lvx_u v21,r10,r12
addi r10,r10,32
lvx_u v22,r11,r12
addi r11,r11,32
lvx_u v23,r10,r12
addi r10,r10,32
lvx_u v24,r11,r12
lvx_u v25,r10,r12
addi r12,r12,`16*16` ; points at iotas
bl KeccakF1600_int
li r11,16
stvx_4w v0,0,r3 ; return A[5][5]
li r10,32
stvx_4w v1,r11,r3
addi r11,r11,32
stvx_4w v2,r10,r3
addi r10,r10,32
stvx_4w v3,r11,r3
addi r11,r11,32
stvx_4w v4,r10,r3
addi r10,r10,32
stvx_4w v5,r11,r3
addi r11,r11,32
stvx_4w v6,r10,r3
addi r10,r10,32
stvx_4w v7,r11,r3
addi r11,r11,32
stvx_4w v8,r10,r3
addi r10,r10,32
stvx_4w v9,r11,r3
addi r11,r11,32
stvx_4w v10,r10,r3
addi r10,r10,32
stvx_4w v11,r11,r3
stvdx_u v12,r10,r3
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mtlr r8
mtspr 256, r7 ; restore vrsave
lvx v20,r10,$sp
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,0x04,1,0x80,0,1,0
.long 0
.size KeccakF1600,.-KeccakF1600
___
{
my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));
$code.=<<___;
.globl SHA3_absorb
.type SHA3_absorb,\@function
.align 5
SHA3_absorb:
$STU $sp,-$FRAME($sp)
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mflr r8
mfspr r7, 256 ; save vrsave
stvx v20,r10,$sp
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
stw r7,`$FRAME-4`($sp) ; save vrsave
li r0, -1
$PUSH r8,`$FRAME+$LRSAVE`($sp)
mtspr 256, r0 ; preserve all AltiVec registers
li r11,16
lvx_4w v0,0,$A_jagged ; load A[5][5]
li r10,32
lvx_4w v1,r11,$A_jagged
addi r11,r11,32
lvx_4w v2,r10,$A_jagged
addi r10,r10,32
lvx_4w v3,r11,$A_jagged
addi r11,r11,32
lvx_4w v4,r10,$A_jagged
addi r10,r10,32
lvx_4w v5,r11,$A_jagged
addi r11,r11,32
lvx_4w v6,r10,$A_jagged
addi r10,r10,32
lvx_4w v7,r11,$A_jagged
addi r11,r11,32
lvx_4w v8,r10,$A_jagged
addi r10,r10,32
lvx_4w v9,r11,$A_jagged
addi r11,r11,32
lvx_4w v10,r10,$A_jagged
addi r10,r10,32
lvx_4w v11,r11,$A_jagged
lvx_splt v12,r10,$A_jagged
bl PICmeup
li r11,16
lvx_u v13,0,r12 ; load rhotates
li r10,32
lvx_u v14,r11,r12
addi r11,r11,32
lvx_u v15,r10,r12
addi r10,r10,32
lvx_u v16,r11,r12
addi r11,r11,32
lvx_u v17,r10,r12
addi r10,r10,32
lvx_u v18,r11,r12
addi r11,r11,32
lvx_u v19,r10,r12
addi r10,r10,32
lvx_u v20,r11,r12
addi r11,r11,32
lvx_u v21,r10,r12
addi r10,r10,32
lvx_u v22,r11,r12
addi r11,r11,32
lvx_u v23,r10,r12
addi r10,r10,32
lvx_u v24,r11,r12
lvx_u v25,r10,r12
li r10,-32
li r11,-16
addi r12,r12,`16*16` ; points at iotas
b .Loop_absorb
.align 4
.Loop_absorb:
$UCMP $len,$bsz ; len < bsz?
blt .Labsorbed
sub $len,$len,$bsz ; len -= bsz
srwi r0,$bsz,3
mtctr r0
lvx_u v30,r10,r12 ; permutation masks
lvx_u v31,r11,r12
?vspltisb v27,7 ; prepare masks for byte swap
?vxor v30,v30,v27 ; on big-endian
?vxor v31,v31,v27
vxor v27,v27,v27 ; zero
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v0, v0, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v1, v1, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v2, v2, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v3, v3, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v4, v4, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v0, v0, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v1, v1, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v2, v2, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v3, v3, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v4, v4, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v5, v5, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v6, v6, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v7, v7, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v8, v8, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v9, v9, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v5, v5, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v6, v6, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v7, v7, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v8, v8, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v9, v9, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v10, v10, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v10, v10, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v30
vxor v11, v11, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v11, v11, v26
bdz .Lprocess_block
lvdx_u v26,0,$inp
addi $inp,$inp,8
vperm v26,v26,v27,v31
vxor v12, v12, v26
.Lprocess_block:
bl KeccakF1600_int
b .Loop_absorb
.align 4
.Labsorbed:
li r11,16
stvx_4w v0,0,$A_jagged ; return A[5][5]
li r10,32
stvx_4w v1,r11,$A_jagged
addi r11,r11,32
stvx_4w v2,r10,$A_jagged
addi r10,r10,32
stvx_4w v3,r11,$A_jagged
addi r11,r11,32
stvx_4w v4,r10,$A_jagged
addi r10,r10,32
stvx_4w v5,r11,$A_jagged
addi r11,r11,32
stvx_4w v6,r10,$A_jagged
addi r10,r10,32
stvx_4w v7,r11,$A_jagged
addi r11,r11,32
stvx_4w v8,r10,$A_jagged
addi r10,r10,32
stvx_4w v9,r11,$A_jagged
addi r11,r11,32
stvx_4w v10,r10,$A_jagged
addi r10,r10,32
stvx_4w v11,r11,$A_jagged
stvdx_u v12,r10,$A_jagged
mr r3,$len ; return value
li r10,`15+6*$SIZE_T`
li r11,`31+6*$SIZE_T`
mtlr r8
mtspr 256, r7 ; restore vrsave
lvx v20,r10,$sp
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,0x04,1,0x80,0,4,0
.long 0
.size SHA3_absorb,.-SHA3_absorb
___
}
{
my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));
$code.=<<___;
.globl SHA3_squeeze
.type SHA3_squeeze,\@function
.align 5
SHA3_squeeze:
mflr r9 ; r9 is not touched by KeccakF1600
subi $out,$out,1 ; prepare for stbu
addi r8,$A_jagged,4 ; prepare volatiles
mr r10,$bsz
li r11,0
b .Loop_squeeze
.align 4
.Loop_squeeze:
lwzx r7,r11,r8 ; lo
lwzx r0,r11,$A_jagged ; hi
${UCMP}i $len,8
blt .Lsqueeze_tail
stbu r7,1($out) ; write lo
srwi r7,r7,8
stbu r7,1($out)
srwi r7,r7,8
stbu r7,1($out)
srwi r7,r7,8
stbu r7,1($out)
stbu r0,1($out) ; write hi
srwi r0,r0,8
stbu r0,1($out)
srwi r0,r0,8
stbu r0,1($out)
srwi r0,r0,8
stbu r0,1($out)
subic. $len,$len,8
beqlr ; return if done
subic. r10,r10,8
ble .Loutput_expand
addi r11,r11,16 ; calculate jagged index
cmplwi r11,`16*5`
blt .Loop_squeeze
subi r11,r11,72
beq .Loop_squeeze
addi r11,r11,72
cmplwi r11,`16*5+8`
subi r11,r11,8
beq .Loop_squeeze
addi r11,r11,8
cmplwi r11,`16*10`
subi r11,r11,72
beq .Loop_squeeze
addi r11,r11,72
blt .Loop_squeeze
subi r11,r11,8
b .Loop_squeeze
.align 4
.Loutput_expand:
bl KeccakF1600
mtlr r9
addi r8,$A_jagged,4 ; restore volatiles
mr r10,$bsz
li r11,0
b .Loop_squeeze
.align 4
.Lsqueeze_tail:
mtctr $len
subic. $len,$len,4
ble .Loop_tail_lo
li r8,4
mtctr r8
.Loop_tail_lo:
stbu r7,1($out)
srdi r7,r7,8
bdnz .Loop_tail_lo
ble .Lsqueeze_done
mtctr $len
.Loop_tail_hi:
stbu r0,1($out)
srdi r0,r0,8
bdnz .Loop_tail_hi
.Lsqueeze_done:
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align 6
PICmeup:
mflr r0
bcl 20,31,\$+4
mflr r12 ; vvvvvv "distance" between . and 1st data entry
addi r12,r12,`64-8`
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
.type rhotates,\@object
.align 6
rhotates:
.quad 0, 36
.quad 1, 44
.quad 62, 6
.quad 28, 55
.quad 27, 20
.quad 3, 41
.quad 10, 45
.quad 43, 15
.quad 25, 21
.quad 39, 8
.quad 18, 2
.quad 61, 56
.quad 14, 14
.size rhotates,.-rhotates
.quad 0,0
.quad 0x0001020304050607,0x1011121314151617
.quad 0x1011121314151617,0x0001020304050607
.type iotas,\@object
iotas:
.quad 0x0000000000000001,0
.quad 0x0000000000008082,0
.quad 0x800000000000808a,0
.quad 0x8000000080008000,0
.quad 0x000000000000808b,0
.quad 0x0000000080000001,0
.quad 0x8000000080008081,0
.quad 0x8000000000008009,0
.quad 0x000000000000008a,0
.quad 0x0000000000000088,0
.quad 0x0000000080008009,0
.quad 0x000000008000000a,0
.quad 0x000000008000808b,0
.quad 0x800000000000008b,0
.quad 0x8000000000008089,0
.quad 0x8000000000008003,0
.quad 0x8000000000008002,0
.quad 0x8000000000000080,0
.quad 0x000000000000800a,0
.quad 0x800000008000000a,0
.quad 0x8000000080008081,0
.quad 0x8000000000008080,0
.quad 0x0000000080000001,0
.quad 0x8000000080008008,0
.size iotas,.-iotas
.asciz "Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
if ($flavour =~ /le$/) { # little-endian
s/\?([a-z]+)/;$1/;
} else { # big-endian
s/\?([a-z]+)/$1/;
}
print $_,"\n";
}
close STDOUT;
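The Theta section above computes the same column parities and mixers as the scalar modules, just two rows per vector register (see the layout comment near the top of this module). As a reminder, the step being vectorised is the standard Keccak theta; a plain C sketch, one lane at a time, is shown below (ROL64 is a local macro, valid for shift counts between 1 and 63).

#include <stdint.h>

#define ROL64(v, n) (((v) << (n)) | ((v) >> (64 - (n))))   /* 0 < n < 64 */

/* Theta: C[x] is the parity of column x, D[x] mixes the neighbouring
   columns back into every lane of column x. */
static void theta(uint64_t A[5][5])
{
    uint64_t C[5], D[5];

    for (int x = 0; x < 5; x++)
        C[x] = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x];
    for (int x = 0; x < 5; x++) {
        D[x] = C[(x + 4) % 5] ^ ROL64(C[(x + 1) % 5], 1);
        for (int y = 0; y < 5; y++)
            A[y][x] ^= D[x];
    }
}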

File diff suppressed because it is too large


@ -0,0 +1,329 @@
#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for Alpha.
# On 21264 performance is 33% better than code generated by the vendor
# compiler, 75% better than GCC [3.4], and in absolute terms it is
# 8.7 cycles per processed byte. Implementation features a vectorized
# byte swap, but not Xupdate.
@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7",
"\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
$ctx="a0"; # $16
$inp="a1";
$num="a2";
$A="a3";
$B="a4"; # 20
$C="a5";
$D="t8";
$E="t9"; @V=($A,$B,$C,$D,$E);
$t0="t10"; # 24
$t1="t11";
$t2="ra";
$t3="t12";
$K="AT"; # 28
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
ldq_u @X[0],0+0($inp)
ldq_u @X[1],0+7($inp)
___
$code.=<<___ if (!($i&1) && $i<14);
ldq_u @X[$i+2],($i+2)*4+0($inp)
ldq_u @X[$i+3],($i+2)*4+7($inp)
___
$code.=<<___ if (!($i&1) && $i<15);
extql @X[$i],$inp,@X[$i]
extqh @X[$i+1],$inp,@X[$i+1]
or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched
srl @X[$i],24,$t0 # vectorized byte swap
srl @X[$i],8,$t2
sll @X[$i],8,$t3
sll @X[$i],24,@X[$i]
zapnot $t0,0x11,$t0
zapnot $t2,0x22,$t2
zapnot @X[$i],0x88,@X[$i]
or $t0,$t2,$t0
zapnot $t3,0x44,$t3
sll $a,5,$t1
or @X[$i],$t0,@X[$i]
addl $K,$e,$e
and $b,$c,$t2
zapnot $a,0xf,$a
or @X[$i],$t3,@X[$i]
srl $a,27,$t0
bic $d,$b,$t3
sll $b,30,$b
extll @X[$i],4,@X[$i+1] # extract upper half
or $t2,$t3,$t2
addl @X[$i],$e,$e
addl $t1,$e,$e
srl $b,32,$t3
zapnot @X[$i],0xf,@X[$i]
addl $t0,$e,$e
addl $t2,$e,$e
or $t3,$b,$b
___
$code.=<<___ if (($i&1) && $i<15);
sll $a,5,$t1
addl $K,$e,$e
and $b,$c,$t2
zapnot $a,0xf,$a
srl $a,27,$t0
addl @X[$i%16],$e,$e
bic $d,$b,$t3
sll $b,30,$b
or $t2,$t3,$t2
addl $t1,$e,$e
srl $b,32,$t3
zapnot @X[$i],0xf,@X[$i]
addl $t0,$e,$e
addl $t2,$e,$e
or $t3,$b,$b
___
$code.=<<___ if ($i>=15); # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
and $b,$c,$t2
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
zapnot $a,0xf,$a
addl @X[$i%16],$e,$e
bic $d,$b,$t3
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
srl $a,27,$t0
addl $t1,$e,$e
or $t2,$t3,$t2
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
sll $b,30,$b
addl $t0,$e,$e
srl @X[$j%16],31,$t1
addl $t2,$e,$e
srl $b,32,$t3
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
zapnot @X[$i%16],0xf,@X[$i%16]
or $t1,@X[$j%16],@X[$j%16]
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79); # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
sll $b,30,$t3
addl $t1,$e,$e
xor $b,$c,$t2
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
srl $b,2,$b
addl @X[$i%16],$e,$e
xor $d,$t2,$t2
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
srl @X[$j%16],31,$t1
addl $t2,$e,$e
srl $a,27,$t0
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
addl $t0,$e,$e
or $t1,@X[$j%16],@X[$j%16]
___
$code.=<<___ if ($i<77);
zapnot @X[$i%16],0xf,@X[$i%16]
___
$code.=<<___ if ($i==79); # with context fetch
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
ldl @X[0],0($ctx)
sll $b,30,$t3
addl $t1,$e,$e
xor $b,$c,$t2
ldl @X[1],4($ctx)
srl $b,2,$b
addl @X[$i%16],$e,$e
xor $d,$t2,$t2
ldl @X[2],8($ctx)
srl $a,27,$t0
addl $t2,$e,$e
ldl @X[3],12($ctx)
or $t3,$b,$b
addl $t0,$e,$e
ldl @X[4],16($ctx)
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___; # with forward Xupdate
sll $a,5,$t1
addl $K,$e,$e
zapnot $a,0xf,$a
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
srl $a,27,$t0
and $b,$c,$t2
and $b,$d,$t3
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
sll $b,30,$b
addl $t1,$e,$e
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
srl @X[$j%16],31,$t1
addl $t0,$e,$e
or $t2,$t3,$t2
and $c,$d,$t3
or $t2,$t3,$t2
srl $b,32,$t3
addl @X[$i%16],$e,$e
addl @X[$j%16],@X[$j%16],@X[$j%16]
or $t3,$b,$b
addl $t2,$e,$e
or $t1,@X[$j%16],@X[$j%16]
zapnot @X[$i%16],0xf,@X[$i%16]
___
}
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl sha1_block_data_order
.align 5
.ent sha1_block_data_order
sha1_block_data_order:
lda sp,-64(sp)
stq ra,0(sp)
stq s0,8(sp)
stq s1,16(sp)
stq s2,24(sp)
stq s3,32(sp)
stq s4,40(sp)
stq s5,48(sp)
stq fp,56(sp)
.mask 0x0400fe00,-64
.frame sp,64,ra
.prologue 0
ldl $A,0($ctx)
ldl $B,4($ctx)
sll $num,6,$num
ldl $C,8($ctx)
ldl $D,12($ctx)
ldl $E,16($ctx)
addq $inp,$num,$num
.Lloop:
.set noreorder
ldah $K,23170(zero)
zapnot $B,0xf,$B
lda $K,31129($K) # K_00_19
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,28378(zero)
lda $K,-5215($K) # K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,-28900(zero)
lda $K,-17188($K) # K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldah $K,-13725(zero)
lda $K,-15914($K) # K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
addl @X[0],$A,$A
addl @X[1],$B,$B
addl @X[2],$C,$C
addl @X[3],$D,$D
addl @X[4],$E,$E
stl $A,0($ctx)
stl $B,4($ctx)
addq $inp,64,$inp
stl $C,8($ctx)
stl $D,12($ctx)
stl $E,16($ctx)
cmpult $inp,$num,$t1
bne $t1,.Lloop
.set noreorder
ldq ra,0(sp)
ldq s0,8(sp)
ldq s1,16(sp)
ldq s2,24(sp)
ldq s3,32(sp)
ldq s4,40(sp)
ldq s5,48(sp)
ldq fp,56(sp)
lda sp,64(sp)
ret (ra)
.end sha1_block_data_order
.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
$output=pop and open STDOUT,">$output";
print $code;
close STDOUT;
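The "vectorized byte swap" mentioned in the header is the srl/sll/zapnot sequence in BODY_00_19: one 64-bit register holds two big-endian 32-bit message words, and both are byte-swapped at once. A C sketch of the value that sequence computes is given below; the helper name is illustrative, not part of this module.

#include <stdint.h>

/* Swap the bytes of each of the two 32-bit words packed in v, mirroring
   the shift/zapnot masks applied to the freshly loaded message words. */
static uint64_t bswap32x2(uint64_t v)
{
    return ((v >> 24) & 0x000000ff000000ffULL) |
           ((v >>  8) & 0x0000ff000000ff00ULL) |
           ((v <<  8) & 0x00ff000000ff0000ULL) |
           ((v << 24) & 0xff000000ff000000ULL);
}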


@ -0,0 +1,742 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# sha1_block procedure for ARMv4.
#
# January 2007.
# Size/performance trade-off
# ====================================================================
# impl            size in bytes   comp cycles[*]   measured performance
# ====================================================================
# thumb           304             3212             4420
# armv4-small     392/+29%        1958/+64%        2250/+96%
# armv4-compact   740/+89%        1552/+26%        1840/+22%
# armv4-large     1420/+92%       1307/+19%        1370/+34%[***]
# full unroll     ~5100/+260%     ~1260/+4%        ~1300/+5%
# ====================================================================
# thumb = same as 'small' but in Thumb instructions[**] and
# with recurring code in two private functions;
# small = detached Xload/update, loops are folded;
# compact = detached Xload/update, 5x unroll;
# large = interleaved Xload/update, 5x unroll;
# full unroll = interleaved Xload/update, full unroll, estimated[!];
#
# [*] Manually counted instructions in "grand" loop body. Measured
# performance is affected by prologue and epilogue overhead,
# i-cache availability, branch penalties, etc.
# [**] While each Thumb instruction is half the size, they are not as
# diverse as ARM ones: e.g., there are only two arithmetic
# instructions with 3 arguments, no [fixed] rotate, and addressing
# modes are limited. As a result it takes more instructions to do
# the same job in Thumb; therefore the code is never twice as
# small and is always slower.
# [***] which is also ~35% better than compiler generated code. Dual-
# issue Cortex A8 core was measured to process input block in
# ~990 cycles.
# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code. Cortex-A15
# is even faster spending 5.6 cycles per byte outperforming integer-
# only code by factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
ldr $t0,[$Xi,#15*4]
ldr $t1,[$Xi,#13*4]
ldr $t2,[$Xi,#7*4]
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
___
}
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
str $t0,[$Xi,#-4]!
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
add $e,$e,$t1 @ E+=F_00_19(B,C,D)
___
}
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
add $e,$e,$t1 @ E+=F_20_39(B,C,D)
___
}
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
add $e,$e,$t1 @ E+=F_40_59(B,C,D)
add $e,$e,$t2,ror#2
___
}
$code=<<___;
#include "arm_arch.h"
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
.Lsha1_block:
adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
ldmia $ctx,{$a,$b,$c,$d,$e}
.Lloop:
ldr $K,.LK_00_19
mov $Xi,sp
sub sp,sp,#15*4
mov $c,$c,ror#30
mov $d,$d,ror#30
mov $e,$e,ror#30 @ [6]
.L_00_15:
___
for($i=0;$i<5;$i++) {
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
&BODY_00_15(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
&BODY_16_19(@V); unshift(@V,pop(@V));
$code.=<<___;
ldr $K,.LK_20_39 @ [+15+16*4]
cmn sp,#0 @ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
for($i=0;$i<5;$i++) {
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp @ preserve carry
#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
ldr $K,.LK_40_59
sub sp,sp,#20*4 @ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
#if defined(__thumb2__)
mov $t3,sp
teq $Xi,$t3
#else
teq $Xi,sp
#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
sub sp,sp,#20*4
cmp sp,#0 @ set carry to denote 60_79
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
.L_done:
add sp,sp,#80*4 @ "deallocate" stack frame
ldmia $ctx,{$K,$t0,$t1,$t2,$t3}
add $a,$K,$a
add $b,$t0,$b
add $c,$t1,$c,ror#2
add $d,$t2,$d,ror#2
add $e,$t3,$e,ror#2
stmia $ctx,{$a,$b,$c,$d,$e}
teq $inp,$len
bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha1_block_data_order,.-sha1_block_data_order
.align 5
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha1_block
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
#####################################################################
# NEON stuff
#
{{{
my @V=($a,$b,$c,$d,$e);
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4;
my @X=map("q$_",(8..11,0..3));
my @Tx=("q12","q13");
my ($K,$zero)=("q14","q15");
my $j=0;
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
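# (annotation, not in the original module) the AUTOLOAD thunk above turns a
# call such as &vshr_u32(@Tx[0],@Tx[1],30) into the text line
# "\tvshr.u32\tq12,q13,#30\n" appended to $code: the underscore in the sub
# name becomes a dot and a purely numeric last argument gets the "#" prefix.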
sub body_00_19 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&bic ($t0,$d,$b)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t1,$c,$b)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$t1,$t0)', # F_00_19
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_00_19
'$j++; unshift(@V,pop(@V));'
)
}
sub body_20_39 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&eor ($t0,$b,$d)',
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
'&eor ($t1,$t0,$c)', # F_20_39
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_20_39
'$j++; unshift(@V,pop(@V));'
)
}
sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'. # '$code.="@ $j\n";'.
'&add ($e,$e,$Ki)', # e+=X[i]+K
'&and ($t0,$c,$d)',
'&ldr ($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
'&add ($e,$e,$a,"ror#27")', # e+=ROR(A,27)
'&eor ($t1,$c,$d)',
'&add ($e,$e,$t0)',
'&and ($t1,$t1,$b)',
'&mov ($b,$b,"ror#2")', # b=ROR(b,2)
'&add ($e,$e,$t1);'. # e+=F_40_59
'$j++; unshift(@V,pop(@V));'
)
}
sub Xupdate_16_31 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@X[0],@X[-4&7],@X[-3&7],8); # compose "X[-14]" in "X[0]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&vext_8 (@Tx[0],@X[-1&7],$zero,4); # "X[-3]", 3 words
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[0]"^="X[-3]"^"X[-8]
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vext_8 (@Tx[1],$zero,@Tx[0],4); # "X[0]"<<96, extract one dword
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@Tx[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsri_32 (@X[0],@Tx[0],31); # "X[0]"<<<=1
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@Tx[0],@Tx[1],30);
eval(shift(@insns));
eval(shift(@insns));
&vshl_u32 (@Tx[1],@Tx[1],2);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
sub Xupdate_32_79 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vext_8 (@Tx[0],@X[-2&7],@X[-1&7],8); # compose "X[-6]"
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
eval(shift(@insns));
eval(shift(@insns));
&veor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!") if ($Xi%5==0);
eval(shift(@insns));
&veor (@Tx[0],@Tx[0],@X[0]); # "X[-6]"^="X[0]"
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 (@X[0],@Tx[0],30);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!"); # X[]+K xfer
&sub ($Xfer,$Xfer,64) if ($Xi%4==0);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 (@X[0],@Tx[0],2); # "X[0]"="X[-6]"<<<2
foreach (@insns) { eval; } # remaining instructions [if any]
$Xi++; push(@X,shift(@X)); # "rotate" X[]
}
sub Xuplast_80 ()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vadd_i32 (@Tx[1],@X[-1&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@Tx[1]}","[$Xfer,:128]!");
&sub ($Xfer,$Xfer,64);
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
&it ("eq");
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_8 ("{@X[-2&7]-@X[-1&7]}","[$inp]!");
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$K\[]}","[$K_XX_XX,:32]!"); # load K_00_19
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[-4&7],@X[-4&7]);
foreach (@insns) { eval; } # remaining instructions
$Xi=0;
}
sub Xloop()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e);
&vrev32_8 (@X[($Xi-3)&7],@X[($Xi-3)&7]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[$Xi&7],@X[($Xi-4)&7],$K);
eval(shift(@insns));
eval(shift(@insns));
&vst1_32 ("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU
foreach (@insns) { eval; }
$Xi++;
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type sha1_block_data_order_neon,%function
.align 4
sha1_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
add $len,$inp,$len,lsl#6 @ $len to point at the end of $inp
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
bic $Xfer,$Xfer,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
mov sp,$Xfer @ alloca
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
vld1.8 {@X[-2&7]-@X[-1&7]},[$inp]!
vld1.32 {${K}\[]},[$K_XX_XX,:32]! @ load K_00_19
vrev32.8 @X[-4&7],@X[-4&7] @ yes, even on
vrev32.8 @X[-3&7],@X[-3&7] @ big-endian...
vrev32.8 @X[-2&7],@X[-2&7]
vadd.i32 @X[0],@X[-4&7],$K
vrev32.8 @X[-1&7],@X[-1&7]
vadd.i32 @X[1],@X[-3&7],$K
vst1.32 {@X[0]},[$Xfer,:128]!
vadd.i32 @X[2],@X[-2&7],$K
vst1.32 {@X[1]},[$Xfer,:128]!
vst1.32 {@X[2]},[$Xfer,:128]!
ldr $Ki,[sp] @ big RAW stall
.Loop_neon:
___
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_16_31(\&body_00_19);
&Xupdate_32_79(\&body_00_19);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_20_39);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_40_59);
&Xupdate_32_79(\&body_20_39);
&Xuplast_80(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
&Xloop(\&body_20_39);
$code.=<<___;
ldmia $ctx,{$Ki,$t0,$t1,$Xfer} @ accumulate context
add $a,$a,$Ki
ldr $Ki,[$ctx,#16]
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
it eq
moveq sp,$saved_sp
add $e,$e,$Ki
it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon
@ vldmia sp!,{d8-d15}
ldmia sp!,{r4-r12,pc}
.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xf,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d|0x10
# endif
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
.LARMv8:
vstmdb sp!,{d8-d15} @ ABI specification says so
veor $E,$E,$E
adr r3,.LK_00_19
vld1.32 {$ABCD},[$ctx]!
vld1.32 {$E\[0]},[$ctx]
sub $ctx,$ctx,#16
vld1.32 {@Kxx[0]\[]},[r3,:32]!
vld1.32 {@Kxx[1]\[]},[r3,:32]!
vld1.32 {@Kxx[2]\[]},[r3,:32]!
vld1.32 {@Kxx[3]\[]},[r3,:32]
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vadd.i32 $W0,@Kxx[0],@MSG[0]
vrev32.8 @MSG[2],@MSG[2]
vmov $ABCD_SAVE,$ABCD @ offload
subs $len,$len,#1
vadd.i32 $W1,@Kxx[0],@MSG[1]
vrev32.8 @MSG[3],@MSG[3]
sha1h $E1,$ABCD @ 0
sha1c $ABCD,$E,$W0
vadd.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1$f $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
$code.=<<___;
sha1h $E0,$ABCD @ $i
sha1p $ABCD,$E1,$W1
vadd.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD @ 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD @ 19
sha1p $ABCD,$E1,$W1
vadd.i32 $E,$E,$E0
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
bne .Loop_v8
vst1.32 {$ABCD},[$ctx]!
vst1.32 {$E\[0]},[$ctx]
vldmia sp!,{d8-d15}
ret @ bx lr
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
{ my %opcode = (
"sha1c" => 0xf2000c40, "sha1p" => 0xf2100c40,
"sha1m" => 0xf2200c40, "sha1su0" => 0xf2300c40,
"sha1h" => 0xf3b902c0, "sha1su1" => 0xf3ba0380 );
sub unsha1 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
# this fix-up provides Thumb encoding in conjunction with INST
$word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
foreach (split($/,$code)) {
s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo or
s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;
s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
s/\bret\b/bx lr/o or
s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
print $_,$/;
}
close STDOUT; # enforce flush


@ -0,0 +1,364 @@
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#              hardware-assisted    software(*)
# Apple A7     2.31                 4.13 (+14%)
# Cortex-A53   2.24                 8.03 (+97%)
# Cortex-A57   2.35                 7.88 (+74%)
# Denver       2.13                 3.97 (+0%)(**)
# X-Gene                            8.80 (+200%)
# Mongoose     2.05                 6.50 (+160%)
# Kryo         1.88                 8.00 (+90%)
#
# (*) Software results are presented mostly for reference purposes.
# (**) Keep in mind that Denver relies on binary translation, which
# optimizes compiler output at run-time.
$flavour = shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
($ctx,$inp,$num)=("x0","x1","x2");
@Xw=map("w$_",(3..17,19));
@Xx=map("x$_",(3..17,19));
@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
($t0,$t1,$t2,$K)=map("w$_",(25..28));
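# (annotation, not in the original module) each round body below pre-adds K
# and the next round's X[i] into $d ("future e+=K" / "future e+=X[i]"); after
# the register rotation $d becomes the next round's $e, so those additions
# are hoisted off the critical dependency chain.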
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i<15 && !($i&1));
lsr @Xx[$i+1],@Xx[$i],#32
___
$code.=<<___ if ($i<14 && !($i&1));
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
___
$code.=<<___ if ($i<14 && ($i&1));
#ifdef __ARMEB__
ror @Xx[$i+1],@Xx[$i+1],#32
#else
rev32 @Xx[$i+1],@Xx[$i+1]
#endif
___
$code.=<<___ if ($i<14);
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==19);
movz $K,#0xeba1
movk $K,#0x6ed9,lsl#16
___
$code.=<<___ if ($i>=14);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
bic $t0,$d,$b
and $t1,$c,$b
ror $t2,$a,#27
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $d,$d,$K // future e+=K
orr $t0,$t0,$t1
add $e,$e,$t2 // e+=rot(a,5)
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==59);
movz $K,#0xc1d6
movk $K,#0xca62,lsl#16
___
$code.=<<___;
orr $t0,$b,$c
and $t1,$b,$c
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
ror $t2,$a,#27
and $t0,$t0,$d
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
add $e,$e,$t2 // e+=rot(a,5)
orr $t0,$t0,$t1
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=($i+2)&15;
$code.=<<___ if ($i==39);
movz $K,#0xbcdc
movk $K,#0x8f1b,lsl#16
___
$code.=<<___ if ($i<78);
eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15]
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15]
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
ror @Xw[$j],@Xw[$j],#31
___
$code.=<<___ if ($i==78);
ldp @Xw[1],@Xw[2],[$ctx]
eor $t0,$d,$b
ror $t2,$a,#27
add $d,$d,$K // future e+=K
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
add $d,$d,@Xw[($i+1)&15] // future e+=X[i]
add $e,$e,$t0 // e+=F(b,c,d)
___
$code.=<<___ if ($i==79);
ldp @Xw[3],@Xw[4],[$ctx,#8]
eor $t0,$d,$b
ror $t2,$a,#27
eor $t0,$t0,$c
add $e,$e,$t2 // e+=rot(a,5)
ror $b,$b,#2
ldr @Xw[5],[$ctx,#16]
add $e,$e,$t0 // e+=F(b,c,d)
___
}
$code.=<<___;
#include "arm_arch.h"
.text
.extern OPENSSL_armcap_P
.globl sha1_block_data_order
.type sha1_block_data_order,%function
.align 6
sha1_block_data_order:
#ifdef __ILP32__
ldrsw x16,.LOPENSSL_armcap_P
#else
ldr x16,.LOPENSSL_armcap_P
#endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
tst w16,#ARMV8_SHA1
b.ne .Lv8_entry
stp x29,x30,[sp,#-96]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldr $E,[$ctx,#16]
.Loop:
ldr @Xx[0],[$inp],#64
movz $K,#0x7999
sub $num,$num,#1
movk $K,#0x5a82,lsl#16
#ifdef __ARMEB__
ror $Xx[0],@Xx[0],#32
#else
rev32 @Xx[0],@Xx[0]
#endif
add $E,$E,$K // warm it up
add $E,$E,@Xw[0]
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $B,$B,@Xw[2]
add $C,$C,@Xw[3]
add $A,$A,@Xw[1]
add $D,$D,@Xw[4]
add $E,$E,@Xw[5]
stp $A,$B,[$ctx]
stp $C,$D,[$ctx,#8]
str $E,[$ctx,#16]
cbnz $num,.Loop
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldr x29,[sp],#96
ret
.size sha1_block_data_order,.-sha1_block_data_order
___
{{{
my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
my @MSG=map("v$_.16b",(4..7));
my @Kxx=map("v$_.4s",(16..19));
my ($W0,$W1)=("v20.4s","v21.4s");
my $ABCD_SAVE="v22.16b";
$code.=<<___;
.type sha1_block_armv8,%function
.align 6
sha1_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
adr x4,.Lconst
eor $E,$E,$E
ld1.32 {$ABCD},[$ctx],#16
ld1.32 {$E}[0],[$ctx]
sub $ctx,$ctx,#16
ld1.32 {@Kxx[0]-@Kxx[3]},[x4]
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
add.i32 $W0,@Kxx[0],@MSG[0]
rev32 @MSG[2],@MSG[2]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
add.i32 $W1,@Kxx[0],@MSG[1]
rev32 @MSG[3],@MSG[3]
sha1h $E1,$ABCD
sha1c $ABCD,$E,$W0 // 0
add.i32 $W0,@Kxx[$j],@MSG[2]
sha1su0 @MSG[0],@MSG[1],@MSG[2]
___
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1$f $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1su1 @MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
sha1su0 @MSG[1],@MSG[2],@MSG[3]
___
($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0);
push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0);
}
$code.=<<___;
sha1h $E0,$ABCD // $i
sha1p $ABCD,$E1,$W1
add.i32 $W1,@Kxx[$j],@MSG[3]
sha1h $E1,$ABCD // 18
sha1p $ABCD,$E0,$W0
sha1h $E0,$ABCD // 19
sha1p $ABCD,$E1,$W1
add.i32 $E,$E,$E0
add.i32 $ABCD,$ABCD,$ABCD_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD},[$ctx],#16
st1.32 {$E}[0],[$ctx]
ldr x29,[sp],#16
ret
.size sha1_block_armv8,.-sha1_block_armv8
.align 6
.Lconst:
.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___
}}}
{ my %opcode = (
"sha1c" => 0x5e000000, "sha1p" => 0x5e001000,
"sha1m" => 0x5e002000, "sha1su0" => 0x5e003000,
"sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 );
sub unsha1 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
s/\.\w?32\b//o and s/\.16b/\.4s/go;
m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go;
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,337 @@
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to intermediate ring
# buffer, but put more pressure on L1P [both because the code would be
# larger and won't be using SPLOOP buffer]. There are no plans to
# realize fully unrolled variant though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
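# (annotation, not in the original module) the value programmed into AMR
# below is 0x00050404: BK0 block size field = 5, i.e. 2^(5+1) = 64 bytes,
# with A5/B5 ($XPA/$XPB) set to mode 01, circular addressing using BK0.
# AMR is cleared again just before the function returns.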
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
($CTX,$INP,$NUM) = ("A4","B4","A6"); # arguments
($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5"); # X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
$code=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.asg sha1_block_data_order,_sha1_block_data_order
.endif
.asg B3,RA
.asg A15,FP
.asg B15,SP
.if .BIG_ENDIAN
.asg MV,SWAP2
.asg MV,SWAP4
.endif
.global _sha1_block_data_order
_sha1_block_data_order:
.asmfunc stack_usage(64)
MV $NUM,A0 ; reassign $NUM
|| MVK -64,B0
[!A0] BNOP RA ; if ($NUM==0) return;
|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
|| [A0] MV SP,FP
[A0] LDW *${CTX}[0],$A ; load A-E...
|| [A0] AND B0,SP,SP ; align stack at 64 bytes
[A0] LDW *${CTX}[1],$B
|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
[A0] LDW *${CTX}[2],$C
|| [A0] MVK 0x00404,B0
[A0] LDW *${CTX}[3],$D
|| [A0] MVKH 0x50000,B0 ; 0x050404, 64 bytes for $XP[AB]
[A0] LDW *${CTX}[4],$E
|| [A0] MVC B0,AMR ; setup circular addressing
LDNW *${INP}++,$TX1 ; pre-fetch input
NOP 1
loop?:
MVK 0x00007999,$K
|| ADDAW SP,2,$XPA
|| SUB A0,1,A0
|| MVK 13,B0
MVKH 0x5a820000,$K ; K_00_19
|| ADDAW SP,2,$XPB
|| MV $A,$Actx
|| MV $B,$Bctx
;;==================================================
SPLOOPD 5 ; BODY_00_13
|| MV $C,$Cctx
|| MV $D,$Dctx
|| MV $E,$Ectx
|| MVC B0,ILC
ROTL $A,5,$Arot
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K
XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
|| SWAP2 $TX1,$TX2
|| LDNW *${INP}++,$TX1
ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| SWAP4 $TX2,$TX3 ; byte swap
ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
ADD $TX3,$T,$A ; A=T+Xi
|| STW $TX3,*${XPB}++
SPKERNEL
;;==================================================
ROTL $A,5,$Arot ; BODY_14
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K
XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
|| SWAP2 $TX1,$TX2
|| LDNW *${INP}++,$TX1
ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| SWAP4 $TX2,$TX2 ; byte swap
|| LDW *${XPA}++,$X0 ; fetches from X ring buffer are
|| LDW *${XPB}[4],$X2 ; 2 iterations ahead
ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
;;==================================================
ROTL $A,5,$Arot ; BODY_15
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K
XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
|| SWAP2 $TX1,$TX2
ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| SWAP4 $TX2,$TX2 ; byte swap
|| XOR $X0,$X2,$TX0 ; Xupdate XORs are 1 iteration ahead
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2
ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
|| MVK 3,B0
;;==================================================
SPLOOPD 5 ; BODY_16_19
|| MVC B0,ILC
ROTL $A,5,$Arot
|| AND $C,$B,$F
|| ANDN $D,$B,$F0
|| ADD $K,$E,$T ; T=E+K
|| ROTL $TX1,1,$TX2 ; Xupdate output
XOR $F0,$F,$F ; F_00_19(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
ADD $F,$T,$T ; T+=F_00_19(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| XOR $X0,$X2,$TX0
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2
ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
SPKERNEL
MVK 0xffffeba1,$K
|| MVK 19,B0
MVKH 0x6ed90000,$K ; K_20_39
___
sub BODY_20_39 {
$code.=<<___;
;;==================================================
SPLOOPD 5 ; BODY_20_39
|| MVC B0,ILC
ROTL $A,5,$Arot
|| XOR $B,$C,$F
|| ADD $K,$E,$T ; T=E+K
|| ROTL $TX1,1,$TX2 ; Xupdate output
XOR $D,$F,$F ; F_20_39(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
ADD $F,$T,$T ; T+=F_20_39(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| XOR $X0,$X2,$TX0
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2
ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++ ; last one is redundant
|| XOR $TX0,$TX1,$TX1
SPKERNEL
___
$code.=<<___ if (!shift);
MVK 0xffffbcdc,$K
MVKH 0x8f1b0000,$K ; K_40_59
___
} &BODY_20_39();
$code.=<<___;
;;==================================================
SPLOOPD 5 ; BODY_40_59
|| MVC B0,ILC
|| AND $B,$C,$F
|| AND $B,$D,$F0
ROTL $A,5,$Arot
|| XOR $F0,$F,$F
|| AND $C,$D,$F0
|| ADD $K,$E,$T ; T=E+K
|| ROTL $TX1,1,$TX2 ; Xupdate output
XOR $F0,$F,$F ; F_40_59(B,C,D)
|| MV $D,$E ; E=D
|| MV $C,$D ; D=C
ADD $F,$T,$T ; T+=F_40_59(B,C,D)
|| ROTL $B,30,$C ; C=ROL(B,30)
|| XOR $X0,$X2,$TX0
|| LDW *${XPA}++,$X0
|| LDW *${XPB}[4],$X2
ADD $Arot,$T,$T ; T+=ROL(A,5)
|| MV $A,$B ; B=A
|| XOR $X8,$X13,$TX1
|| LDW *${XPA}[7],$X8
|| MV $TX3,$X13 ; || LDW *${XPB}[15],$X13
|| MV $TX2,$TX3
ADD $TX2,$T,$A ; A=T+Xi
|| STW $TX2,*${XPB}++
|| XOR $TX0,$TX1,$TX1
|| AND $B,$C,$F
|| AND $B,$D,$F0
SPKERNEL
MVK 0xffffc1d6,$K
|| MVK 18,B0
MVKH 0xca620000,$K ; K_60_79
___
&BODY_20_39(-1); # BODY_60_78
$code.=<<___;
;;==================================================
[A0] B loop?
|| ROTL $A,5,$Arot ; BODY_79
|| XOR $B,$C,$F
|| ROTL $TX1,1,$TX2 ; Xupdate output
[A0] LDNW *${INP}++,$TX1 ; pre-fetch input
|| ADD $K,$E,$T ; T=E+K
|| XOR $D,$F,$F ; F_20_39(B,C,D)
ADD $F,$T,$T ; T+=F_20_39(B,C,D)
|| ADD $Ectx,$D,$E ; E=D,E+=Ectx
|| ADD $Dctx,$C,$D ; D=C,D+=Dctx
|| ROTL $B,30,$C ; C=ROL(B,30)
ADD $Arot,$T,$T ; T+=ROL(A,5)
|| ADD $Bctx,$A,$B ; B=A,B+=Bctx
ADD $TX2,$T,$A ; A=T+Xi
ADD $Actx,$A,$A ; A+=Actx
|| ADD $Cctx,$C,$C ; C+=Cctx
;; end of loop?
BNOP RA ; return
|| MV FP,SP ; restore stack pointer
|| LDW *FP[0],FP ; restore frame pointer
STW $A,*${CTX}[0] ; emit A-E...
|| MVK 0,B0
STW $B,*${CTX}[1]
|| MVC B0,AMR ; clear AMR
STW $C,*${CTX}[2]
STW $D,*${CTX}[3]
STW $E,*${CTX}[4]
.endasmfunc
.sect .const
.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
print $code;
close STDOUT;


@ -0,0 +1,314 @@
#! /usr/bin/env perl
# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Eternal question is what's wrong with compiler generated code? The
# trick is that it's possible to reduce the number of shifts required
# to perform rotations by maintaining copy of 32-bit value in upper
# bits of 64-bit register. Just follow mux2 and shrp instructions...
# Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which
# is >50% better than HP C and >2x better than gcc.
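# A minimal Perl sketch (annotation, not part of the module; assumes a 64-bit
# perl) of the rotation trick described above: once the 32-bit value is
# replicated into both halves of a 64-bit word (what "mux2 ...,0x44" does),
# a single funnel shift ("shrp") produces any 32-bit rotation.
sub ror32 {
    my ($x, $n) = @_;                      # rotate 32-bit $x right by $n
    my $dup = ($x << 32) | $x;             # mux2: value in both halves
    return ($dup >> $n) & 0xffffffff;      # shrp: one shift does the rotate
}
ror32(0x12345678, 8) == 0x78123456 or die "ror32 sanity check";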
$output = pop;
$code=<<___;
.ident \"sha1-ia64.s, version 1.3\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit
___
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
#$human=1;
if ($human) { # useful for visual code auditing...
($A,$B,$C,$D,$E) = ("A","B","C","D","E");
($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "K_00_19","K_20_39","K_40_59","K_60_79" );
@X= ( "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
"X8", "X9","X10","X11","X12","X13","X14","X15" );
}
else {
($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4");
($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
( "r14", "r15", "loc10", "loc11" );
@X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" );
}
sub BODY_00_15 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___ if ($i==0);
{ .mmi; ld1 $X[$i]=[inp],2 // MSB
ld1 tmp2=[tmp3],2 };;
{ .mmi; ld1 tmp0=[inp],2
ld1 tmp4=[tmp3],2 // LSB
dep $X[$i]=$X[$i],tmp2,8,8 };;
___
if ($i<15) {
$code.=<<___;
{ .mmi; ld1 $Xn=[inp],2 // forward Xload
nop.m 0x0
dep tmp1=tmp0,tmp4,8,8 };;
{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload
and tmp4=$c,$b
dep $X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
{ .mmi; add $e=$e,$X[$i] // e+=Xload
or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 };; // a>>27
{ .mmi; ld1 tmp0=[inp],2 // forward Xload
add $e=$e,tmp4 // e+=F_00_19(b,c,d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload
or tmp5=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)
dep $Xn=$Xn,tmp2,8,8 // forward Xload
mux2 $X[$i]=$X[$i],0x44 } //;;
___
}
else {
$code.=<<___;
{ .mii; and tmp3=$c,$b
dep tmp1=tmp0,tmp4,8,8;;
dep $X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19
andcm tmp1=$d,$b
dep.z tmp5=$a,5,27 };; // a<<5
{ .mmi; add $e=$e,$X[$i] // e+=Xupdate
or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d)
xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
mux2 $X[$i]=$X[$i],0x44 };;
___
}
}
sub BODY_16_19 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___;
{ .mib; add $e=$e,$K_00_19 // e+=K_00_19
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; andcm tmp1=$d,$b
and tmp0=$c,$b };;
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
nop.i 0 };;
{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d)
xor $Xn=$Xn,tmp3 // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
sub BODY_20_39 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e,$Konst)=@_;
$Konst = $K_20_39 if (!defined($Konst));
my $j=$i+1;
my $Xn=@X[$j%16];
if ($i<79) {
$code.=<<___;
{ .mib; add $e=$e,$Konst // e+=K_XX_XX
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5)
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0 };;
___
}
else {
$code.=<<___;
{ .mib; add $e=$e,$Konst // e+=K_60_79
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; xor tmp0=$c,$b
add $h1=$h1,$a };; // wrap up
{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate
extr.u tmp1=$a,27,5 } // a>>27
{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d
add $h3=$h3,$c };; // wrap up
{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d)
or tmp1=tmp1,tmp5 // ROTATE(a,5)
shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;?
{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5)
add tmp3=1,inp // used in unaligned codepath
add $h4=$h4,$d };; // wrap up
___
}
}
sub BODY_40_59 {
local *code=shift;
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $Xn=@X[$j%16];
$code.=<<___;
{ .mib; add $e=$e,$K_40_59 // e+=K_40_59
dep.z tmp5=$a,5,27 } // a<<5
{ .mib; and tmp1=$c,$d
xor tmp0=$c,$d };;
{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate
add tmp5=tmp5,tmp1 // a<<5+(c&d)
extr.u tmp1=$a,27,5 } // a>>27
{ .mmi; and tmp0=tmp0,$b
xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate
xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate
{ .mmi; add $e=$e,tmp0 // e+=b&(c^d)
add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d)
shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30)
{ .mmi; xor $Xn=$Xn,tmp3
mux2 tmp6=$a,0x44 };; // see b in next iteration
{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d)
shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1)
nop.i 0x0 };;
___
}
sub BODY_60_79 { &BODY_20_39(@_,$K_60_79); }
$code.=<<___;
.text
tmp0=r8;
tmp1=r9;
tmp2=r10;
tmp3=r11;
ctx=r32; // in0
inp=r33; // in1
// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
.global sha1_block_data_order#
.proc sha1_block_data_order#
.align 32
sha1_block_data_order:
.prologue
{ .mmi; alloc tmp1=ar.pfs,3,14,0,0
$ADDP tmp0=4,ctx
.save ar.lc,r3
mov r3=ar.lc }
{ .mmi; $ADDP ctx=0,ctx
$ADDP inp=0,inp
mov r2=pr };;
tmp4=in2;
tmp5=loc12;
tmp6=loc13;
.body
{ .mlx; ld4 $h0=[ctx],8
movl $K_00_19=0x5a827999 }
{ .mlx; ld4 $h1=[tmp0],8
movl $K_20_39=0x6ed9eba1 };;
{ .mlx; ld4 $h2=[ctx],8
movl $K_40_59=0x8f1bbcdc }
{ .mlx; ld4 $h3=[tmp0]
movl $K_60_79=0xca62c1d6 };;
{ .mmi; ld4 $h4=[ctx],-16
add in2=-1,in2 // adjust num for ar.lc
mov ar.ec=1 };;
{ .mmi; nop.m 0
add tmp3=1,inp
mov ar.lc=in2 };; // brp.loop.imp: too far
.Ldtop:
{ .mmi; mov $A=$h0
mov $B=$h1
mux2 tmp6=$h1,0x44 }
{ .mmi; mov $C=$h2
mov $D=$h3
mov $E=$h4 };;
___
{ my $i;
my @V=($A,$B,$C,$D,$E);
for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
(($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check
}
$code.=<<___;
{ .mmb; add $h0=$h0,$A
add $h2=$h2,$C
br.ctop.dptk.many .Ldtop };;
.Ldend:
{ .mmi; add tmp0=4,ctx
mov ar.lc=r3 };;
{ .mmi; st4 [ctx]=$h0,8
st4 [tmp0]=$h1,8 };;
{ .mmi; st4 [ctx]=$h2,8
st4 [tmp0]=$h3 };;
{ .mib; st4 [ctx]=$h4,-16
mov pr=r2,0x1ffff
br.ret.sptk.many b0 };;
.endp sha1_block_data_order#
stringz "SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
open STDOUT,">$output" if $output;
print $code;

File diff suppressed because it is too large


@ -0,0 +1,461 @@
#! /usr/bin/env perl
# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for MIPS.
# Performance improvement is 30% on unaligned input. The "secret" is
# to deploy lwl/lwr pair to load unaligned input. One could have
# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
# compatible subroutine. There is room for minor optimization on
# little-endian platforms...
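# A minimal Perl sketch (annotation, not part of the module; assumes a 64-bit
# perl) of what the big-endian lwl/lwr pair achieves: a 32-bit load from an
# unaligned address assembled from the two aligned words that straddle it,
# without per-load shift setup.
sub unaligned_lw {
    my ($mem, $off) = @_;                                    # $mem is a byte string
    my $lo = unpack("N", substr($mem, $off & ~3, 4));        # word holding $off ("lwl")
    my $hi = unpack("N", substr($mem, ($off & ~3) + 4, 4));  # next aligned word ("lwr")
    my $s  = ($off & 3) * 8;
    return (($lo << $s) | ($hi >> (32 - $s))) & 0xffffffff;
}
unaligned_lw(pack("C*", 0..15), 3) == 0x03040506 or die "lwl/lwr sketch sanity";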
# September 2012.
#
# Add MIPS32r2 code (>25% less instructions).
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="daddu"; # incidentally works even on n32
$PTR_SUB="dsubu"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
$PTR_ADD="addu";
$PTR_SUB="subu";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
$SZREG=4;
}
#
# <appro@openssl.org>
#
######################################################################
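# (annotation, not in the original module) endianness heuristic: a compiler
# targeting big-endian MIPS predefines MIPSEB, so the token below is expanded
# away by the preprocessor, the match fails and $big_endian becomes 1;
# otherwise MIPSEB survives preprocessing and $big_endian becomes 0.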
$big_endian=(`echo MIPSEB | $ENV{CC} -E -`=~/MIPSEB/)?0:1 if ($ENV{CC});
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
# offsets of the Most and Least Significant Bytes
$MSB=$big_endian?0:3;
$LSB=3&~$MSB;
@X=map("\$$_",(8..23)); # a4-a7,s0-s11
$ctx=$a0;
$inp=$a1;
$num=$a2;
$A="\$1";
$B="\$2";
$C="\$3";
$D="\$7";
$E="\$24"; @V=($A,$B,$C,$D,$E);
$t0="\$25";
$t1=$num; # $num is offloaded to stack
$t2="\$30"; # fp
$K="\$31"; # ra
sub BODY_00_14 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
wsbh @X[$i],@X[$i] # byte swap($i)
rotr @X[$i],@X[$i],16
#else
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
sll @X[$i],@X[$i],24
andi $t1,0xFF00
sll $t2,$t2,8
or @X[$i],$t0
or $t1,$t2
or @X[$i],$t1
#endif
___
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
addu $e,$K # $i
xor $t0,$c,$d
rotr $t1,$a,27
and $t0,$b
addu $e,$t1
#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
lw @X[$j],$j*4($inp)
#else
lwl @X[$j],$j*4+$MSB($inp)
lwr @X[$j],$j*4+$LSB($inp)
#endif
xor $t0,$d
addu $e,@X[$i]
rotr $b,$b,2
addu $e,$t0
#else
lwl @X[$j],$j*4+$MSB($inp)
sll $t0,$a,5 # $i
addu $e,$K
lwr @X[$j],$j*4+$LSB($inp)
srl $t1,$a,27
addu $e,$t0
xor $t0,$c,$d
addu $e,$t1
sll $t2,$b,30
and $t0,$b
srl $b,$b,2
xor $t0,$d
addu $e,@X[$i]
or $b,$t2
addu $e,$t0
#endif
___
}
sub BODY_15_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if (!$big_endian && $i==15);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
wsbh @X[$i],@X[$i] # byte swap($i)
rotr @X[$i],@X[$i],16
#else
srl $t0,@X[$i],24 # byte swap($i)
srl $t1,@X[$i],8
andi $t2,@X[$i],0xFF00
sll @X[$i],@X[$i],24
andi $t1,0xFF00
sll $t2,$t2,8
or @X[$i],$t0
or @X[$i],$t1
or @X[$i],$t2
#endif
___
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
addu $e,$K # $i
xor @X[$j%16],@X[($j+2)%16]
xor $t0,$c,$d
rotr $t1,$a,27
xor @X[$j%16],@X[($j+8)%16]
and $t0,$b
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
xor $t0,$d
addu $e,@X[$i%16]
rotr @X[$j%16],@X[$j%16],31
rotr $b,$b,2
addu $e,$t0
#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
and $t0,$b
srl $t1,@X[$j%16],31
addu @X[$j%16],@X[$j%16]
srl $b,$b,2
xor $t0,$d
or @X[$j%16],$t1
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
#endif
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
xor @X[$j%16],@X[($j+2)%16]
addu $e,$K # $i
rotr $t1,$a,27
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
xor $t0,$b
addu $e,@X[$i%16]
rotr @X[$j%16],@X[$j%16],31
rotr $b,$b,2
addu $e,$t0
#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
xor $t0,$b
srl $t1,@X[$j%16],31
addu @X[$j%16],@X[$j%16]
srl $b,$b,2
addu $e,@X[$i%16]
or @X[$j%16],$t1
or $b,$t2
addu $e,$t0
#endif
___
$code.=<<___ if ($i==79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
lw @X[0],0($ctx)
addu $e,$K # $i
lw @X[1],4($ctx)
rotr $t1,$a,27
lw @X[2],8($ctx)
xor $t0,$c,$d
addu $e,$t1
lw @X[3],12($ctx)
xor $t0,$b
addu $e,@X[$i%16]
lw @X[4],16($ctx)
rotr $b,$b,2
addu $e,$t0
#else
lw @X[0],0($ctx)
sll $t0,$a,5 # $i
addu $e,$K
lw @X[1],4($ctx)
srl $t1,$a,27
addu $e,$t0
lw @X[2],8($ctx)
xor $t0,$c,$d
addu $e,$t1
lw @X[3],12($ctx)
sll $t2,$b,30
xor $t0,$b
lw @X[4],16($ctx)
srl $b,$b,2
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
#endif
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
addu $e,$K # $i
and $t0,$c,$d
xor @X[$j%16],@X[($j+2)%16]
rotr $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
xor $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
and $t0,$b
addu $e,@X[$i%16]
rotr @X[$j%16],@X[$j%16],31
rotr $b,$b,2
addu $e,$t0
#else
xor @X[$j%16],@X[($j+2)%16]
sll $t0,$a,5 # $i
addu $e,$K
srl $t1,$a,27
addu $e,$t0
xor @X[$j%16],@X[($j+8)%16]
and $t0,$c,$d
addu $e,$t1
xor @X[$j%16],@X[($j+13)%16]
sll $t2,$b,30
addu $e,$t0
srl $t1,@X[$j%16],31
xor $t0,$c,$d
addu @X[$j%16],@X[$j%16]
and $t0,$b
srl $b,$b,2
or @X[$j%16],$t1
addu $e,@X[$i%16]
or $b,$t2
addu $e,$t0
#endif
___
}
$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000";
$code=<<___;
#include "mips_arch.h"
.text
.set noat
.set noreorder
.align 5
.globl sha1_block_data_order
.ent sha1_block_data_order
sha1_block_data_order:
.frame $sp,$FRAMESIZE*$SZREG,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $ra,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $fp,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-9)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
$REG_S $s3,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-13)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-14)*$SZREG($sp)
$REG_S $gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
$PTR_SLL $num,6
$PTR_ADD $num,$inp
$REG_S $num,0($sp)
lw $A,0($ctx)
lw $B,4($ctx)
lw $C,8($ctx)
lw $D,12($ctx)
b .Loop
lw $E,16($ctx)
.align 4
.Loop:
.set reorder
#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
lui $K,0x5a82
lw @X[0],($inp)
ori $K,0x7999 # K_00_19
#else
lwl @X[0],$MSB($inp)
lui $K,0x5a82
lwr @X[0],$LSB($inp)
ori $K,0x7999 # K_00_19
#endif
___
for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0x6ed9
ori $K,0xeba1 # K_20_39
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0x8f1b
ori $K,0xbcdc # K_40_59
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
lui $K,0xca62
ori $K,0xc1d6 # K_60_79
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
$PTR_ADD $inp,64
$REG_L $num,0($sp)
addu $A,$X[0]
addu $B,$X[1]
sw $A,0($ctx)
addu $C,$X[2]
addu $D,$X[3]
sw $B,4($ctx)
addu $E,$X[4]
sw $C,8($ctx)
sw $D,12($ctx)
sw $E,16($ctx)
.set noreorder
bne $inp,$num,.Loop
nop
.set noreorder
$REG_L $ra,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $fp,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-9)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-10)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-13)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-14)*$SZREG($sp)
$REG_L $gp,($FRAMESIZE-15)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end sha1_block_data_order
.rdata
.asciiz "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;


@ -0,0 +1,279 @@
#! /usr/bin/env perl
# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for PA-RISC.
# June 2009.
#
# On PA-7100LC performance is >30% better than gcc 3.2 generated code
# for aligned input and >50% better for unaligned. Compared to vendor
# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
# few percent faster in 32-bit one (this for aligned input, data for
# unaligned input is not available).
#
# Special thanks to polarhome.com for providing HP-UX account.
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
} else {
$LEVEL ="1.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
}
$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
# [+ argument transfer]
$ctx="%r26"; # arg0
$inp="%r25"; # arg1
$num="%r24"; # arg2
$t0="%r28";
$t1="%r29";
$K="%r31";
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<15);
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
addl @X[$i],$e,$e
and $c,$b,$t0
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
addl $t0,$e,$e
___
$code.=<<___ if ($i>=15); # with forward Xupdate
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
and $c,$b,$t0
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
add $t0,$e,$e
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t0,$e,$e
___
$code.=<<___ if ($i==79); # with context load
ldw 0($ctx),@X[0] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
ldw 4($ctx),@X[1]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
ldw 8($ctx),@X[2]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
ldw 12($ctx),@X[3]
addl $t0,$e,$e
ldw 16($ctx),@X[4]
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
shd $a,$a,27,$t1 ; $i
addl $K,$e,$e
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
xor $d,$c,$t0
addl @X[$i%16],$e,$e
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
and $b,$t0,$t0
addl $t1,$e,$e
shd $b,$b,2,$b
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t0,$e,$e
and $d,$c,$t1
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t1,$e,$e
___
}
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
sha1_block_data_order
.PROC
.CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
ldw 0($ctx),$A
ldw 4($ctx),$B
ldw 8($ctx),$C
ldw 12($ctx),$D
ldw 16($ctx),$E
extru $inp,31,2,$t0 ; t0=inp&3;
sh3addl $t0,%r0,$t0 ; t0*=8;
subi 32,$t0,$t0 ; t0=32-t0;
mtctl $t0,%cr11 ; %sar=t0;
L\$oop
ldi 3,$t0
andcm $inp,$t0,$t0 ; 64-bit neutral
___
for ($i=0;$i<15;$i++) { # load input block
$code.="\tldw `4*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
ldw 60($t0),@X[15]
ldw 64($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align input
$code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
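# A Perl sketch of what the vshd-based alignment above computes: with %sar
# preset to 32-8*(inp&3), each aligned word is assembled from two adjacent
# big-endian source words (illustration only, assuming a Perl with 64-bit
# integers; align_word is a helper name added here, not part of the module):
sub align_word {
my ($hi,$lo,$misalign)=@_;		# $misalign = inp&3
my $sh=8*$misalign;
return $sh ? ((($hi<<$sh)|($lo>>(32-$sh)))&0xffffffff) : $hi;
}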
$code.=<<___;
L\$aligned
ldil L'0x5a827000,$K ; K_00_19
ldo 0x999($K),$K
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x6ed9e000,$K ; K_20_39
ldo 0xba1($K),$K
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x8f1bb000,$K ; K_40_59
ldo 0xcdc($K),$K
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0xca62c000,$K ; K_60_79
ldo 0x1d6($K),$K
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
addl @X[0],$A,$A
addl @X[1],$B,$B
addl @X[2],$C,$C
addl @X[3],$D,$D
addl @X[4],$E,$E
stw $A,0($ctx)
stw $B,4($ctx)
stw $C,8($ctx)
stw $D,12($ctx)
stw $E,16($ctx)
addib,*<> -1,$num,L\$oop
ldo 64($inp),$inp
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler/) {
$gnuas = 1;
}
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
s/,\*/,/ if ($SIZE_T==4);
s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;

View file

@ -0,0 +1,351 @@
#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank.
#
# (*) this means that this module is inappropriate for PPC403? Does
# anybody know if pre-POWER3 can sustain unaligned load?
# -m64 -m32
# ----------------------------------
# PPC970,gcc-4.0.0 +76% +59%
# Power6,xlc-7 +68% +33%
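# The Lunaligned path below follows the page-boundary precaution described
# above. A minimal Perl sketch of the block count it computes (illustration
# only, assuming 4096-byte pages; blocks_before_page_cross is a helper name
# added here, not part of the module):
sub blocks_before_page_cross {
my ($inp,$num)=@_;			# input address, 64-byte blocks remaining
my $dist=(4096-($inp&4095))&4095;	# bytes to the closest page boundary
my $fit=$dist>>6;			# whole blocks that fit before it
return $fit>=$num ? $num : $fit;	# hash these directly, copy the next one
}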
$flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$UCMP ="cmplw";
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
} else { die "nonsense $flavour"; }
# Define endianness based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;
$K ="r0";
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$t0 ="r15";
$t1 ="r6";
$A ="r7";
$B ="r8";
$C ="r9";
$D ="r10";
$E ="r11";
$T ="r12";
@V=($A,$B,$C,$D,$E,$T);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
sub loadbe {
my ($dst, $src, $temp_reg) = @_;
$code.=<<___ if (!$LITTLE_ENDIAN);
lwz $dst,$src
___
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $temp_reg,$src
rotlwi $dst,$temp_reg,8
rlwimi $dst,$temp_reg,24,0,7
rlwimi $dst,$temp_reg,24,16,23
___
}
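# The three rotate/insert instructions in the little-endian branch above are
# the usual PowerPC idiom for a 32-bit byte swap. A pure-Perl equivalent for
# reference (illustration only; bswap32 is a helper name added here, not part
# of the module):
sub bswap32 {
my $w=shift;
return (($w&0xff)<<24)|(($w&0xff00)<<8)|(($w>>8)&0xff00)|(($w>>24)&0xff);
}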
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
# Since the last value of $f is discarded, we can use
# it as a temp reg to swap byte-order when needed.
loadbe("@X[$i]","`$i*4`($inp)",$f) if ($i==0);
loadbe("@X[$j]","`$j*4`($inp)",$f) if ($i<15);
$code.=<<___ if ($i<15);
add $f,$K,$e
rotlwi $e,$a,5
add $f,$f,@X[$i]
and $t0,$c,$b
add $f,$f,$e
andc $t1,$d,$b
rotlwi $b,$b,30
or $t0,$t0,$t1
add $f,$f,$t0
___
$code.=<<___ if ($i>=15);
add $f,$K,$e
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
and $t0,$c,$b
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$e
andc $t1,$d,$b
rotlwi $b,$b,30
or $t0,$t0,$t1
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
add $f,$f,$t0
rotlwi @X[$j%16],@X[$j%16],1
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
add $f,$K,$e
xor $t0,$b,$d
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
xor $t0,$t0,$c
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$t0
rotlwi $b,$b,30
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
add $f,$f,$e
rotlwi @X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
add $f,$K,$e
xor $t0,$b,$d
rotlwi $e,$a,5
lwz r16,0($ctx)
add $f,$f,@X[$i%16]
xor $t0,$t0,$c
lwz r17,4($ctx)
add $f,$f,$t0
rotlwi $b,$b,30
lwz r18,8($ctx)
lwz r19,12($ctx)
add $f,$f,$e
lwz r20,16($ctx)
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
add $f,$K,$e
rotlwi $e,$a,5
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
add $f,$f,@X[$i%16]
and $t0,$b,$c
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
add $f,$f,$e
or $t1,$b,$c
rotlwi $b,$b,30
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
and $t1,$t1,$d
or $t0,$t0,$t1
rotlwi @X[$j%16],@X[$j%16],1
add $f,$f,$t0
___
}
$code=<<___;
.machine "any"
.text
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
lwz $D,12($ctx)
lwz $E,16($ctx)
andi. r0,$inp,3
bne Lunaligned
Laligned:
mtctr $num
bl Lsha1_block_private
b Ldone
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,4095 ; distance to closest page boundary
srwi. $t1,$t1,6 ; t1/=64
beq Lcross_page
$UCMP $num,$t1
ble Laligned ; didn't cross the page boundary
mtctr $t1
subfc $num,$t1,$num
bl Lsha1_block_private
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne Lunaligned
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
# This is private block function, which uses tailored calling
# interface, namely upon entry SHA_CTX is pre-loaded to given
# registers and counter register contains amount of chunks to
# digest...
$code.=<<___;
.align 4
Lsha1_block_private:
___
$code.=<<___; # load K_00_19
lis $K,0x5a82
ori $K,$K,0x7999
___
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_20_39
lis $K,0x6ed9
ori $K,$K,0xeba1
___
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_40_59
lis $K,0x8f1b
ori $K,$K,0xbcdc
___
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; # load K_60_79
lis $K,0xca62
ori $K,$K,0xc1d6
___
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add r16,r16,$E
add r17,r17,$T
add r18,r18,$A
add r19,r19,$B
add r20,r20,$C
stw r16,0($ctx)
mr $A,r16
stw r17,4($ctx)
mr $B,r17
stw r18,8($ctx)
mr $C,r18
stw r19,12($ctx)
mr $D,r19
stw r20,16($ctx)
mr $E,r20
addi $inp,$inp,`16*4`
bdnz Lsha1_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size .sha1_block_data_order,.-.sha1_block_data_order
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View file

@ -0,0 +1,249 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for s390x.
# April 2007.
#
# Performance is >30% better than gcc 3.3 generated code. But the real
# twist is that SHA1 hardware support is detected and utilized, in
# which case performance can improve by a further >4.5x for larger chunks.
# January 2009.
#
# Optimize Xupdate for amount of memory references and reschedule
# instructions to favour dual-issue z10 pipeline. On z10 hardware is
# "only" ~2.3x faster than software.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's a "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 it was measured to perform
# 23% better than code generated by gcc 4.3.
$kimdfunc=1; # magic function code for kimd instruction
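# Sketch of the capability test emitted further down before the hardware path
# is taken: bit N of the KIMD capability word, counted from its most
# significant bit, corresponds to function code N, so SHA-1 support is probed
# with mask 0x8000>>$kimdfunc. Illustration only; kimd_supports is a helper
# name added here and $cap_hi16 stands for the top halfword of that word.
sub kimd_supports {
my ($cap_hi16,$func)=@_;
return ($cap_hi16&(0x8000>>$func)) ? 1 : 0;
}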
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$K_00_39="%r0"; $K=$K_00_39;
$K_40_79="%r1";
$ctx="%r2"; $prefetch="%r2";
$inp="%r3";
$len="%r4";
$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9"; @V=($A,$B,$C,$D,$E);
$t0="%r10";
$t1="%r11";
@X=("%r12","%r13","%r14");
$sp="%r15";
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*4;
sub Xupdate {
my $i=shift;
$code.=<<___ if ($i==15);
lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
lr $X[0],$X[2]
___
return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
$code.=<<___ if ($i<16);
lg $X[0],`$i*4`($inp) ### Xload($i)
rllg $X[1],$X[0],32
___
$code.=<<___ if ($i>=16);
xgr $X[0],$prefetch ### Xupdate($i)
lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
xgr $X[0],$prefetch
rll $X[0],$X[0],1
rllg $X[1],$X[0],32
rll $X[1],$X[1],1
rllg $X[0],$X[1],32
lr $X[2],$X[1] # feedback
___
$code.=<<___ if ($i<=70);
stg $X[0],`$stdframe+4*($i%16)`($sp)
___
unshift(@X,pop(@X));
}
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$d
xr $t0,$c
alr $e,$t1
nr $t0,$b
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
xr $t0,$c
alr $e,$xi
xr $t0,$d
rll $b,$b,30
alr $e,$t0
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=$X[1];
&Xupdate($i);
$code.=<<___;
alr $e,$K ### $i
rll $t1,$a,5
lr $t0,$b
alr $e,$t1
or $t0,$c
lr $t1,$b
nr $t0,$d
nr $t1,$c
alr $e,$xi
or $t0,$t1
rll $b,$b,30
alr $e,$t0
___
}
$code.=<<___;
#include "s390x_arch.h"
.text
.align 64
.type Ktable,\@object
Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
.skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
.size Ktable,.-Ktable
.globl sha1_block_data_order
.type sha1_block_data_order,\@function
sha1_block_data_order:
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
lg %r0,S390X_KIMD(%r1) # check kimd capabilities
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
sllg %r3,$len,6
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 16
.Lsoftware:
___
$code.=<<___;
lghi %r1,-$frame
st${g} $ctx,`2*$SIZE_T`($sp)
stm${g} %r6,%r15,`6*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
larl $t0,Ktable
llgf $A,0($ctx)
llgf $B,4($ctx)
llgf $C,8($ctx)
llgf $D,12($ctx)
llgf $E,16($ctx)
lg $K_00_39,0($t0)
lg $K_40_79,8($t0)
.Lloop:
rllg $K_00_39,$K_00_39,32
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_00_39,$K_00_39,32
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___; $K=$K_40_79;
rllg $K_40_79,$K_40_79,32
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
rllg $K_40_79,$K_40_79,32
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,64($inp)
al $A,0($ctx)
al $B,4($ctx)
al $C,8($ctx)
al $D,12($ctx)
al $E,16($ctx)
st $A,0($ctx)
st $B,4($ctx)
st $C,8($ctx)
st $D,12($ctx)
st $E,16($ctx)
brct${g} $len,.Lloop
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size sha1_block_data_order,.-sha1_block_data_order
.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View file

@ -0,0 +1,434 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================
# Performance improvement is not really impressive on pre-T1 CPU: +8%
# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it
# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and
# >2x than 64-bit code generated by gcc 3.4. And there is a gimmick.
# X[16] vector is packed into 8 64-bit registers and as a result nothing
# is spilled on the stack. In addition, input data is loaded in a compact
# instruction sequence, thus minimizing the window when the code is
# subject to [inter-thread] cache-thrashing hazard. The goal is to
# ensure scalability on UltraSPARC T1, or rather to avoid decay when
# the number of active threads exceeds the number of physical cores.
# SPARC T4 SHA1 hardware achieves 3.72 cycles per byte, which is 3.1x
# faster than software. Multi-process benchmark saturates at 11x
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.
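# A Perl sketch of the X[] packing mentioned above (illustration only,
# assuming a Perl with 64-bit integers; pack_x/unpack_x are helper names
# added here, not part of the module): two 32-bit schedule words share one
# 64-bit register, the even-indexed word in the upper half and the
# odd-indexed one in the lower half, which is what the 8-byte big-endian
# loads below produce naturally.
sub pack_x   { my ($even,$odd)=@_; ($even<<32)|$odd }
sub unpack_x { my $pair=shift; (($pair>>32)&0xffffffff, $pair&0xffffffff) }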
$output=pop;
open STDOUT,">$output";
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$rot1m="%g2";
$tmp64="%g3";
$Xi="%g4";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$K_00_19="%l5";
$K_20_39="%l6";
$K_40_59="%l7";
$K_60_79="%g5";
@K=($K_00_19,$K_20_39,$K_40_59,$K_60_79);
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi=($i&1)?@X[($i/2)%8]:$Xi;
$code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
add $tmp0,$e,$e
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $xi,$e,$e
___
if ($i&1 && $i<15) {
$code.=
" srlx @X[(($i+1)/2)%8],32,$Xi\n";
}
$code.=<<___;
add $tmp1,$e,$e
___
}
sub Xupdate {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i/2;
if ($i&1) {
$code.=<<___;
sll $a,5,$tmp0 !! $i
add @K[$i/20],$e,$e
srl $a,27,$tmp1
___
} else {
$code.=<<___;
sllx @X[($j+6)%8],32,$Xi ! Xupdate($i)
xor @X[($j+1)%8],@X[$j%8],@X[$j%8]
srlx @X[($j+7)%8],32,$tmp1
xor @X[($j+4)%8],@X[$j%8],@X[$j%8]
sll $a,5,$tmp0 !! $i
or $tmp1,$Xi,$Xi
add @K[$i/20],$e,$e !!
xor $Xi,@X[$j%8],@X[$j%8]
srlx @X[$j%8],31,$Xi
add @X[$j%8],@X[$j%8],@X[$j%8]
and $Xi,$rot1m,$Xi
andn @X[$j%8],$rot1m,@X[$j%8]
srl $a,27,$tmp1 !!
or $Xi,@X[$j%8],@X[$j%8]
___
}
}
sub BODY_16_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
add $xi,$e,$e
andn $d,$b,$tmp1
srl $b,2,$b
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $xi,$e,$e
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $xi;
&Xupdate(@_);
if ($i&1) {
$xi=@X[($i/2)%8];
} else {
$xi=$Xi;
$code.="\tsrlx @X[($i/2)%8],32,$xi\n";
}
$code.=<<___;
add $tmp0,$e,$e !!
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
or $c,$b,$tmp1
srl $b,2,$b
and $d,$tmp1,$tmp1
add $xi,$e,$e
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
___
}
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.align 32
.globl sha1_block_data_order
sha1_block_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
andcc %g1, CFR_SHA1, %g0
be .Lsoftware
nop
ld [%o0 + 0x00], %f0 ! load context
ld [%o0 + 0x04], %f1
ld [%o0 + 0x08], %f2
andcc %o1, 0x7, %g0
ld [%o0 + 0x0c], %f3
bne,pn %icc, .Lhwunaligned
ld [%o0 + 0x10], %f4
.Lhw_loop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
.word 0x81b02820 ! SHA1
bne,pt SIZE_T_CC, .Lhw_loop
nop
.Lhwfinish:
st %f0, [%o0 + 0x00] ! store context
st %f1, [%o0 + 0x04]
st %f2, [%o0 + 0x08]
st %f3, [%o0 + 0x0c]
retl
st %f4, [%o0 + 0x10]
.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
.word 0x81b02820 ! SHA1
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
.align 16
.Lsoftware:
save %sp,-STACK_FRAME,%sp
sllx $len,6,$len
add $inp,$len,$len
or %g0,1,$rot1m
sllx $rot1m,32,$rot1m
or $rot1m,1,$rot1m
ld [$ctx+0],$A
ld [$ctx+4],$B
ld [$ctx+8],$C
ld [$ctx+12],$D
ld [$ctx+16],$E
andn $inp,7,$tmp0
sethi %hi(0x5a827999),$K_00_19
or $K_00_19,%lo(0x5a827999),$K_00_19
sethi %hi(0x6ed9eba1),$K_20_39
or $K_20_39,%lo(0x6ed9eba1),$K_20_39
sethi %hi(0x8f1bbcdc),$K_40_59
or $K_40_59,%lo(0x8f1bbcdc),$K_40_59
sethi %hi(0xca62c1d6),$K_60_79
or $K_60_79,%lo(0xca62c1d6),$K_60_79
.Lloop:
ldx [$tmp0+0],@X[0]
ldx [$tmp0+16],@X[2]
ldx [$tmp0+32],@X[4]
ldx [$tmp0+48],@X[6]
and $inp,7,$tmp1
ldx [$tmp0+8],@X[1]
sll $tmp1,3,$tmp1
ldx [$tmp0+24],@X[3]
subcc %g0,$tmp1,$tmp2 ! should be 64-$tmp1, but -$tmp1 works too
ldx [$tmp0+40],@X[5]
bz,pt %icc,.Laligned
ldx [$tmp0+56],@X[7]
sllx @X[0],$tmp1,@X[0]
ldx [$tmp0+64],$tmp64
___
for($i=0;$i<7;$i++)
{ $code.=<<___;
srlx @X[$i+1],$tmp2,$Xi
sllx @X[$i+1],$tmp1,@X[$i+1]
or $Xi,@X[$i],@X[$i]
___
}
$code.=<<___;
srlx $tmp64,$tmp2,$tmp64
or $tmp64,@X[7],@X[7]
.Laligned:
srlx @X[0],32,$Xi
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
for (;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ld [$ctx+0],@X[0]
ld [$ctx+4],@X[1]
ld [$ctx+8],@X[2]
ld [$ctx+12],@X[3]
add $inp,64,$inp
ld [$ctx+16],@X[4]
cmp $inp,$len
add $A,@X[0],$A
st $A,[$ctx+0]
add $B,@X[1],$B
st $B,[$ctx+4]
add $C,@X[2],$C
st $C,[$ctx+8]
add $D,@X[3],$D
st $D,[$ctx+12]
add $E,@X[4],$E
st $E,[$ctx+16]
bne SIZE_T_CC,.Lloop
andn $inp,7,$tmp0
ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "faligndata" => 0x048,
"for" => 0x07c );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/ge;
s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

View file

@ -0,0 +1,608 @@
#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2009
#
# Provided that UltraSPARC VIS instructions are pipe-lined(*) and
# pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
# Graphic Unit would make it possible to achieve higher instruction-
# level parallelism, ILP, and thus higher performance. It should be
# explicitly noted that ILP is the keyword, and it means that this
# code would be unsuitable for cores like UltraSPARC-Tx. The idea is
# not really novel, Sun had VIS-powered implementation for a while.
# Unlike Sun's implementation this one can process multiple unaligned
# input blocks, and as such works as drop-in replacement for OpenSSL
# sha1_block_data_order. Performance improvement was measured to be
# 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
# UltraSPARC-III. See below for discussion...
#
# The module does not present direct interest for OpenSSL, because
# it doesn't provide better performance on contemporary SPARCv9 CPUs,
# UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
# absolutely must score on UltraSPARC-I-IV can simply replace
# crypto/sha/asm/sha1-sparcv9.pl with this module.
#
# (*) "Pipe-lined" means that even if it takes several cycles to
# complete, next instruction using same functional unit [but not
# depending on the result of the current instruction] can start
# execution without having to wait for the unit. "Pairable"
# means that two [or more] independent instructions can be
# issued at the very same time.
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$output=shift;
open STDOUT,">$output";
$ctx="%i0";
$inp="%i1";
$len="%i2";
$tmp0="%i3";
$tmp1="%i4";
$tmp2="%i5";
$tmp3="%g5";
$base="%g1";
$align="%g4";
$Xfer="%o5";
$nXfer=$tmp3;
$Xi="%o7";
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
@V=($A,$B,$C,$D,$E);
$Actx="%o0";
$Bctx="%o1";
$Cctx="%o2";
$Dctx="%o3";
$Ectx="%o4";
$fmul="%f32";
$VK_00_19="%f34";
$VK_20_39="%f36";
$VK_40_59="%f38";
$VK_60_79="%f40";
@VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
@X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
"%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
# This is reference 2x-parallelized VIS-powered Xupdate procedure. It
# covers even K_NN_MM addition...
sub Xupdate {
my ($i)=@_;
my $K=@VK[($i+16)/20];
my $j=($i+16)%16;
# [ provided that GSR.alignaddr_offset is 5, $fmul contains
# 0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
# chosen registers... ]
$code.=<<___;
fxors @X[($j+13)%16],@X[$j],@X[$j] !-1/-1/-1:X[0]^=X[13]
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
![fxors %f15,%f2,%f2]
for %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
![fxors %f0,%f3,%f3] !10/17/12:X[0] dependency
fpadd32 $K,@X[$j],%f20
std %f20,[$Xfer+`4*$j`]
___
# The numbers delimited with slash are the earliest possible dispatch
# cycles for given instruction assuming 1 cycle latency for simple VIS
# instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as
# on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being
# 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
# round. As [long as] FPU/VIS instructions are perfectly pairable with
# IALU ones, the round timing is defined by the maximum between VIS
# and IALU timings. The latter varies from round to round and averages
# out at 6.25 ticks. This means that USI&II should operate at IALU
# rate, while USIII&IV at VIS rate. This explains why the performance
# improvement varies among processors, given that the pure IALU
# sha1-sparcv9.pl module exhibits virtually uniform performance of
# ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical
# lower limits. Real-life performance was measured to be 6.6 cycles
# per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
# half-round VIS timing, because there are 16 Xupdate-free rounds,
# which "push down" average theoretical timing to 8 cycles...
# (*) SPARC64-V[II] was originally believed to have 2 cycles VIS
# latency. Well, it might have, but it doesn't have dedicated
# VIS-unit. Instead, VIS instructions are executed by other
# functional units, ones used here - by IALU. This doesn't
# improve effective ILP...
}
# The reference Xupdate procedure is then "strained" over *pairs* of
# BODY_NN_MM and kind of modulo-scheduled in respect to X[n]^=X[n+13]
# and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
# plenty of room to amortize for read-after-write hazard, as well as
# to fetch and align input for the next spin. The VIS instructions are
# scheduled for latency of 2 cycles, because there are not enough IALU
# instructions to schedule for latency of 3, while scheduling for 1
# would give no gain on USI&II anyway.
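# For reference, the scalar recurrence each (2x-parallelized) Xupdate
# evaluates is the standard SHA-1 message schedule over a 16-word circular
# buffer. A minimal Perl sketch (illustration only, assuming a Perl with
# 64-bit integers; xupdate_ref is a helper name added here):
sub xupdate_ref {
my ($X,$j)=@_;				# $X = reference to 16-element word array
my $w = $X->[$j]^$X->[($j+2)%16]^$X->[($j+8)%16]^$X->[($j+13)%16];
$X->[$j] = (($w<<1)|($w>>31))&0xffffffff;	# X[i] = ROTL1(X[i-16]^X[i-14]^X[i-8]^X[i-3])
}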
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1));
sll $a,5,$tmp0 !! $i
and $c,$b,$tmp3
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
sll $b,30,$tmp2
add $tmp1,$e,$e
andn $d,$b,$tmp1
add $Xi,$e,$e
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
or $tmp1,$tmp3,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1);
sll $a,5,$tmp0 !! $i
and $c,$b,$tmp3
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
sll $b,30,$tmp2
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
andn $d,$b,$tmp1
add $Xi,$e,$e
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
or $tmp1,$tmp3,$tmp1
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
or $tmp2,$b,$b
add $tmp1,$e,$e
___
$code.=<<___ if ($i&1 && $i>=2);
std %f20,[$Xfer+`4*$l`] !
___
}
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1) && $i<64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
___
$code.=<<___ if ($i&1 && $i<64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
xor $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
add $tmp1,$e,$e
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
or $tmp2,$b,$b
add $Xi,$e,$e
std %f20,[$Xfer+`4*$l`] !
___
$code.=<<___ if ($i==64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fpadd32 $K,@X[$l],%f20
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
std %f20,[$Xfer+`4*$l`]
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i>64);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
}
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $k=($j+16+2)%16; # ahead reference
my $l=($j+16-2)%16; # behind reference
my $K=@VK[($j+16-2)/20];
$j=($j+16)%16;
$code.=<<___ if (!($i&1));
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
fxors @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
srl $a,27,$tmp1
add $tmp0,$e,$e
fxor @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
and $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
or $c,$b,$tmp1
fxor %f18,@X[$j],@X[$j] ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
srl $b,2,$b
and $d,$tmp1,$tmp1
add $Xi,$e,$e
or $tmp1,$tmp0,$tmp1
faligndata @X[$j],@X[$j],%f18 ! 3/ 7/ 5:Tmp=X[0,1]>>>24
or $tmp2,$b,$b
add $tmp1,$e,$e
fpadd32 @X[$j],@X[$j],@X[$j] ! 4/ 8/ 6:X[0,1]<<=1
___
$code.=<<___ if ($i&1);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
fmul8ulx16 %f18,$fmul,%f18 ! 5/10/ 7:Tmp>>=7, Tmp&=1
and $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $K,@X[$l],%f20 !
sll $b,30,$tmp2
or $c,$b,$tmp1
fxors @X[($k+13)%16],@X[$k],@X[$k] !-1/-1/-1:X[0]^=X[13]
srl $b,2,$b
and $d,$tmp1,$tmp1
fxor %f18,@X[$j],@X[$j] ! 8/14/10:X[0,1]|=Tmp
add $Xi,$e,$e
or $tmp1,$tmp0,$tmp1
or $tmp2,$b,$b
add $tmp1,$e,$e
std %f20,[$Xfer+`4*$l`] !
___
}
# If there is more data to process, then we pre-fetch the data for
# the next iteration during the last ten rounds...
sub BODY_70_79 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i&~1;
my $m=($i%8)*2;
$j=($j+16)%16;
$code.=<<___ if ($i==70);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
ldd [$inp+64],@X[0]
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
and $inp,-64,$nXfer
inc 64,$inp
and $nXfer,255,$nXfer
alignaddr %g0,$align,%g0
add $base,$nXfer,$nXfer
___
$code.=<<___ if ($i==71);
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i>=72);
faligndata @X[$m],@X[$m+2],@X[$m]
sll $a,5,$tmp0 !! $i
ld [$Xfer+`4*($i%16)`],$Xi
srl $a,27,$tmp1
add $tmp0,$e,$e
xor $c,$b,$tmp0
add $tmp1,$e,$e
fpadd32 $VK_00_19,@X[$m],%f20
sll $b,30,$tmp2
xor $d,$tmp0,$tmp1
srl $b,2,$b
add $tmp1,$e,$e
or $tmp2,$b,$b
add $Xi,$e,$e
___
$code.=<<___ if ($i<77);
ldd [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
___
$code.=<<___ if ($i==77); # redundant if $inp was aligned
add $align,63,$tmp0
and $tmp0,-8,$tmp0
ldd [$inp+$tmp0],@X[16]
___
$code.=<<___ if ($i>=72);
std %f20,[$nXfer+`4*$m`]
___
}
$code.=<<___;
.section ".text",#alloc,#execinstr
.align 64
vis_const:
.long 0x5a827999,0x5a827999 ! K_00_19
.long 0x6ed9eba1,0x6ed9eba1 ! K_20_39
.long 0x8f1bbcdc,0x8f1bbcdc ! K_40_59
.long 0xca62c1d6,0xca62c1d6 ! K_60_79
.long 0x00000100,0x00000100
.align 64
.type vis_const,#object
.size vis_const,(.-vis_const)
.globl sha1_block_data_order
sha1_block_data_order:
save %sp,-$frame,%sp
add %fp,$bias-256,$base
1: call .+8
add %o7,vis_const-1b,$tmp0
ldd [$tmp0+0],$VK_00_19
ldd [$tmp0+8],$VK_20_39
ldd [$tmp0+16],$VK_40_59
ldd [$tmp0+24],$VK_60_79
ldd [$tmp0+32],$fmul
ld [$ctx+0],$Actx
and $base,-256,$base
ld [$ctx+4],$Bctx
sub $base,$bias+$frame,%sp
ld [$ctx+8],$Cctx
and $inp,7,$align
ld [$ctx+12],$Dctx
and $inp,-8,$inp
ld [$ctx+16],$Ectx
! X[16] is maintained in FP register bank
alignaddr %g0,$align,%g0
ldd [$inp+0],@X[0]
sub $inp,-64,$Xfer
ldd [$inp+8],@X[2]
and $Xfer,-64,$Xfer
ldd [$inp+16],@X[4]
and $Xfer,255,$Xfer
ldd [$inp+24],@X[6]
add $base,$Xfer,$Xfer
ldd [$inp+32],@X[8]
ldd [$inp+40],@X[10]
ldd [$inp+48],@X[12]
brz,pt $align,.Laligned
ldd [$inp+56],@X[14]
ldd [$inp+64],@X[16]
faligndata @X[0],@X[2],@X[0]
faligndata @X[2],@X[4],@X[2]
faligndata @X[4],@X[6],@X[4]
faligndata @X[6],@X[8],@X[6]
faligndata @X[8],@X[10],@X[8]
faligndata @X[10],@X[12],@X[10]
faligndata @X[12],@X[14],@X[12]
faligndata @X[14],@X[16],@X[14]
.Laligned:
mov 5,$tmp0
dec 1,$len
alignaddr %g0,$tmp0,%g0
fpadd32 $VK_00_19,@X[0],%f16
fpadd32 $VK_00_19,@X[2],%f18
fpadd32 $VK_00_19,@X[4],%f20
fpadd32 $VK_00_19,@X[6],%f22
fpadd32 $VK_00_19,@X[8],%f24
fpadd32 $VK_00_19,@X[10],%f26
fpadd32 $VK_00_19,@X[12],%f28
fpadd32 $VK_00_19,@X[14],%f30
std %f16,[$Xfer+0]
mov $Actx,$A
std %f18,[$Xfer+8]
mov $Bctx,$B
std %f20,[$Xfer+16]
mov $Cctx,$C
std %f22,[$Xfer+24]
mov $Dctx,$D
std %f24,[$Xfer+32]
mov $Ectx,$E
std %f26,[$Xfer+40]
fxors @X[13],@X[0],@X[0]
std %f28,[$Xfer+48]
ba .Loop
std %f30,[$Xfer+56]
.align 32
.Loop:
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for (;$i<70;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
tst $len
bz,pn `$bits==32?"%icc":"%xcc"`,.Ltail
nop
___
for (;$i<80;$i++) { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $A,$Actx,$Actx
add $B,$Bctx,$Bctx
add $C,$Cctx,$Cctx
add $D,$Dctx,$Dctx
add $E,$Ectx,$Ectx
mov 5,$tmp0
fxors @X[13],@X[0],@X[0]
mov $Actx,$A
mov $Bctx,$B
mov $Cctx,$C
mov $Dctx,$D
mov $Ectx,$E
alignaddr %g0,$tmp0,%g0
dec 1,$len
ba .Loop
mov $nXfer,$Xfer
.align 32
.Ltail:
___
for($i=70;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
add $A,$Actx,$Actx
add $B,$Bctx,$Bctx
add $C,$Cctx,$Cctx
add $D,$Dctx,$Dctx
add $E,$Ectx,$Ectx
st $Actx,[$ctx+0]
st $Bctx,[$ctx+4]
st $Cctx,[$ctx+8]
st $Dctx,[$ctx+12]
st $Ectx,[$ctx+16]
ret
restore
.type sha1_block_data_order,#function
.size sha1_block_data_order,(.-sha1_block_data_order)
.asciz "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = ( "fmul8ulx16" => 0x037,
"faligndata" => 0x048,
"fpadd32" => 0x052,
"fxor" => 0x06c,
"fxors" => 0x06d );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/gem;
$code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/gem;
print $code;
close STDOUT;

View file

@ -0,0 +1,266 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# sha1_block for Thumb.
#
# January 2007.
#
# The code does not present direct interest to OpenSSL, because of low
# performance. Its purpose is to establish _size_ benchmark. Pretty
# useless one I must say, because 30% or 88 bytes larger ARMv4 code
# [available on demand] is almost _twice_ as fast. It should also be
# noted that in-lining of .Lcommon and .Lrotate improves performance
# by over 40%, while code increases by only 10% or 32 bytes. But once
# again, the goal was to establish _size_ benchmark, not performance.
$output=shift;
open STDOUT,">$output";
$inline=0;
#$cheat_on_binutils=1;
$t0="r0";
$t1="r1";
$t2="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8"; # "upper" registers can be used in add/sub and mov insns
$ctx="r9";
$inp="r10";
$len="r11";
$Xi="r12";
sub common {
<<___;
sub $t0,#4
ldr $t1,[$t0]
add $e,$K @ E+=K_xx_xx
lsl $t2,$a,#5
add $t2,$e
lsr $e,$a,#27
add $t2,$e @ E+=ROR(A,27)
add $t2,$t1 @ E+=X[i]
___
}
sub rotate {
<<___;
mov $e,$d @ E=D
mov $d,$c @ D=C
lsl $c,$b,#30
lsr $b,$b,#2
orr $c,$b @ C=ROR(B,2)
mov $b,$a @ B=A
add $a,$t2,$t1 @ A=E+F_xx_xx(B,C,D)
___
}
sub BODY_00_19 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$c
eor $t1,$d
and $t1,$b
eor $t1,$d @ F_00_19(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
sub BODY_20_39 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$b
eor $t1,$c
eor $t1,$d @ F_20_39(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
sub BODY_40_59 {
$code.=$inline?&common():"\tbl .Lcommon\n";
$code.=<<___;
mov $t1,$b
and $t1,$c
mov $e,$b
orr $e,$c
and $e,$d
orr $t1,$e @ F_40_59(B,C,D)
___
$code.=$inline?&rotate():"\tbl .Lrotate\n";
}
$code=<<___;
.text
.code 16
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 2
sha1_block_data_order:
___
if ($cheat_on_binutils) {
$code.=<<___;
.code 32
add r3,pc,#1
bx r3 @ switch to Thumb ISA
.code 16
___
}
$code.=<<___;
push {r4-r7}
mov r3,r8
mov r4,r9
mov r5,r10
mov r6,r11
mov r7,r12
push {r3-r7,lr}
lsl r2,#6
mov $ctx,r0 @ save context
mov $inp,r1 @ save inp
mov $len,r2 @ save len
add $len,$inp @ $len to point at inp end
.Lloop:
mov $Xi,sp
mov $t2,sp
sub $t2,#16*4 @ [3]
.LXload:
ldrb $a,[$t1,#0] @ $t1 is r1 and holds inp
ldrb $b,[$t1,#1]
ldrb $c,[$t1,#2]
ldrb $d,[$t1,#3]
lsl $a,#24
lsl $b,#16
lsl $c,#8
orr $a,$b
orr $a,$c
orr $a,$d
add $t1,#4
push {$a}
cmp sp,$t2
bne .LXload @ [+14*16]
mov $inp,$t1 @ update $inp
sub $t2,#32*4
sub $t2,#32*4
mov $e,#31 @ [+4]
.LXupdate:
ldr $a,[sp,#15*4]
ldr $b,[sp,#13*4]
ldr $c,[sp,#7*4]
ldr $d,[sp,#2*4]
eor $a,$b
eor $a,$c
eor $a,$d
ror $a,$e
push {$a}
cmp sp,$t2
bne .LXupdate @ [+(11+1)*64]
ldmia $t0!,{$a,$b,$c,$d,$e} @ $t0 is r0 and holds ctx
mov $t0,$Xi
ldr $t2,.LK_00_19
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+7+4]
.L_00_19:
___
&BODY_00_19();
$code.=<<___;
cmp $Xi,$t0
bne .L_00_19 @ [+(2+9+4+2+8+2)*20]
ldr $t2,.LK_20_39
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+5]
.L_20_39_or_60_79:
___
&BODY_20_39();
$code.=<<___;
cmp $Xi,$t0
bne .L_20_39_or_60_79 @ [+(2+9+3+2+8+2)*20*2]
cmp sp,$t0
beq .Ldone @ [+2]
ldr $t2,.LK_40_59
mov $t1,$t0
sub $t1,#20*4
mov $Xi,$t1
mov $K,$t2 @ [+5]
.L_40_59:
___
&BODY_40_59();
$code.=<<___;
cmp $Xi,$t0
bne .L_40_59 @ [+(2+9+6+2+8+2)*20]
ldr $t2,.LK_60_79
mov $Xi,sp
mov $K,$t2
b .L_20_39_or_60_79 @ [+4]
.Ldone:
mov $t0,$ctx
ldr $t1,[$t0,#0]
ldr $t2,[$t0,#4]
add $a,$t1
ldr $t1,[$t0,#8]
add $b,$t2
ldr $t2,[$t0,#12]
add $c,$t1
ldr $t1,[$t0,#16]
add $d,$t2
add $e,$t1
stmia $t0!,{$a,$b,$c,$d,$e} @ [+20]
add sp,#80*4 @ deallocate stack frame
mov $t0,$ctx @ restore ctx
mov $t1,$inp @ restore inp
cmp $t1,$len
beq .Lexit
b .Lloop @ [+6] total 3212 cycles
.Lexit:
pop {r2-r7}
mov r8,r2
mov r9,r3
mov r10,r4
mov r11,r5
mov r12,r6
mov lr,r7
pop {r4-r7}
bx lr
.align 2
___
$code.=".Lcommon:\n".&common()."\tmov pc,lr\n" if (!$inline);
$code.=".Lrotate:\n".&rotate()."\tmov pc,lr\n" if (!$inline);
$code.=<<___;
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1
.LK_40_59: .word 0x8f1bbcdc
.LK_60_79: .word 0xca62c1d6
.size sha1_block_data_order,.-sha1_block_data_order
.asciz "SHA1 block transform for Thumb, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT; # enforce flush

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,732 @@
#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).
# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
$len="r2"; $t1="r2";
$T1="r3"; $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
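# For reference, the rotation amounts tabulated above are those of the
# SHA-256 Sigma/sigma functions; a pure-Perl sketch in the usual FIPS-180
# notation (illustration only, assuming a Perl with 64-bit integers; these
# subs are added here and are not used by the module):
sub ror32  { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
sub Sigma0 { my $x=shift; ror32($x,2)^ror32($x,13)^ror32($x,22) }
sub Sigma1 { my $x=shift; ror32($x,6)^ror32($x,11)^ror32($x,25) }
sub sigma0 { my $x=shift; ror32($x,7)^ror32($x,18)^($x>>3) }
sub sigma1 { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10) }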
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
@ ldr $t1,[$inp],#4 @ $i
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
# ifndef __ARMEB__
rev $t1,$t1
# endif
#else
@ ldrb $t1,[$inp,#3] @ $i
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
ldrb $t2,[$inp,#2]
ldrb $t0,[$inp,#1]
orr $t1,$t1,$t2,lsl#8
ldrb $t2,[$inp],#4
orr $t1,$t1,$t0,lsl#16
# if $i==15
str $inp,[sp,#17*4] @ make room for $t4
# endif
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
orr $t1,$t1,$t2,lsl#24
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
add $h,$h,$t1 @ h+=X[i]
str $t1,[sp,#`$i%16`*4]
eor $t1,$f,$g
add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
and $t1,$t1,$e
add $h,$h,$t2 @ h+=K256[i]
eor $t1,$t1,$g @ Ch(e,f,g)
eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
add $h,$h,$t1 @ h+=Ch(e,f,g)
#if $i==31
and $t2,$t2,#0xff
cmp $t2,#0xf2 @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4 @ prefetch
# else
ldrb $t1,[$inp,#3]
# endif
eor $t2,$a,$b @ a^b, b^c in next round
#else
ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
eor $t2,$a,$b @ a^b, b^c in next round
ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
#endif
eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
and $t3,$t3,$t2 @ (b^c)&=(a^b)
add $d,$d,$h @ d+=h
eor $t3,$t3,$b @ Maj(a,b,c)
add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
@ add $h,$h,$t3 @ h+=Maj(a,b,c)
___
($t2,$t3)=($t3,$t2);
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
@ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t4,[sp,#`($i+14)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
mov $t2,$t4,ror#$sigma1[0]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t2,$t2,$t4,ror#$sigma1[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
ldr $t1,[sp,#`($i+0)%16`*4]
eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
ldr $t4,[sp,#`($i+9)%16`*4]
add $t2,$t2,$t0
eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
add $t1,$t1,$t2
eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
add $t1,$t1,$t4 @ X[i]
___
&BODY_00_15(@_);
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
.type K256,%object
.align 5
K256:
.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256,.-K256
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha256_block_data_order
#else
adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256+32 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
ldr $t1,[$inp],#4
# else
ldrb $t1,[$inp,#3]
# endif
eor $t3,$B,$C @ magic
eor $t2,$t2,$t2
___
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef __thumb2__
ite eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t3,[sp,#16*4] @ pull ctx
bne .Lrounds_16_xx
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t0,[$t3,#0]
ldr $t1,[$t3,#4]
ldr $t2,[$t3,#8]
add $A,$A,$t0
ldr $t0,[$t3,#12]
add $B,$B,$t1
ldr $t1,[$t3,#16]
add $C,$C,$t2
ldr $t2,[$t3,#20]
add $D,$D,$t0
ldr $t0,[$t3,#24]
add $E,$E,$t1
ldr $t1,[$t3,#28]
add $F,$F,$t2
ldr $inp,[sp,#17*4] @ pull inp
ldr $t2,[sp,#18*4] @ pull inp+len
add $G,$G,$t0
add $H,$H,$t1
stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
cmp $inp,$t2
sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
&vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
&vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&veor ($T5,$T5,$T4); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
while($#insns>=2) { eval(shift(@insns)); }
&vst1_32 ("{$T0}","[$Xfer,:128]!");
eval(shift(@insns));
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vld1_32 ("{$T0}","[$Ktbl,:128]!");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vrev32_8 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vadd_i32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&vst1_32 ("{$T0}","[$Xfer,:128]!");
push(@X,shift(@X)); # "rotate" X[]
}
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&eor ($t1,$f,$g)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&and ($t1,$t1,$e)',
'&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&eor ($t1,$t1,$g)', # Ch(e,f,g)
'&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&ldr ($t1,"[sp,#64]") if ($j==31)',
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&add ($d,$d,$h)', # d+=h
'&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha256_block_data_order_neon
.type sha256_block_data_order_neon,%function
.align 5
.skip 16
sha256_block_data_order_neon:
.LNEON:
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
vld1.8 {@X[0]},[$inp]!
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
vld1.32 {$T0},[$Ktbl,:128]!
vld1.32 {$T1},[$Ktbl,:128]!
vld1.32 {$T2},[$Ktbl,:128]!
vld1.32 {$T3},[$Ktbl,:128]!
vrev32.8 @X[0],@X[0] @ yes, even on
str $ctx,[sp,#64]
vrev32.8 @X[1],@X[1] @ big-endian
str $inp,[sp,#68]
mov $Xfer,sp
vrev32.8 @X[2],@X[2]
str $len,[sp,#72]
vrev32.8 @X[3],@X[3]
str $t2,[sp,#76] @ save original sp
vadd.i32 $T0,$T0,@X[0]
vadd.i32 $T1,$T1,@X[1]
vst1.32 {$T0},[$Xfer,:128]!
vadd.i32 $T2,$T2,@X[2]
vst1.32 {$T1},[$Xfer,:128]!
vadd.i32 $T3,$T3,@X[3]
vst1.32 {$T2},[$Xfer,:128]!
vst1.32 {$T3},[$Xfer,:128]!
ldmia $ctx,{$A-$H}
sub $Xfer,$Xfer,#64
ldr $t1,[sp,#0]
eor $t2,$t2,$t2
eor $t3,$B,$C
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
teq $t1,#0 @ check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
ldr $inp,[sp,#68]
ldr $t0,[sp,#72]
sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
teq $inp,$t0
it eq
subeq $inp,$inp,#64 @ avoid SEGV
vld1.8 {@X[0]},[$inp]! @ load next input block
vld1.8 {@X[1]},[$inp]!
vld1.8 {@X[2]},[$inp]!
vld1.8 {@X[3]},[$inp]!
it ne
strne $inp,[sp,#68]
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
ldr $t0,[$t1,#0]
add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
ldr $t2,[$t1,#4]
ldr $t3,[$t1,#8]
ldr $t4,[$t1,#12]
add $A,$A,$t0 @ accumulate
ldr $t0,[$t1,#16]
add $B,$B,$t2
ldr $t2,[$t1,#20]
add $C,$C,$t3
ldr $t3,[$t1,#24]
add $D,$D,$t4
ldr $t4,[$t1,#28]
add $E,$E,$t0
str $A,[$t1],#4
add $F,$F,$t2
str $B,[$t1],#4
add $G,$G,$t3
str $C,[$t1],#4
add $H,$H,$t4
str $D,[$t1],#4
stmia $t1,{$E-$H}
ittte ne
movne $Xfer,sp
ldrne $t1,[sp,#0]
eorne $t2,$t2,$t2
ldreq sp,[sp,#76] @ restore original sp
itt ne
eorne $t3,$B,$C
bne .L_00_48
ldmia sp!,{r4-r12,pc}
.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
.align 5
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
sub $Ktbl,$Ktbl,#256+32
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
b .Loop_v8
.align 4
.Loop_v8:
vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
vld1.32 {$W0},[$Ktbl]!
vrev32.8 @MSG[0],@MSG[0]
vrev32.8 @MSG[1],@MSG[1]
vrev32.8 @MSG[2],@MSG[2]
vrev32.8 @MSG[3],@MSG[3]
vmov $ABCD_SAVE,$ABCD @ offload
vmov $EFGH_SAVE,$EFGH
teq $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
vld1.32 {$W1},[$Ktbl]!
vadd.i32 $W0,$W0,@MSG[0]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vld1.32 {$W0},[$Ktbl]!
vadd.i32 $W1,$W1,@MSG[1]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vld1.32 {$W1},[$Ktbl]
vadd.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#256-16 @ rewind
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
vadd.i32 $W1,$W1,@MSG[3]
vmov $abcd,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
it ne
bne .Loop_v8
vst1.32 {$ABCD,$EFGH},[$ctx]
ret @ bx lr
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
{ my %opcode = (
"sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
"sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|(($2&7)<<17)|(($2&8)<<4)
|(($3&7)<<1) |(($3&8)<<2);
# ARMv7 instructions are always encoded little-endian, hence the
# byte-by-byte emission below. The correct solution is to use the
# .inst directive, but older assemblers don't implement it:-(
sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
}
}
}
foreach (split($/,$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush

@ -0,0 +1,320 @@
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own well-being,
# zero it upon entry.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
$K256="A3";
($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
=map("B$_",(16..31));
($Xia,$Xib)=("A5","B5"); # circular/ring buffer
$CTXB=$t2e;
($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.nocmp
.asg sha256_block_data_order,_sha256_block_data_order
.endif
.asg B3,RA
.asg A15,FP
.asg B15,SP
.if .BIG_ENDIAN
.asg SWAP2,MV
.asg SWAP4,MV
.endif
.global _sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
.asmfunc stack_usage(64)
MV $NUM,A0 ; reassign $NUM
|| MVK -64,B0
[!A0] BNOP RA ; if ($NUM==0) return;
|| [A0] STW FP,*SP--[16] ; save frame pointer and alloca(64)
|| [A0] MV SP,FP
[A0] ADDKPC __sha256_block,B2
|| [A0] AND B0,SP,SP ; align stack at 64 bytes
.if __TI_EABI__
[A0] MVK 0x00404,B1
|| [A0] MVKL \$PCR_OFFSET(K256,__sha256_block),$K256
[A0] MVKH 0x50000,B1
|| [A0] MVKH \$PCR_OFFSET(K256,__sha256_block),$K256
.else
[A0] MVK 0x00404,B1
|| [A0] MVKL (K256-__sha256_block),$K256
[A0] MVKH 0x50000,B1
|| [A0] MVKH (K256-__sha256_block),$K256
.endif
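; Note: B1 ends up as 0x00050404, i.e. BK0=5 (block size 2^(5+1)=64 bytes)
; with A5 and B5 switched to circular addressing, which implements the
; 16-word ring buffer holding the message schedule.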
[A0] MVC B1,AMR ; setup circular addressing
|| [A0] MV SP,$Xia
[A0] MV SP,$Xib
|| [A0] ADD B2,$K256,$K256
|| [A0] MV $CTXA,$CTXB
|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
LDW *${CTXA}[0],$A ; load ctx
|| LDW *${CTXB}[4],$E
LDW *${CTXA}[1],$B
|| LDW *${CTXB}[5],$F
LDW *${CTXA}[2],$C
|| LDW *${CTXB}[6],$G
LDW *${CTXA}[3],$D
|| LDW *${CTXB}[7],$H
LDNW *$INP++,$Xn ; pre-fetch input
LDW *$K256++,$K ; pre-fetch K256[0]
MVK 14,B0 ; loop counters
MVK 47,B1
|| ADDAW $Xia,9,$Xia
outerloop?:
SUB A0,1,A0
|| MV $A,$Actx
|| MV $E,$Ectx
|| MVD $B,$Bctx
|| MVD $F,$Fctx
MV $C,$Cctx
|| MV $G,$Gctx
|| MVD $D,$Dctx
|| MVD $H,$Hctx
|| SWAP4 $Xn,$X0
SPLOOPD 8 ; BODY_00_14
|| MVC B0,ILC
|| SWAP2 $X0,$X0
LDNW *$INP++,$Xn
|| ROTL $A,30,$S0
|| OR $A,$B,$Maj
|| AND $A,$B,$t2a
|| ROTL $E,26,$S1
|| AND $F,$E,$Ch
|| ANDN $G,$E,$t2e
ROTL $A,19,$t0a
|| AND $C,$Maj,$Maj
|| ROTL $E,21,$t0e
|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
ROTL $A,10,$t1a
|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
|| ROTL $E,7,$t1e
|| ADD $K,$H,$T1 ; T1 = h + K256[i]
ADD $X0,$T1,$T1 ; T1 += X[i];
|| STW $X0,*$Xib++
|| XOR $t0a,$S0,$S0
|| XOR $t0e,$S1,$S1
XOR $t1a,$S0,$S0 ; Sigma0(a)
|| XOR $t1e,$S1,$S1 ; Sigma1(e)
|| LDW *$K256++,$K ; pre-fetch K256[i+1]
|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
|| ROTL $G,0,$H ; h = g
|| MV $F,$G ; g = f
|| MV $X0,$X14
|| SWAP4 $Xn,$X0
SWAP2 $X0,$X0
|| MV $E,$F ; f = e
|| ADD $D,$T1,$E ; e = d + T1
|| MV $C,$D ; d = c
MV $B,$C ; c = b
|| MV $A,$B ; b = a
|| ADD $T1,$T2,$A ; a = T1 + T2
SPKERNEL
ROTL $A,30,$S0 ; BODY_15
|| OR $A,$B,$Maj
|| AND $A,$B,$t2a
|| ROTL $E,26,$S1
|| AND $F,$E,$Ch
|| ANDN $G,$E,$t2e
|| LDW *${Xib}[1],$Xn ; modulo-scheduled
ROTL $A,19,$t0a
|| AND $C,$Maj,$Maj
|| ROTL $E,21,$t0e
|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
|| LDW *${Xib}[2],$X1 ; modulo-scheduled
ROTL $A,10,$t1a
|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
|| ROTL $E,7,$t1e
|| ADD $K,$H,$T1 ; T1 = h + K256[i]
ADD $X0,$T1,$T1 ; T1 += X[i];
|| STW $X0,*$Xib++
|| XOR $t0a,$S0,$S0
|| XOR $t0e,$S1,$S1
XOR $t1a,$S0,$S0 ; Sigma0(a)
|| XOR $t1e,$S1,$S1 ; Sigma1(e)
|| LDW *$K256++,$K ; pre-fetch K256[i+1]
|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
|| ROTL $G,0,$H ; h = g
|| MV $F,$G ; g = f
|| MV $X0,$X15
MV $E,$F ; f = e
|| ADD $D,$T1,$E ; e = d + T1
|| MV $C,$D ; d = c
|| MV $Xn,$X0 ; modulo-scheduled
|| LDW *$Xia,$X9 ; modulo-scheduled
|| ROTL $X1,25,$t0e ; modulo-scheduled
|| ROTL $X14,15,$t0a ; modulo-scheduled
SHRU $X1,3,$s0 ; modulo-scheduled
|| SHRU $X14,10,$s1 ; modulo-scheduled
|| ROTL $B,0,$C ; c = b
|| MV $A,$B ; b = a
|| ADD $T1,$T2,$A ; a = T1 + T2
SPLOOPD 10 ; BODY_16_63
|| MVC B1,ILC
|| ROTL $X1,14,$t1e ; modulo-scheduled
|| ROTL $X14,13,$t1a ; modulo-scheduled
XOR $t0e,$s0,$s0
|| XOR $t0a,$s1,$s1
|| MV $X15,$X14
|| MV $X1,$Xn
XOR $t1e,$s0,$s0 ; sigma0(X[i+1])
|| XOR $t1a,$s1,$s1 ; sigma1(X[i+14])
|| LDW *${Xib}[2],$X1 ; modulo-scheduled
ROTL $A,30,$S0
|| OR $A,$B,$Maj
|| AND $A,$B,$t2a
|| ROTL $E,26,$S1
|| AND $F,$E,$Ch
|| ANDN $G,$E,$t2e
|| ADD $X9,$X0,$X0 ; X[i] += X[i+9]
ROTL $A,19,$t0a
|| AND $C,$Maj,$Maj
|| ROTL $E,21,$t0e
|| XOR $t2e,$Ch,$Ch ; Ch(e,f,g) = (e&f)^(~e&g)
|| ADD $s0,$X0,$X0 ; X[i] += sigma0(X[i+1])
ROTL $A,10,$t1a
|| OR $t2a,$Maj,$Maj ; Maj(a,b,c) = ((a|b)&c)|(a&b)
|| ROTL $E,7,$t1e
|| ADD $H,$K,$T1 ; T1 = h + K256[i]
|| ADD $s1,$X0,$X0 ; X[i] += sigma1(X[i+14])
XOR $t0a,$S0,$S0
|| XOR $t0e,$S1,$S1
|| ADD $X0,$T1,$T1 ; T1 += X[i]
|| STW $X0,*$Xib++
XOR $t1a,$S0,$S0 ; Sigma0(a)
|| XOR $t1e,$S1,$S1 ; Sigma1(e)
|| ADD $Ch,$T1,$T1 ; T1 += Ch(e,f,g)
|| MV $X0,$X15
|| ROTL $G,0,$H ; h = g
|| LDW *$K256++,$K ; pre-fetch K256[i+1]
ADD $S1,$T1,$T1 ; T1 += Sigma1(e)
|| ADD $S0,$Maj,$T2 ; T2 = Sigma0(a) + Maj(a,b,c)
|| MV $F,$G ; g = f
|| MV $Xn,$X0 ; modulo-scheduled
|| LDW *++$Xia,$X9 ; modulo-scheduled
|| ROTL $X1,25,$t0e ; modulo-scheduled
|| ROTL $X14,15,$t0a ; modulo-scheduled
ROTL $X1,14,$t1e ; modulo-scheduled
|| ROTL $X14,13,$t1a ; modulo-scheduled
|| MV $E,$F ; f = e
|| ADD $D,$T1,$E ; e = d + T1
|| MV $C,$D ; d = c
|| MV $B,$C ; c = b
MV $A,$B ; b = a
|| ADD $T1,$T2,$A ; a = T1 + T2
|| SHRU $X1,3,$s0 ; modulo-scheduled
|| SHRU $X14,10,$s1 ; modulo-scheduled
SPKERNEL
[A0] B outerloop?
|| [A0] LDNW *$INP++,$Xn ; pre-fetch input
|| [A0] ADDK -260,$K256 ; rewind K256
|| ADD $Actx,$A,$A ; accumulate ctx
|| ADD $Ectx,$E,$E
|| ADD $Bctx,$B,$B
ADD $Fctx,$F,$F
|| ADD $Cctx,$C,$C
|| ADD $Gctx,$G,$G
|| ADD $Dctx,$D,$D
|| ADD $Hctx,$H,$H
|| [A0] LDW *$K256++,$K ; pre-fetch K256[0]
[!A0] BNOP RA
||[!A0] MV $CTXA,$CTXB
[!A0] MV FP,SP ; restore stack pointer
||[!A0] LDW *FP[0],FP ; restore frame pointer
[!A0] STW $A,*${CTXA}[0] ; save ctx
||[!A0] STW $E,*${CTXB}[4]
||[!A0] MVK 0,B0
[!A0] STW $B,*${CTXA}[1]
||[!A0] STW $F,*${CTXB}[5]
||[!A0] MVC B0,AMR ; clear AMR
STW $C,*${CTXA}[2]
|| STW $G,*${CTXB}[6]
STW $D,*${CTXA}[3]
|| STW $H,*${CTXB}[7]
.endasmfunc
.if __TI_EABI__
.sect ".text:sha_asm.const"
.else
.sect ".const:sha_asm"
.endif
.align 128
K256:
.uword 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.uword 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.uword 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.uword 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.uword 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.uword 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.uword 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.uword 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.uword 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.uword 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.uword 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.uword 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.uword 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.uword 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.uword 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.uword 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
print $code;
close STDOUT;

File diff suppressed because it is too large
@ -0,0 +1,925 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA512 block transform for x86. September 2007.
#
# May 2013.
#
# Add SSSE3 code path, 20-25% improvement [over original SSE2 code].
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm SIMD(*) x86_64(**)
# Pentium 100 97 61 - -
# PIII 75 77 56 - -
# P4 116 95 82 34.6 30.8
# AMD K8 54 55 36 20.7 9.57
# Core2 66 57 40 15.9 9.97
# Westmere 70 - 38 12.2 9.58
# Sandy Bridge 58 - 35 11.9 11.2
# Ivy Bridge 50 - 33 11.5 8.17
# Haswell 46 - 29 11.3 7.66
# Skylake 40 - 26 13.3 7.25
# Bulldozer 121 - 50 14.0 13.5
# VIA Nano 91 - 52 33 14.7
# Atom 126 - 68 48(***) 14.7
# Silvermont 97 - 58 42(***) 17.5
# Goldmont 80 - 48 19.5 12.0
#
# (*) whichever best applicable.
# (**) x86_64 assembler performance is presented for reference
# purposes, the results are for integer-only code.
# (***) paddq is incredibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
# while on PIII - ~35%. On newer µ-archs improvement varies from 15%
# to 50%, but it's less important as they are expected to execute SSE2
# code-path, which is commonly ~2-3x faster [than compiler generated
# code]. SSE2 code-path is as fast as original sha512-sse2.pl, even
# though it does not use 128-bit operations. The latter means that
# SSE2-aware kernel is no longer required to execute the code. Another
# difference is that new code optimizes amount of writes, but at the
# cost of increased data cache "footprint" by 1/2KB.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output=pop;
open STDOUT,">$output";
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$Tlo=&DWP(0,"esp"); $Thi=&DWP(4,"esp");
$Alo=&DWP(8,"esp"); $Ahi=&DWP(8+4,"esp");
$Blo=&DWP(16,"esp"); $Bhi=&DWP(16+4,"esp");
$Clo=&DWP(24,"esp"); $Chi=&DWP(24+4,"esp");
$Dlo=&DWP(32,"esp"); $Dhi=&DWP(32+4,"esp");
$Elo=&DWP(40,"esp"); $Ehi=&DWP(40+4,"esp");
$Flo=&DWP(48,"esp"); $Fhi=&DWP(48+4,"esp");
$Glo=&DWP(56,"esp"); $Ghi=&DWP(56+4,"esp");
$Hlo=&DWP(64,"esp"); $Hhi=&DWP(64+4,"esp");
$K512="ebp";
$Asse2=&QWP(0,"esp");
$Bsse2=&QWP(8,"esp");
$Csse2=&QWP(16,"esp");
$Dsse2=&QWP(24,"esp");
$Esse2=&QWP(32,"esp");
$Fsse2=&QWP(40,"esp");
$Gsse2=&QWP(48,"esp");
$Hsse2=&QWP(56,"esp");
$A="mm0"; # B-D and
$E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on on-demand basis...
$BxC="mm2"; # ... except for B^C
sub BODY_00_15_sse2 {
my $phase=shift;
#&movq ("mm5",$Fsse2); # load f
#&movq ("mm6",$Gsse2); # load g
&movq ("mm1",$E); # %mm1 is sliding right
&pxor ("mm5","mm6"); # f^=g
&psrlq ("mm1",14);
&movq ($Esse2,$E); # modulo-scheduled save e
&pand ("mm5",$E); # f&=e
&psllq ($E,23); # $E is sliding left
&movq ($A,"mm3") if ($phase<2);
&movq (&QWP(8*9,"esp"),"mm7"); # save X[i]
&movq ("mm3","mm1"); # %mm3 is T1
&psrlq ("mm1",4);
&pxor ("mm5","mm6"); # Ch(e,f,g)
&pxor ("mm3",$E);
&psllq ($E,23);
&pxor ("mm3","mm1");
&movq ($Asse2,$A); # modulo-scheduled save a
&paddq ("mm7","mm5"); # X[i]+=Ch(e,f,g)
&pxor ("mm3",$E);
&psrlq ("mm1",23);
&paddq ("mm7",$Hsse2); # X[i]+=h
&pxor ("mm3","mm1");
&psllq ($E,4);
&paddq ("mm7",QWP(0,$K512)); # X[i]+=K512[i]
&pxor ("mm3",$E); # T1=Sigma1_512(e)
&movq ($E,$Dsse2); # e = load d, e in next round
&paddq ("mm3","mm7"); # T1+=X[i]
&movq ("mm5",$A); # %mm5 is sliding right
&psrlq ("mm5",28);
&paddq ($E,"mm3"); # d += T1
&movq ("mm6",$A); # %mm6 is sliding left
&movq ("mm7","mm5");
&psllq ("mm6",25);
&movq ("mm1",$Bsse2); # load b
&psrlq ("mm5",6);
&pxor ("mm7","mm6");
&sub ("esp",8);
&psllq ("mm6",5);
&pxor ("mm7","mm5");
&pxor ($A,"mm1"); # a^b, b^c in next round
&psrlq ("mm5",5);
&pxor ("mm7","mm6");
&pand ($BxC,$A); # (b^c)&(a^b)
&psllq ("mm6",6);
&pxor ("mm7","mm5");
&pxor ($BxC,"mm1"); # [h=]Maj(a,b,c)
&pxor ("mm6","mm7"); # Sigma0_512(a)
&movq ("mm7",&QWP(8*(9+16-1),"esp")) if ($phase!=0); # pre-fetch
&movq ("mm5",$Fsse2) if ($phase==0); # load f
if ($phase>1) {
&paddq ($BxC,"mm6"); # h+=Sigma0(a)
&add ($K512,8);
#&paddq ($BxC,"mm3"); # h+=T1
($A,$BxC) = ($BxC,$A); # rotate registers
} else {
&paddq ("mm3",$BxC); # T1+=Maj(a,b,c)
&movq ($BxC,$A);
&add ($K512,8);
&paddq ("mm3","mm6"); # T1+=Sigma0(a)
&movq ("mm6",$Gsse2) if ($phase==0); # load g
#&movq ($A,"mm3"); # h=T1
}
}
sub BODY_00_15_x86 {
#define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
# LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
# HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
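# Worked example of the decomposition above: for a 64-bit x kept as 32-bit
# halves (hi:lo), ROTR(x,n) with n<32 gives
#	result.lo = (lo>>n)|(hi<<(32-n)) and result.hi = (hi>>n)|(lo<<(32-n)),
# while for n>=32 the halves swap roles, e.g. ROTR(x,41)=ROTR(x,32+9) gives
#	result.lo = (hi>>9)|(lo<<23) and result.hi = (lo>>9)|(hi<<23),
# which is exactly what the LO/HI terms above spell out using 32-bit
# operations only.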
&mov ("ecx",$Elo);
&mov ("edx",$Ehi);
&mov ("esi","ecx");
&shr ("ecx",9); # lo>>9
&mov ("edi","edx");
&shr ("edx",9); # hi>>9
&mov ("ebx","ecx");
&shl ("esi",14); # lo<<14
&mov ("eax","edx");
&shl ("edi",14); # hi<<14
&xor ("ebx","esi");
&shr ("ecx",14-9); # lo>>14
&xor ("eax","edi");
&shr ("edx",14-9); # hi>>14
&xor ("eax","ecx");
&shl ("esi",18-14); # lo<<18
&xor ("ebx","edx");
&shl ("edi",18-14); # hi<<18
&xor ("ebx","esi");
&shr ("ecx",18-14); # lo>>18
&xor ("eax","edi");
&shr ("edx",18-14); # hi>>18
&xor ("eax","ecx");
&shl ("esi",23-18); # lo<<23
&xor ("ebx","edx");
&shl ("edi",23-18); # hi<<23
&xor ("eax","esi");
&xor ("ebx","edi"); # T1 = Sigma1(e)
&mov ("ecx",$Flo);
&mov ("edx",$Fhi);
&mov ("esi",$Glo);
&mov ("edi",$Ghi);
&add ("eax",$Hlo);
&adc ("ebx",$Hhi); # T1 += h
&xor ("ecx","esi");
&xor ("edx","edi");
&and ("ecx",$Elo);
&and ("edx",$Ehi);
&add ("eax",&DWP(8*(9+15)+0,"esp"));
&adc ("ebx",&DWP(8*(9+15)+4,"esp")); # T1 += X[0]
&xor ("ecx","esi");
&xor ("edx","edi"); # Ch(e,f,g) = (f^g)&e)^g
&mov ("esi",&DWP(0,$K512));
&mov ("edi",&DWP(4,$K512)); # K[i]
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Ch(e,f,g)
&mov ("ecx",$Dlo);
&mov ("edx",$Dhi);
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += K[i]
&mov ($Tlo,"eax");
&mov ($Thi,"ebx"); # put T1 away
&add ("eax","ecx");
&adc ("ebx","edx"); # d += T1
#define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
# LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
# HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ($Dlo,"eax");
&mov ($Dhi,"ebx");
&mov ("esi","ecx");
&shr ("ecx",2); # lo>>2
&mov ("edi","edx");
&shr ("edx",2); # hi>>2
&mov ("ebx","ecx");
&shl ("esi",4); # lo<<4
&mov ("eax","edx");
&shl ("edi",4); # hi<<4
&xor ("ebx","esi");
&shr ("ecx",7-2); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-2); # hi>>7
&xor ("ebx","ecx");
&shl ("esi",25-4); # lo<<25
&xor ("eax","edx");
&shl ("edi",25-4); # hi<<25
&xor ("eax","esi");
&shr ("ecx",28-7); # lo>>28
&xor ("ebx","edi");
&shr ("edx",28-7); # hi>>28
&xor ("eax","ecx");
&shl ("esi",30-25); # lo<<30
&xor ("ebx","edx");
&shl ("edi",30-25); # hi<<30
&xor ("eax","esi");
&xor ("ebx","edi"); # Sigma0(a)
&mov ("ecx",$Alo);
&mov ("edx",$Ahi);
&mov ("esi",$Blo);
&mov ("edi",$Bhi);
&add ("eax",$Tlo);
&adc ("ebx",$Thi); # T1 = Sigma0(a)+T1
&or ("ecx","esi");
&or ("edx","edi");
&and ("ecx",$Clo);
&and ("edx",$Chi);
&and ("esi",$Alo);
&and ("edi",$Ahi);
&or ("ecx","esi");
&or ("edx","edi"); # Maj(a,b,c) = ((a|b)&c)|(a&b)
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += Maj(a,b,c)
&mov ($Tlo,"eax");
&mov ($Thi,"ebx");
&mov (&LB("edx"),&BP(0,$K512)); # pre-fetch LSB of *K
&sub ("esp",8);
&lea ($K512,&DWP(8,$K512)); # K++
}
&function_begin("sha512_block_data_order");
&mov ("esi",wparam(0)); # ctx
&mov ("edi",wparam(1)); # inp
&mov ("eax",wparam(2)); # num
&mov ("ebx","esp"); # saved sp
&call (&label("pic_point")); # make it PIC!
&set_label("pic_point");
&blindpop($K512);
&lea ($K512,&DWP(&label("K512")."-".&label("pic_point"),$K512));
&sub ("esp",16);
&and ("esp",-64);
&shl ("eax",7);
&add ("eax","edi");
&mov (&DWP(0,"esp"),"esi"); # ctx
&mov (&DWP(4,"esp"),"edi"); # inp
&mov (&DWP(8,"esp"),"eax"); # inp+num*128
&mov (&DWP(12,"esp"),"ebx"); # saved sp
if ($sse2) {
&picmeup("edx","OPENSSL_ia32cap_P",$K512,&label("K512"));
&mov ("ecx",&DWP(0,"edx"));
&test ("ecx",1<<26);
&jz (&label("loop_x86"));
&mov ("edx",&DWP(4,"edx"));
# load ctx->h[0-7]
&movq ($A,&QWP(0,"esi"));
&and ("ecx",1<<24); # XMM registers availability
&movq ("mm1",&QWP(8,"esi"));
&and ("edx",1<<9); # SSSE3 bit
&movq ($BxC,&QWP(16,"esi"));
&or ("ecx","edx");
&movq ("mm3",&QWP(24,"esi"));
&movq ($E,&QWP(32,"esi"));
&movq ("mm5",&QWP(40,"esi"));
&movq ("mm6",&QWP(48,"esi"));
&movq ("mm7",&QWP(56,"esi"));
&cmp ("ecx",1<<24|1<<9);
&je (&label("SSSE3"));
&sub ("esp",8*10);
&jmp (&label("loop_sse2"));
&set_label("loop_sse2",16);
#&movq ($Asse2,$A);
&movq ($Bsse2,"mm1");
&movq ($Csse2,$BxC);
&movq ($Dsse2,"mm3");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&movq ($Gsse2,"mm6");
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&movq ("mm3",$A); # magic
&mov ("eax",&DWP(0,"edi"));
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&mov ("edx",15); # counter
&bswap ("eax");
&bswap ("ebx");
&jmp (&label("00_14_sse2"));
&set_label("00_14_sse2",16);
&movd ("mm1","eax");
&mov ("eax",&DWP(0,"edi"));
&movd ("mm7","ebx");
&mov ("ebx",&DWP(4,"edi"));
&add ("edi",8);
&bswap ("eax");
&bswap ("ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2();
&dec ("edx");
&jnz (&label("00_14_sse2"));
&movd ("mm1","eax");
&movd ("mm7","ebx");
&punpckldq("mm7","mm1");
&BODY_00_15_sse2(1);
&pxor ($A,$A); # A is in %mm3
&mov ("edx",32); # counter
&jmp (&label("16_79_sse2"));
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);
&movq ("mm6","mm5");
&psrlq ("mm5",6);
&psllq ("mm1",56);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm3","mm7");
&psrlq ("mm7",7-1);
&pxor ("mm3","mm1");
&psllq ("mm1",63-56);
&pxor ("mm3","mm7");
&psrlq ("mm7",8-7);
&pxor ("mm3","mm1");
&movq ("mm1","mm5");
&psrlq ("mm5",19-6);
&pxor ("mm7","mm3"); # sigma0
&psllq ("mm6",3);
&pxor ("mm1","mm5");
&paddq ("mm7",&QWP(8*(9+16),"esp"));
&pxor ("mm1","mm6");
&psrlq ("mm5",61-19);
&paddq ("mm7",&QWP(8*(9+16-9),"esp"));
&pxor ("mm1","mm5");
&psllq ("mm6",45-3);
&movq ("mm5",$Fsse2); # load f
&pxor ("mm1","mm6"); # sigma1
&movq ("mm6",$Gsse2); # load g
&paddq ("mm7","mm1"); # X[i]
#&movq (&QWP(8*9,"esp"),"mm7"); # moved to BODY_00_15
&BODY_00_15_sse2(2);
}
&dec ("edx");
&jnz (&label("16_79_sse2"));
#&movq ($A,$Asse2);
&paddq ($A,"mm3"); # from BODY_00_15
&movq ("mm1",$Bsse2);
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
&movq ("mm5",$Fsse2);
&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&mov ("eax",8*80);
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&lea ("esp",&DWP(0,"esp","eax")); # destroy frame
&sub ($K512,"eax"); # rewind K
&cmp ("edi",&DWP(8*10+8,"esp")); # are we done yet?
&jb (&label("loop_sse2"));
&mov ("esp",&DWP(8*10+12,"esp")); # restore sp
&emms ();
&function_end_A();
&set_label("SSSE3",32);
{ my ($cnt,$frame)=("ecx","edx");
my @X=map("xmm$_",(0..7));
my $j;
my $i=0;
&lea ($frame,&DWP(-64,"esp"));
&sub ("esp",256);
# fixed stack frame layout
#
# +0 A B C D E F G H # backing store
# +64 X[0]+K[i] .. X[15]+K[i] # XMM->MM xfer area
# +192 # XMM off-load ring buffer
# +256 # saved parameters
&movdqa (@X[1],&QWP(80*8,$K512)); # byte swap mask
&movdqu (@X[0],&QWP(0,"edi"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) {
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"edi")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]); # xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&jmp (&label("loop_ssse3"));
&nop ();
&set_label("loop_ssse3",32);
&movdqa (@X[2],&QWP(16*(($j+1)%4),$frame)); # pre-restore @X[1]
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]); # off-load @X[3]
&lea ($K512,&DWP(16*8,$K512));
#&movq ($Asse2,$A); # off-load A-H
&movq ($Bsse2,"mm1");
&mov ("ebx","edi");
&movq ($Csse2,$BxC);
&lea ("edi",&DWP(128,"edi")); # advance input
&movq ($Dsse2,"mm3");
&cmp ("edi","eax");
#&movq ($Esse2,$E);
&movq ($Fsse2,"mm5");
&cmovb ("ebx","edi");
&movq ($Gsse2,"mm6");
&mov ("ecx",4); # loop counter
&pxor ($BxC,"mm1"); # magic
&movq ($Hsse2,"mm7");
&pxor ("mm3","mm3"); # magic
&jmp (&label("00_47_ssse3"));
sub BODY_00_15_ssse3 { # "phase-less" copy of BODY_00_15_sse2
(
'&movq ("mm1",$E)', # %mm1 is sliding right
'&movq ("mm7",&QWP(((-8*$i)%128)-128,$frame))',# X[i]+K[i]
'&pxor ("mm5","mm6")', # f^=g
'&psrlq ("mm1",14)',
'&movq (&QWP(8*($i+4)%64,"esp"),$E)', # modulo-scheduled save e
'&pand ("mm5",$E)', # f&=e
'&psllq ($E,23)', # $E is sliding left
'&paddq ($A,"mm3")', # [h+=Maj(a,b,c)]
'&movq ("mm3","mm1")', # %mm3 is T1
'&psrlq("mm1",4)',
'&pxor ("mm5","mm6")', # Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psllq($E,23)',
'&pxor ("mm3","mm1")',
'&movq (&QWP(8*$i%64,"esp"),$A)', # modulo-scheduled save a
'&paddq("mm7","mm5")', # X[i]+=Ch(e,f,g)
'&pxor ("mm3",$E)',
'&psrlq("mm1",23)',
'&paddq("mm7",&QWP(8*($i+7)%64,"esp"))', # X[i]+=h
'&pxor ("mm3","mm1")',
'&psllq($E,4)',
'&pxor ("mm3",$E)', # T1=Sigma1_512(e)
'&movq ($E,&QWP(8*($i+3)%64,"esp"))', # e = load d, e in next round
'&paddq ("mm3","mm7")', # T1+=X[i]
'&movq ("mm5",$A)', # %mm5 is sliding right
'&psrlq("mm5",28)',
'&paddq ($E,"mm3")', # d += T1
'&movq ("mm6",$A)', # %mm6 is sliding left
'&movq ("mm7","mm5")',
'&psllq("mm6",25)',
'&movq ("mm1",&QWP(8*($i+1)%64,"esp"))', # load b
'&psrlq("mm5",6)',
'&pxor ("mm7","mm6")',
'&psllq("mm6",5)',
'&pxor ("mm7","mm5")',
'&pxor ($A,"mm1")', # a^b, b^c in next round
'&psrlq("mm5",5)',
'&pxor ("mm7","mm6")',
'&pand ($BxC,$A)', # (b^c)&(a^b)
'&psllq("mm6",6)',
'&pxor ("mm7","mm5")',
'&pxor ($BxC,"mm1")', # [h=]Maj(a,b,c)
'&pxor ("mm6","mm7")', # Sigma0_512(a)
'&movq ("mm5",&QWP(8*($i+5-1)%64,"esp"))', # pre-load f
'&paddq ($BxC,"mm6")', # h+=Sigma0(a)
'&movq ("mm6",&QWP(8*($i+6-1)%64,"esp"))', # pre-load g
'($A,$BxC) = ($BxC,$A); $i--;'
);
}
&set_label("00_47_ssse3",32);
for(;$j<16;$j++) {
my ($t0,$t2,$t1)=@X[2..4];
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa ($t2,@X[5]);
&movdqa (@X[1],$t0); # restore @X[1]
&palignr ($t0,@X[0],8); # X[1..2]
&movdqa (&QWP(16*($j%4),$frame),@X[4]); # off-load @X[4]
&palignr ($t2,@X[4],8); # X[9..10]
&movdqa ($t1,$t0);
&psrlq ($t0,7);
&paddq (@X[0],$t2); # X[0..1] += X[9..10]
&movdqa ($t2,$t1);
&psrlq ($t1,1);
&psllq ($t2,64-8);
&pxor ($t0,$t1);
&psrlq ($t1,8-1);
&pxor ($t0,$t2);
&psllq ($t2,8-1);
&pxor ($t0,$t1);
&movdqa ($t1,@X[7]);
&pxor ($t0,$t2); # sigma0(X[1..2])
&movdqa ($t2,@X[7]);
&psrlq ($t1,6);
&paddq (@X[0],$t0); # X[0..1] += sigma0(X[1..2])
&movdqa ($t0,@X[7]);
&psrlq ($t2,19);
&psllq ($t0,64-61);
&pxor ($t1,$t2);
&psrlq ($t2,61-19);
&pxor ($t1,$t0);
&psllq ($t0,61-19);
&pxor ($t1,$t2);
&movdqa ($t2,&QWP(16*(($j+2)%4),$frame));# pre-restore @X[1]
&pxor ($t1,$t0); # sigma1(X[14..15])
&movdqa ($t0,&QWP(16*($j%8),$K512));
eval(shift(@insns));
&paddq (@X[0],$t1); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&paddq ($t0,@X[0]);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),$t0);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
&lea ($K512,&DWP(16*8,$K512));
&dec ("ecx");
&jnz (&label("00_47_ssse3"));
&movdqa (@X[1],&QWP(0,$K512)); # byte swap mask
&lea ($K512,&DWP(-80*8,$K512)); # rewind
&movdqu (@X[0],&QWP(0,"ebx"));
&pshufb (@X[0],@X[1]);
for ($j=0;$j<8;$j++) { # load next or same block
my @insns = (&BODY_00_15_ssse3(),&BODY_00_15_ssse3());
&movdqa (&QWP(16*(($j-1)%4),$frame),@X[3]) if ($j>4); # off-load
&movdqa (@X[3],&QWP(16*($j%8),$K512));
&movdqa (@X[2],@X[1]) if ($j<7); # perpetuate byte swap mask
&movdqu (@X[1],&QWP(16*($j+1),"ebx")) if ($j<7); # next input
&movdqa (@X[1],&QWP(16*(($j+1)%4),$frame)) if ($j==7);# restore @X[0]
&paddq (@X[3],@X[0]);
&pshufb (@X[1],@X[2]) if ($j<7);
foreach(@insns) { eval; }
&movdqa (&QWP(16*($j%8)-128,$frame),@X[3]);# xfer X[i]+K[i]
push(@X,shift(@X)); # rotate(@X)
}
#&movq ($A,$Asse2); # load A-H
&movq ("mm1",$Bsse2);
&paddq ($A,"mm3"); # from BODY_00_15
#&movq ($BxC,$Csse2);
&movq ("mm3",$Dsse2);
#&movq ($E,$Esse2);
#&movq ("mm5",$Fsse2);
#&movq ("mm6",$Gsse2);
&movq ("mm7",$Hsse2);
&pxor ($BxC,"mm1"); # de-magic
&paddq ($A,&QWP(0,"esi"));
&paddq ("mm1",&QWP(8,"esi"));
&paddq ($BxC,&QWP(16,"esi"));
&paddq ("mm3",&QWP(24,"esi"));
&paddq ($E,&QWP(32,"esi"));
&paddq ("mm5",&QWP(40,"esi"));
&paddq ("mm6",&QWP(48,"esi"));
&paddq ("mm7",&QWP(56,"esi"));
&movq (&QWP(0,"esi"),$A);
&movq (&QWP(8,"esi"),"mm1");
&movq (&QWP(16,"esi"),$BxC);
&movq (&QWP(24,"esi"),"mm3");
&movq (&QWP(32,"esi"),$E);
&movq (&QWP(40,"esi"),"mm5");
&movq (&QWP(48,"esi"),"mm6");
&movq (&QWP(56,"esi"),"mm7");
&cmp ("edi","eax") # are we done yet?
&jb (&label("loop_ssse3"));
&mov ("esp",&DWP(64+12,$frame)); # restore sp
&emms ();
}
&function_end_A();
}
&set_label("loop_x86",16);
# copy input block to stack reversing byte and qword order
for ($i=0;$i<8;$i++) {
&mov ("eax",&DWP($i*16+0,"edi"));
&mov ("ebx",&DWP($i*16+4,"edi"));
&mov ("ecx",&DWP($i*16+8,"edi"));
&mov ("edx",&DWP($i*16+12,"edi"));
&bswap ("eax");
&bswap ("ebx");
&bswap ("ecx");
&bswap ("edx");
&push ("eax");
&push ("ebx");
&push ("ecx");
&push ("edx");
}
&add ("edi",128);
&sub ("esp",9*8); # place for T,A,B,C,D,E,F,G,H
&mov (&DWP(8*(9+16)+4,"esp"),"edi");
# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
&lea ("edi",&DWP(8,"esp"));
&mov ("ecx",16);
&data_word(0xA5F3F689); # rep movsd
&set_label("00_15_x86",16);
&BODY_00_15_x86();
&cmp (&LB("edx"),0x94);
&jne (&label("00_15_x86"));
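# 0x94 is the least significant byte of K512[15] (low dword 0xcf692694): the
# round body leaves the LSB of the K value it has just consumed in %dl, so
# this compare ends the 00_15 loop after exactly 16 rounds without a separate
# counter. The 16_79 loop below terminates the same way on 0x17, the LSB of
# K512[79] (low dword 0x4a475817).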
&set_label("16_79_x86",16);
#define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
# LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
# HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
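# Note that, unlike Sigma0/Sigma1, sigma0() mixes in a plain shift ((x)>>7)
# rather than a rotation, so no bits wrap into the high half; that is why the
# HI line above ends at hi>>7 (and likewise hi>>6 for sigma1() further down).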
&mov ("ecx",&DWP(8*(9+15+16-1)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-1)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",1); # lo>>1
&mov ("edi","edx");
&shr ("edx",1); # hi>>1
&mov ("eax","ecx");
&shl ("esi",24); # lo<<24
&mov ("ebx","edx");
&shl ("edi",24); # hi<<24
&xor ("ebx","esi");
&shr ("ecx",7-1); # lo>>7
&xor ("eax","edi");
&shr ("edx",7-1); # hi>>7
&xor ("eax","ecx");
&shl ("esi",31-24); # lo<<31
&xor ("ebx","edx");
&shl ("edi",25-24); # hi<<25
&xor ("ebx","esi");
&shr ("ecx",8-7); # lo>>8
&xor ("eax","edi");
&shr ("edx",8-7); # hi>>8
&xor ("eax","ecx");
&shl ("edi",31-25); # hi<<31
&xor ("ebx","edx");
&xor ("eax","edi"); # T1 = sigma0(X[-15])
&mov (&DWP(0,"esp"),"eax");
&mov (&DWP(4,"esp"),"ebx"); # put T1 away
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
# LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
# HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
&mov ("ecx",&DWP(8*(9+15+16-14)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16-14)+4,"esp"));
&mov ("esi","ecx");
&shr ("ecx",6); # lo>>6
&mov ("edi","edx");
&shr ("edx",6); # hi>>6
&mov ("eax","ecx");
&shl ("esi",3); # lo<<3
&mov ("ebx","edx");
&shl ("edi",3); # hi<<3
&xor ("eax","esi");
&shr ("ecx",19-6); # lo>>19
&xor ("ebx","edi");
&shr ("edx",19-6); # hi>>19
&xor ("eax","ecx");
&shl ("esi",13-3); # lo<<13
&xor ("ebx","edx");
&shl ("edi",13-3); # hi<<13
&xor ("ebx","esi");
&shr ("ecx",29-19); # lo>>29
&xor ("eax","edi");
&shr ("edx",29-19); # hi>>29
&xor ("ebx","ecx");
&shl ("edi",26-13); # hi<<26
&xor ("eax","edx");
&xor ("eax","edi"); # sigma1(X[-2])
&mov ("ecx",&DWP(8*(9+15+16)+0,"esp"));
&mov ("edx",&DWP(8*(9+15+16)+4,"esp"));
&add ("eax",&DWP(0,"esp"));
&adc ("ebx",&DWP(4,"esp")); # T1 = sigma1(X[-2])+T1
&mov ("esi",&DWP(8*(9+15+16-9)+0,"esp"));
&mov ("edi",&DWP(8*(9+15+16-9)+4,"esp"));
&add ("eax","ecx");
&adc ("ebx","edx"); # T1 += X[-16]
&add ("eax","esi");
&adc ("ebx","edi"); # T1 += X[-7]
&mov (&DWP(8*(9+15)+0,"esp"),"eax");
&mov (&DWP(8*(9+15)+4,"esp"),"ebx"); # save X[0]
&BODY_00_15_x86();
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_x86"));
&mov ("esi",&DWP(8*(9+16+80)+0,"esp"));# ctx
&mov ("edi",&DWP(8*(9+16+80)+4,"esp"));# inp
for($i=0;$i<4;$i++) {
&mov ("eax",&DWP($i*16+0,"esi"));
&mov ("ebx",&DWP($i*16+4,"esi"));
&mov ("ecx",&DWP($i*16+8,"esi"));
&mov ("edx",&DWP($i*16+12,"esi"));
&add ("eax",&DWP(8+($i*16)+0,"esp"));
&adc ("ebx",&DWP(8+($i*16)+4,"esp"));
&mov (&DWP($i*16+0,"esi"),"eax");
&mov (&DWP($i*16+4,"esi"),"ebx");
&add ("ecx",&DWP(8+($i*16)+8,"esp"));
&adc ("edx",&DWP(8+($i*16)+12,"esp"));
&mov (&DWP($i*16+8,"esi"),"ecx");
&mov (&DWP($i*16+12,"esi"),"edx");
}
&add ("esp",8*(9+16+80)); # destroy frame
&sub ($K512,8*80); # rewind K
&cmp ("edi",&DWP(8,"esp")); # are we done yet?
&jb (&label("loop_x86"));
&mov ("esp",&DWP(12,"esp")); # restore sp
&function_end_A();
&set_label("K512",64); # Yes! I keep it in the code segment!
&data_word(0xd728ae22,0x428a2f98); # u64
&data_word(0x23ef65cd,0x71374491); # u64
&data_word(0xec4d3b2f,0xb5c0fbcf); # u64
&data_word(0x8189dbbc,0xe9b5dba5); # u64
&data_word(0xf348b538,0x3956c25b); # u64
&data_word(0xb605d019,0x59f111f1); # u64
&data_word(0xaf194f9b,0x923f82a4); # u64
&data_word(0xda6d8118,0xab1c5ed5); # u64
&data_word(0xa3030242,0xd807aa98); # u64
&data_word(0x45706fbe,0x12835b01); # u64
&data_word(0x4ee4b28c,0x243185be); # u64
&data_word(0xd5ffb4e2,0x550c7dc3); # u64
&data_word(0xf27b896f,0x72be5d74); # u64
&data_word(0x3b1696b1,0x80deb1fe); # u64
&data_word(0x25c71235,0x9bdc06a7); # u64
&data_word(0xcf692694,0xc19bf174); # u64
&data_word(0x9ef14ad2,0xe49b69c1); # u64
&data_word(0x384f25e3,0xefbe4786); # u64
&data_word(0x8b8cd5b5,0x0fc19dc6); # u64
&data_word(0x77ac9c65,0x240ca1cc); # u64
&data_word(0x592b0275,0x2de92c6f); # u64
&data_word(0x6ea6e483,0x4a7484aa); # u64
&data_word(0xbd41fbd4,0x5cb0a9dc); # u64
&data_word(0x831153b5,0x76f988da); # u64
&data_word(0xee66dfab,0x983e5152); # u64
&data_word(0x2db43210,0xa831c66d); # u64
&data_word(0x98fb213f,0xb00327c8); # u64
&data_word(0xbeef0ee4,0xbf597fc7); # u64
&data_word(0x3da88fc2,0xc6e00bf3); # u64
&data_word(0x930aa725,0xd5a79147); # u64
&data_word(0xe003826f,0x06ca6351); # u64
&data_word(0x0a0e6e70,0x14292967); # u64
&data_word(0x46d22ffc,0x27b70a85); # u64
&data_word(0x5c26c926,0x2e1b2138); # u64
&data_word(0x5ac42aed,0x4d2c6dfc); # u64
&data_word(0x9d95b3df,0x53380d13); # u64
&data_word(0x8baf63de,0x650a7354); # u64
&data_word(0x3c77b2a8,0x766a0abb); # u64
&data_word(0x47edaee6,0x81c2c92e); # u64
&data_word(0x1482353b,0x92722c85); # u64
&data_word(0x4cf10364,0xa2bfe8a1); # u64
&data_word(0xbc423001,0xa81a664b); # u64
&data_word(0xd0f89791,0xc24b8b70); # u64
&data_word(0x0654be30,0xc76c51a3); # u64
&data_word(0xd6ef5218,0xd192e819); # u64
&data_word(0x5565a910,0xd6990624); # u64
&data_word(0x5771202a,0xf40e3585); # u64
&data_word(0x32bbd1b8,0x106aa070); # u64
&data_word(0xb8d2d0c8,0x19a4c116); # u64
&data_word(0x5141ab53,0x1e376c08); # u64
&data_word(0xdf8eeb99,0x2748774c); # u64
&data_word(0xe19b48a8,0x34b0bcb5); # u64
&data_word(0xc5c95a63,0x391c0cb3); # u64
&data_word(0xe3418acb,0x4ed8aa4a); # u64
&data_word(0x7763e373,0x5b9cca4f); # u64
&data_word(0xd6b2b8a3,0x682e6ff3); # u64
&data_word(0x5defb2fc,0x748f82ee); # u64
&data_word(0x43172f60,0x78a5636f); # u64
&data_word(0xa1f0ab72,0x84c87814); # u64
&data_word(0x1a6439ec,0x8cc70208); # u64
&data_word(0x23631e28,0x90befffa); # u64
&data_word(0xde82bde9,0xa4506ceb); # u64
&data_word(0xb2c67915,0xbef9a3f7); # u64
&data_word(0xe372532b,0xc67178f2); # u64
&data_word(0xea26619c,0xca273ece); # u64
&data_word(0x21c0c207,0xd186b8c7); # u64
&data_word(0xcde0eb1e,0xeada7dd6); # u64
&data_word(0xee6ed178,0xf57d4f7f); # u64
&data_word(0x72176fba,0x06f067aa); # u64
&data_word(0xa2c898a6,0x0a637dc5); # u64
&data_word(0xbef90dae,0x113f9804); # u64
&data_word(0x131c471b,0x1b710b35); # u64
&data_word(0x23047d84,0x28db77f5); # u64
&data_word(0x40c72493,0x32caab7b); # u64
&data_word(0x15c9bebc,0x3c9ebe0a); # u64
&data_word(0x9c100d4c,0x431d67c4); # u64
&data_word(0xcb3e42b6,0x4cc5d4be); # u64
&data_word(0xfc657e2a,0x597f299c); # u64
&data_word(0x3ad6faec,0x5fcb6fab); # u64
&data_word(0x4a475817,0x6c44198c); # u64
&data_word(0x04050607,0x00010203); # byte swap
&data_word(0x0c0d0e0f,0x08090a0b); # mask
&function_end_B("sha512_block_data_order");
&asciz("SHA512 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;

@ -0,0 +1,668 @@
#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.
# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order in
# h[0-7], namely with the most significant dword at the *lower* address, which
# was reflected in the two parameters below as 0 and 4. Now the caller is
# expected to maintain native byte order for whole 64-bit values.
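# For example, on a little-endian build $lo/$hi resolve to byte offsets 0/4
# (see the LO/HI defines further down), so h[0].lo is read from [ctx,#0] and
# h[0].hi from [ctx,#4]; on big-endian builds the two offsets are swapped.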
$hi="HI";
$lo="LO";
# ====================================================================
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############ r13 is stack pointer
$Ktbl="r14";
############ r15 is program counter
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
eor $t1,$t1,$Elo,lsl#14
eor $t0,$t0,$Ehi,lsr#9
eor $t1,$t1,$Elo,lsr#9
eor $t0,$t0,$Elo,lsl#23
eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#$Foff+0] @ f.lo
adc $Thi,$Thi,$t1 @ T += Sigma1(e)
ldr $t1,[sp,#$Foff+4] @ f.hi
adds $Tlo,$Tlo,$t2
ldr $t2,[sp,#$Goff+0] @ g.lo
adc $Thi,$Thi,$t3 @ T += h
ldr $t3,[sp,#$Goff+4] @ g.hi
eor $t0,$t0,$t2
str $Elo,[sp,#$Eoff+0]
eor $t1,$t1,$t3
str $Ehi,[sp,#$Eoff+4]
and $t0,$t0,$Elo
str $Alo,[sp,#$Aoff+0]
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
#ifdef __thumb2__
it eq @ Thumb2 thing, sanity check in ARM
#endif
orreq $Ktbl,$Ktbl,#1
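@ The low byte of K[i].lo was captured into $t0 above; when it equals the
@ magic value (0x94 for K512[15], 0x17 for K512[79]) this is the last round
@ of the current loop, so bit 0 of $Ktbl is set as an end-of-loop flag and
@ tested with "tst $Ktbl,#1" after the round.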
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
mov $t0,$Alo,lsr#28
mov $t1,$Ahi,lsr#28
eor $t0,$t0,$Ahi,lsl#4
eor $t1,$t1,$Alo,lsl#4
eor $t0,$t0,$Ahi,lsr#2
eor $t1,$t1,$Alo,lsr#2
eor $t0,$t0,$Alo,lsl#30
eor $t1,$t1,$Ahi,lsl#30
eor $t0,$t0,$Ahi,lsr#7
eor $t1,$t1,$Alo,lsr#7
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
adds $Alo,$Alo,$Tlo
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
# define VFP_ABI_POP vldmia sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code 32
#endif
.type K512,%object
.align 5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
#endif
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha512_block_data_order
#else
adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV7_NEON
bne .LNEON
#endif
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
ldr $Ehi,[$ctx,#$Eoff+$hi]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
.Loop:
str $t0, [sp,#$Goff+0]
str $t1, [sp,#$Goff+4]
str $t2, [sp,#$Hoff+0]
str $t3, [sp,#$Hoff+4]
ldr $Alo,[$ctx,#$Aoff+$lo]
ldr $Ahi,[$ctx,#$Aoff+$hi]
ldr $Tlo,[$ctx,#$Boff+$lo]
ldr $Thi,[$ctx,#$Boff+$hi]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
str $Tlo,[sp,#$Boff+0]
str $Thi,[sp,#$Boff+4]
str $t0, [sp,#$Coff+0]
str $t1, [sp,#$Coff+4]
str $t2, [sp,#$Doff+0]
str $t3, [sp,#$Doff+4]
ldr $Tlo,[$ctx,#$Foff+$lo]
ldr $Thi,[$ctx,#$Foff+$hi]
str $Tlo,[sp,#$Foff+0]
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
ldrb $t2, [$inp,#4]
ldrb $Thi,[$inp,#3]
ldrb $t3, [$inp,#2]
orr $Tlo,$Tlo,$t0,lsl#8
ldrb $t0, [$inp,#1]
orr $Tlo,$Tlo,$t1,lsl#16
ldrb $t1, [$inp],#8
orr $Tlo,$Tlo,$t2,lsl#24
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
eor $Thi,$Thi,$t1,lsr#8
eor $Tlo,$Tlo,$t1,lsl#24
eor $Thi,$Thi,$t0,lsl#24
eor $Tlo,$Tlo,$t0,lsr#7
eor $Thi,$Thi,$t1,lsr#7
eor $Tlo,$Tlo,$t1,lsl#25
@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
mov $t0,$t2,lsr#19
mov $t1,$t3,lsr#19
eor $t0,$t0,$t3,lsl#13
eor $t1,$t1,$t2,lsl#13
eor $t0,$t0,$t3,lsr#29
eor $t1,$t1,$t2,lsr#29
eor $t0,$t0,$t2,lsl#3
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
___
&BODY_00_15(0x17);
$code.=<<___;
#ifdef __thumb2__
ittt eq @ Thumb2 thing, sanity check in ARM
#endif
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
ldr $Tlo,[sp,#$Boff+0]
ldr $Thi,[sp,#$Boff+4]
ldr $t0, [$ctx,#$Aoff+$lo]
ldr $t1, [$ctx,#$Aoff+$hi]
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
ldr $Ahi,[sp,#$Coff+4]
ldr $Tlo,[sp,#$Doff+0]
ldr $Thi,[sp,#$Doff+4]
ldr $t0, [$ctx,#$Coff+$lo]
ldr $t1, [$ctx,#$Coff+$hi]
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
ldr $Thi,[sp,#$Foff+4]
ldr $t0, [$ctx,#$Eoff+$lo]
ldr $t1, [$ctx,#$Eoff+$hi]
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
ldr $Ahi,[sp,#$Goff+4]
ldr $Tlo,[sp,#$Hoff+0]
ldr $Thi,[sp,#$Hoff+4]
ldr $t0, [$ctx,#$Goff+$lo]
ldr $t1, [$ctx,#$Goff+$hi]
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
sub $Ktbl,$Ktbl,#640
teq $inp,$len
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha512_block_data_order,.-sha512_block_data_order
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
#if $i>0
vadd.i64 $a,$Maj @ h+=Maj from the past
#endif
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vmov $Ch,$e
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
veor $t1,$t0
vbsl $Ch,$f,$g @ Ch(e,f,g)
vshr.u64 $t0,$a,#@Sigma0[0]
veor $t2,$t1 @ Sigma1(e)
vadd.i64 $T1,$Ch,$h
vshr.u64 $t1,$a,#@Sigma0[1]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vadd.i64 $T1,$t2
vshr.u64 $t2,$a,#@Sigma0[2]
vadd.i64 $K,@X[$i%16]
vsli.64 $t1,$a,#`64-@Sigma0[1]`
veor $Maj,$a,$b
vsli.64 $t2,$a,#`64-@Sigma0[2]`
veor $h,$t0,$t1
vadd.i64 $T1,$K
vbsl $Maj,$c,$b @ Maj(a,b,c)
veor $h,$t2 @ Sigma0(a)
vadd.i64 $d,$T1
vadd.i64 $Maj,$T1
@ vadd.i64 $h,$Maj
___
}
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
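# Even-numbered rounds compute two schedule words at once on the 128-bit q
# registers (each q register of @X holds a pair of X[] dwords) and also start
# the first Sigma1 shifts of the underlying NEON_00_15() round early in
# d24-d26 to hide latency; odd-numbered rounds simply fall through to
# NEON_00_15().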
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vadd.i64 @_[0],d30 @ h+=Maj from the past
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.global sha512_block_data_order_neon
.type sha512_block_data_order_neon,%function
.align 4
sha512_block_data_order_neon:
.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
adr $Ktbl,K512
VFP_ABI_PUSH
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vadd.i64 $A,d30 @ h+=Maj from the past
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
VFP_ABI_POP
ret @ bx lr
.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm OPENSSL_armcap_P,4,4
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/@/ and !/^$/);
print;
}
close SELF;
print $code;
close STDOUT; # enforce flush

View file

@ -0,0 +1,905 @@
#! /usr/bin/env perl
# Copyright 2014-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
# Kryo 1.92 17.4 (+30%) 11.2 (+8%)
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
# 10% (or by 1 cycle per round), but at the cost of 20% loss
# on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are an
#	indication of some compiler "pathology"; most notably, code
#	generated with -mgeneral-regs-only is significantly faster
#	and the gap is only 40-90%.
#
# October 2016.
#
# Originally it was reckoned that it made no sense to implement a NEON
# version of SHA256 for 64-bit processors, because the performance
# improvement on the most widespread Cortex-A5x processors was observed
# to be marginal: the same on Cortex-A53 and ~10% on A57. But then it
# was observed that 32-bit NEON SHA256 performs significantly better
# than the 64-bit scalar version on *some* of the more recent
# processors. As a result, a 64-bit NEON version of SHA256 was added to
# provide the best all-round performance. For example, it executes ~30%
# faster on X-Gene and Mongoose. [For reference, a NEON version of
# SHA512 is bound to deliver much less improvement, likely *negative*
# on Cortex-A5x, which is why NEON support is limited to SHA256.]
$output=pop;
$flavour=pop;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
} else {
open STDOUT,">$output";
}
if ($output =~ /512/) {
$BITS=512;
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$reg_t="x";
} else {
$BITS=256;
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$reg_t="w";
}
$func="sha${BITS}_block_data_order";
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
$T0=@X[$i+3] if ($i<11);
$code.=<<___ if ($i<16);
#ifndef __AARCH64EB__
rev @X[$i],@X[$i] // $i
#endif
___
$code.=<<___ if ($i<13 && ($i&1));
ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___ if ($i==13);
ldp @X[14],@X[15],[$inp]
___
$code.=<<___ if ($i>=14);
ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___ if ($i>0 && $i<16);
add $a,$a,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=11);
str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies a merged rotate-and-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such a merged instruction is
# not necessarily the best choice on the critical path... On the other
# hand Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See the (**) footnote above.
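# As a rough sketch only (not part of the emitted code; register names
# are placeholders), the two alternatives weighed above look like this:
#
#	eor	x0,x1,x2,ror#18		// merged rotate-and-eor: one
#					// instruction, but 'x1' is needed early
#	ror	x3,x2,#18		// disjoint form: rotate first,
#	eor	x0,x1,x3		// combine later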
$code.=<<___ if ($i<15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
and $t1,$f,$e
bic $t2,$g,$e
add $h,$h,@X[$i&15] // h+=X[i]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e)
ror $T0,$a,#$Sigma0[0]
add $h,$h,$t1 // h+=Ch(e,f,g)
eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
add $h,$h,$t0 // h+=Sigma1(e)
and $t3,$t3,$t2 // (b^c)&=(a^b)
add $d,$d,$h // d+=h
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a)
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
//add $h,$h,$t1 // h+=Sigma0(a)
___
$code.=<<___ if ($i>=15);
ror $t0,$e,#$Sigma1[0]
add $h,$h,$t2 // h+=K[i]
ror $T1,@X[($j+1)&15],#$sigma0[0]
and $t1,$f,$e
ror $T2,@X[($j+14)&15],#$sigma1[0]
bic $t2,$g,$e
ror $T0,$a,#$Sigma0[0]
add $h,$h,@X[$i&15] // h+=X[i]
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
orr $t1,$t1,$t2 // Ch(e,f,g)
eor $t2,$a,$b // a^b, b^c in next round
eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e)
eor $T0,$T0,$a,ror#$Sigma0[1]
add $h,$h,$t1 // h+=Ch(e,f,g)
and $t3,$t3,$t2 // (b^c)&=(a^b)
eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1])
add $h,$h,$t0 // h+=Sigma1(e)
eor $t3,$t3,$b // Maj(a,b,c)
eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a)
eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14])
add @X[$j],@X[$j],@X[($j+9)&15]
add $d,$d,$h // d+=h
add $h,$h,$t3 // h+=Maj(a,b,c)
ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round
add @X[$j],@X[$j],$T1
add $h,$h,$t1 // h+=Sigma0(a)
add @X[$j],@X[$j],$T2
___
($t2,$t3)=($t3,$t2);
}
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#endif
.text
.extern OPENSSL_armcap_P
.globl $func
.type $func,%function
.align 6
$func:
#ifndef __KERNEL__
# ifdef __ILP32__
ldrsw x16,.LOPENSSL_armcap_P
# else
ldr x16,.LOPENSSL_armcap_P
# endif
adr x17,.LOPENSSL_armcap_P
add x16,x16,x17
ldr w16,[x16]
___
$code.=<<___ if ($SZ==4);
tst w16,#ARMV8_SHA256
b.ne .Lv8_entry
tst w16,#ARMV7_NEON
b.ne .Lneon_entry
___
$code.=<<___ if ($SZ==8);
tst w16,#ARMV8_SHA512
b.ne .Lv8_entry
___
$code.=<<___;
#endif
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#4*$SZ
ldp $A,$B,[$ctx] // load context
ldp $C,$D,[$ctx,#2*$SZ]
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
adr $Ktbl,.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
ldp @X[0],@X[1],[$inp],#2*$SZ
ldr $t2,[$Ktbl],#$SZ // *K++
eor $t3,$B,$C // magic seed
str $inp,[x29,#112]
___
for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
cbnz $t2,.Loop_16_xx
ldp $ctx,$num,[x29,#96]
ldr $inp,[x29,#112]
sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind
ldp @X[0],@X[1],[$ctx]
ldp @X[2],@X[3],[$ctx,#2*$SZ]
add $inp,$inp,#14*$SZ // advance input pointer
ldp @X[4],@X[5],[$ctx,#4*$SZ]
add $A,$A,@X[0]
ldp @X[6],@X[7],[$ctx,#6*$SZ]
add $B,$B,@X[1]
add $C,$C,@X[2]
add $D,$D,@X[3]
stp $A,$B,[$ctx]
add $E,$E,@X[4]
add $F,$F,@X[5]
stp $C,$D,[$ctx,#2*$SZ]
add $G,$G,@X[6]
add $H,$H,@X[7]
cmp $inp,$num
stp $E,$F,[$ctx,#4*$SZ]
stp $G,$H,[$ctx,#6*$SZ]
b.ne .Loop
ldp x19,x20,[x29,#16]
add sp,sp,#4*$SZ
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#128
.inst 0xd50323bf // autiasp
ret
.size $func,.-$func
.align 6
.type .LK$BITS,%object
.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0 //terminator
___
$code.=<<___;
.size .LK$BITS,.-.LK$BITS
#ifndef __KERNEL__
.align 3
.LOPENSSL_armcap_P:
# ifdef __ILP32__
.long OPENSSL_armcap_P-.
# else
.quad OPENSSL_armcap_P-.
# endif
#endif
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
if ($SZ==4) {
my $Ktbl="x3";
my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
$code.=<<___;
#ifndef __KERNEL__
.type sha256_block_armv8,%function
.align 6
sha256_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
adr $Ktbl,.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
sub $num,$num,#1
ld1.32 {$W0},[$Ktbl],#16
rev32 @MSG[0],@MSG[0]
rev32 @MSG[1],@MSG[1]
rev32 @MSG[2],@MSG[2]
rev32 @MSG[3],@MSG[3]
orr $ABCD_SAVE,$ABCD,$ABCD // offload
orr $EFGH_SAVE,$EFGH,$EFGH
___
for($i=0;$i<12;$i++) {
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
sha256su0 @MSG[0],@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
sha256su1 @MSG[0],@MSG[2],@MSG[3]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
}
$code.=<<___;
ld1.32 {$W1},[$Ktbl],#16
add.i32 $W0,$W0,@MSG[0]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
ld1.32 {$W0},[$Ktbl],#16
add.i32 $W1,$W1,@MSG[1]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
ld1.32 {$W1},[$Ktbl]
add.i32 $W0,$W0,@MSG[2]
sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W0
sha256h2 $EFGH,$abcd,$W0
add.i32 $W1,$W1,@MSG[3]
orr $abcd,$ABCD,$ABCD
sha256h $ABCD,$EFGH,$W1
sha256h2 $EFGH,$abcd,$W1
add.i32 $ABCD,$ABCD,$ABCD_SAVE
add.i32 $EFGH,$EFGH,$EFGH_SAVE
cbnz $num,.Loop_hw
st1.32 {$ABCD,$EFGH},[$ctx]
ldr x29,[sp],#16
ret
.size sha256_block_armv8,.-sha256_block_armv8
#endif
___
}
if ($SZ==4) { ######################################### NEON stuff #
# You'll surely note a lot of similarities with the sha256-armv4 module,
# and of course it's not a coincidence. sha256-armv4 was used as the
# initial template, but was adapted for the ARMv8 instruction set and
# extensively re-tuned for all-round performance.
my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
my $Ktbl="x16";
my $Xfer="x17";
my @X = map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
my $j=0;
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
my $arg = pop;
$arg = "#$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
sub Xupdate()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
&ext_8 ($T0,@X[0],@X[1],4); # X[1..4]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ext_8 ($T3,@X[2],@X[3],4); # X[9..12]
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15]
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T2,$T0,$sigma0[0]);
eval(shift(@insns));
&ushr_32 ($T1,$T0,$sigma0[2]);
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12]
eval(shift(@insns));
&sli_32 ($T2,$T0,32-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T0,$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T2);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T3,$T0,32-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T4,$T7,$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T1,$T1,$T3); # sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T4,$T7,32-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T5,$T7,$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T3,$T7,$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
eval(shift(@insns));
eval(shift(@insns));
&sli_u32 ($T3,$T7,32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T4);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T3); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&ushr_32 ($T6,@X[0],$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T7,@X[0],$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T6,@X[0],32-$sigma1[0]);
eval(shift(@insns));
&ushr_32 ($T5,@X[0],$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T7,$T7,$T6);
eval(shift(@insns));
eval(shift(@insns));
&sli_32 ($T5,@X[0],32-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl], #16");
eval(shift(@insns));
&eor_8 ($T7,$T7,$T5); # sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
&eor_8 ($T5,$T5,$T5);
eval(shift(@insns));
eval(shift(@insns));
&mov (&Dhi($T5), &Dlo($T7));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
while($#insns>=1) { eval(shift(@insns)); }
&st1_32 ("{$T0}","[$Xfer], #16");
eval(shift(@insns));
push(@X,shift(@X)); # "rotate" X[]
}
sub Xpreload()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body);
my ($a,$b,$c,$d,$e,$f,$g,$h);
eval(shift(@insns));
eval(shift(@insns));
&ld1_8 ("{@X[0]}","[$inp],#16");
eval(shift(@insns));
eval(shift(@insns));
&ld1_32 ("{$T0}","[$Ktbl],#16");
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&rev32 (@X[0],@X[0]);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&add_32 ($T0,$T0,@X[0]);
foreach (@insns) { eval; } # remaining instructions
&st1_32 ("{$T0}","[$Xfer], #16");
push(@X,shift(@X)); # "rotate" X[]
}
sub body_00_15 () {
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
'&add ($h,$h,$t1)', # h+=X[i]+K[i]
'&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past
'&and ($t1,$f,$e)',
'&bic ($t4,$g,$e)',
'&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
'&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
'&orr ($t1,$t1,$t4)', # Ch(e,f,g)
'&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
'&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
'&add ($h,$h,$t1)', # h+=Ch(e,f,g)
'&ror ($t0,$t0,"#$Sigma1[0]")',
'&eor ($t2,$a,$b)', # a^b, b^c in next round
'&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
'&add ($h,$h,$t0)', # h+=Sigma1(e)
'&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
'&ldr ($t1,"[$Ktbl]") if ($j==15);'.
'&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
'&ror ($t4,$t4,"#$Sigma0[0]")',
'&add ($d,$d,$h)', # d+=h
'&eor ($t3,$t3,$b)', # Maj(a,b,c)
'$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
)
}
$code.=<<___;
#ifdef __KERNEL__
.globl sha256_block_neon
#endif
.type sha256_block_neon,%function
.align 4
sha256_block_neon:
.Lneon_entry:
stp x29, x30, [sp, #-16]!
mov x29, sp
sub sp,sp,#16*4
adr $Ktbl,.LK256
add $num,$inp,$num,lsl#6 // len to point at the end of inp
ld1.8 {@X[0]},[$inp], #16
ld1.8 {@X[1]},[$inp], #16
ld1.8 {@X[2]},[$inp], #16
ld1.8 {@X[3]},[$inp], #16
ld1.32 {$T0},[$Ktbl], #16
ld1.32 {$T1},[$Ktbl], #16
ld1.32 {$T2},[$Ktbl], #16
ld1.32 {$T3},[$Ktbl], #16
rev32 @X[0],@X[0] // yes, even on
rev32 @X[1],@X[1] // big-endian
rev32 @X[2],@X[2]
rev32 @X[3],@X[3]
mov $Xfer,sp
add.32 $T0,$T0,@X[0]
add.32 $T1,$T1,@X[1]
add.32 $T2,$T2,@X[2]
st1.32 {$T0-$T1},[$Xfer], #32
add.32 $T3,$T3,@X[3]
st1.32 {$T2-$T3},[$Xfer]
sub $Xfer,$Xfer,#32
ldp $A,$B,[$ctx]
ldp $C,$D,[$ctx,#8]
ldp $E,$F,[$ctx,#16]
ldp $G,$H,[$ctx,#24]
ldr $t1,[sp,#0]
mov $t2,wzr
eor $t3,$B,$C
mov $t4,wzr
b .L_00_48
.align 4
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
cmp $t1,#0 // check for K256 terminator
ldr $t1,[sp,#0]
sub $Xfer,$Xfer,#64
bne .L_00_48
sub $Ktbl,$Ktbl,#256 // rewind $Ktbl
cmp $inp,$num
mov $Xfer, #64
csel $Xfer, $Xfer, xzr, eq
sub $inp,$inp,$Xfer // avoid SEGV
mov $Xfer,sp
___
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
add $A,$A,$t4 // h+=Sigma0(a) from the past
ldp $t0,$t1,[$ctx,#0]
add $A,$A,$t2 // h+=Maj(a,b,c) from the past
ldp $t2,$t3,[$ctx,#8]
add $A,$A,$t0 // accumulate
add $B,$B,$t1
ldp $t0,$t1,[$ctx,#16]
add $C,$C,$t2
add $D,$D,$t3
ldp $t2,$t3,[$ctx,#24]
add $E,$E,$t0
add $F,$F,$t1
ldr $t1,[sp,#0]
stp $A,$B,[$ctx,#0]
add $G,$G,$t2
mov $t2,wzr
stp $C,$D,[$ctx,#8]
add $H,$H,$t3
stp $E,$F,[$ctx,#16]
eor $t3,$B,$C
stp $G,$H,[$ctx,#24]
mov $t4,wzr
mov $Xfer,sp
b.ne .L_00_48
ldr x29,[x29]
add sp,sp,#16*4+16
ret
.size sha256_block_neon,.-sha256_block_neon
___
}
if ($SZ==8) {
my $Ktbl="x3";
my @H = map("v$_.16b",(0..4));
my ($fg,$de,$m9_10)=map("v$_.16b",(5..7));
my @MSG=map("v$_.16b",(16..23));
my ($W0,$W1)=("v24.2d","v25.2d");
my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29));
$code.=<<___;
#ifndef __KERNEL__
.type sha512_block_armv8,%function
.align 6
sha512_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input
ld1 {@MSG[4]-@MSG[7]},[$inp],#64
ld1.64 {@H[0]-@H[3]},[$ctx] // load context
adr $Ktbl,.LK512
rev64 @MSG[0],@MSG[0]
rev64 @MSG[1],@MSG[1]
rev64 @MSG[2],@MSG[2]
rev64 @MSG[3],@MSG[3]
rev64 @MSG[4],@MSG[4]
rev64 @MSG[5],@MSG[5]
rev64 @MSG[6],@MSG[6]
rev64 @MSG[7],@MSG[7]
b .Loop_hw
.align 4
.Loop_hw:
ld1.64 {$W0},[$Ktbl],#16
subs $num,$num,#1
sub x4,$inp,#128
orr $AB,@H[0],@H[0] // offload
orr $CD,@H[1],@H[1]
orr $EF,@H[2],@H[2]
orr $GH,@H[3],@H[3]
csel $inp,$inp,x4,ne // conditional rewind
___
for($i=0;$i<32;$i++) {
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1.64 {$W1},[$Ktbl],#16
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512su0 @MSG[0],@MSG[1]
ext $m9_10,@MSG[4],@MSG[5],#8
sha512h @H[3],$fg,$de
sha512su1 @MSG[0],@MSG[7],$m9_10
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
for(;$i<40;$i++) {
$code.=<<___ if ($i<39);
ld1.64 {$W1},[$Ktbl],#16
___
$code.=<<___ if ($i==39);
sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind
___
$code.=<<___;
add.i64 $W0,$W0,@MSG[0]
ld1 {@MSG[0]},[$inp],#16 // load next input
ext $W0,$W0,$W0,#8
ext $fg,@H[2],@H[3],#8
ext $de,@H[1],@H[2],#8
add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]"
sha512h @H[3],$fg,$de
rev64 @MSG[0],@MSG[0]
add.i64 @H[4],@H[1],@H[3] // "D + T1"
sha512h2 @H[3],$H[1],@H[0]
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
}
$code.=<<___;
add.i64 @H[0],@H[0],$AB // accumulate
add.i64 @H[1],@H[1],$CD
add.i64 @H[2],@H[2],$EF
add.i64 @H[3],@H[3],$GH
cbnz $num,.Loop_hw
st1.64 {@H[0]-@H[3]},[$ctx] // store context
ldr x29,[sp],#16
ret
.size sha512_block_armv8,.-sha512_block_armv8
#endif
___
}
$code.=<<___;
#ifndef __KERNEL__
.comm OPENSSL_armcap_P,4,4
#endif
___
{ my %opcode = (
"sha256h" => 0x5e004000, "sha256h2" => 0x5e005000,
"sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 );
sub unsha256 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
{ my %opcode = (
"sha512h" => 0xce608000, "sha512h2" => 0xce608400,
"sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 );
sub unsha512 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or
s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers
s/\.[ui]?8(\s)/$1/;
s/\.\w?64\b// and s/\.16b/\.2d/g or
s/\.\w?32\b// and s/\.16b/\.4s/g;
m/\bext\b/ and s/\.2d/\.16b/g or
m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g;
print $_,"\n";
}
close STDOUT;

View file

@ -0,0 +1,438 @@
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA512 for C64x+.
#
# January 2012
#
# Performance is 19 cycles per processed byte. Compared to block
# transform function from sha512.c compiled with cl6x with -mv6400+
# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
# Loop unrolling won't make this implementation any faster, because
# it's effectively dominated by SHRU||SHL pairs and you can't schedule
# more of them.
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
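#
# As an illustrative sketch only (operand names are placeholders; this
# is not emitted by the script), a 64-bit rotate right by n<32 of a
# value kept in a hi:lo register pair is composed from the SHRU||SHL
# pairs mentioned above, with the shifted halves recombined by XOR:
#
#	SHRU	hi,n,t_hi	|| SHL	hi,32-n,t_lo
#	SHRU	lo,n,u_lo	|| SHL	lo,32-n,u_hi
#	XOR	u_hi,t_hi,res_hi
#	XOR	u_lo,t_lo,res_lo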
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
($CTXA,$INP,$NUM) = ("A4","B4","A6"); # arguments
$K512="A3";
($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
$Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
$Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
($T1hi, $T2hi)= ("A6","A7");
($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
($Khi,$Klo)=("A9","A8");
($MAJhi,$MAJlo)=($T2hi,$T2lo);
($t1hi,$t1lo)=($Khi,"B2");
$CTXB=$t1lo;
($Xihi,$Xilo)=("A5","B5"); # circular/ring buffer
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.nocmp
.asg sha512_block_data_order,_sha512_block_data_order
.endif
.asg B3,RA
.asg A15,FP
.asg B15,SP
.if .BIG_ENDIAN
.asg $Khi,KHI
.asg $Klo,KLO
.else
.asg $Khi,KLO
.asg $Klo,KHI
.endif
.global _sha512_block_data_order
_sha512_block_data_order:
__sha512_block:
.asmfunc stack_usage(40+128)
MV $NUM,A0 ; reassign $NUM
|| MVK -128,B0
[!A0] BNOP RA ; if ($NUM==0) return;
|| [A0] STW FP,*SP--(40) ; save frame pointer
|| [A0] MV SP,FP
[A0] STDW B13:B12,*SP[4]
|| [A0] MVK 0x00404,B1
[A0] STDW B11:B10,*SP[3]
|| [A0] STDW A13:A12,*FP[-3]
|| [A0] MVKH 0x60000,B1
[A0] STDW A11:A10,*SP[1]
|| [A0] MVC B1,AMR ; setup circular addressing
|| [A0] ADD B0,SP,SP ; alloca(128)
.if __TI_EABI__
[A0] AND B0,SP,SP ; align stack at 128 bytes
|| [A0] ADDKPC __sha512_block,B1
|| [A0] MVKL \$PCR_OFFSET(K512,__sha512_block),$K512
[A0] MVKH \$PCR_OFFSET(K512,__sha512_block),$K512
|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
.else
[A0] AND B0,SP,SP ; align stack at 128 bytes
|| [A0] ADDKPC __sha512_block,B1
|| [A0] MVKL (K512-__sha512_block),$K512
[A0] MVKH (K512-__sha512_block),$K512
|| [A0] SUBAW SP,2,SP ; reserve two words above buffer
.endif
ADDAW SP,3,$Xilo
ADDAW SP,2,$Xihi
|| MV $CTXA,$CTXB
LDW *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
|| LDW *${CTXB}[1^.LITTLE_ENDIAN],$Alo
|| ADD B1,$K512,$K512
LDW *${CTXA}[2^.LITTLE_ENDIAN],$Bhi
|| LDW *${CTXB}[3^.LITTLE_ENDIAN],$Blo
LDW *${CTXA}[4^.LITTLE_ENDIAN],$Chi
|| LDW *${CTXB}[5^.LITTLE_ENDIAN],$Clo
LDW *${CTXA}[6^.LITTLE_ENDIAN],$Dhi
|| LDW *${CTXB}[7^.LITTLE_ENDIAN],$Dlo
LDW *${CTXA}[8^.LITTLE_ENDIAN],$Ehi
|| LDW *${CTXB}[9^.LITTLE_ENDIAN],$Elo
LDW *${CTXA}[10^.LITTLE_ENDIAN],$Fhi
|| LDW *${CTXB}[11^.LITTLE_ENDIAN],$Flo
LDW *${CTXA}[12^.LITTLE_ENDIAN],$Ghi
|| LDW *${CTXB}[13^.LITTLE_ENDIAN],$Glo
LDW *${CTXA}[14^.LITTLE_ENDIAN],$Hhi
|| LDW *${CTXB}[15^.LITTLE_ENDIAN],$Hlo
LDNDW *$INP++,B11:B10 ; pre-fetch input
LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
outerloop?:
MVK 15,B0 ; loop counters
|| MVK 64,B1
|| SUB A0,1,A0
MV $Ahi,$Actxhi
|| MV $Alo,$Actxlo
|| MV $Bhi,$Bctxhi
|| MV $Blo,$Bctxlo
|| MV $Chi,$Cctxhi
|| MV $Clo,$Cctxlo
|| MVD $Dhi,$Dctxhi
|| MVD $Dlo,$Dctxlo
MV $Ehi,$Ectxhi
|| MV $Elo,$Ectxlo
|| MV $Fhi,$Fctxhi
|| MV $Flo,$Fctxlo
|| MV $Ghi,$Gctxhi
|| MV $Glo,$Gctxlo
|| MVD $Hhi,$Hctxhi
|| MVD $Hlo,$Hctxlo
loop0_15?:
.if .BIG_ENDIAN
MV B11,$T1hi
|| MV B10,$T1lo
.else
SWAP4 B10,$T1hi
|| SWAP4 B11,$T1lo
SWAP2 $T1hi,$T1hi
|| SWAP2 $T1lo,$T1lo
.endif
loop16_79?:
STW $T1hi,*$Xihi++[2]
|| STW $T1lo,*$Xilo++[2] ; X[i] = T1
|| ADD $Hhi,$T1hi,$T1hi
|| ADDU $Hlo,$T1lo,$T1carry:$T1lo ; T1 += h
|| SHRU $Ehi,14,$S1hi
|| SHL $Ehi,32-14,$S1lo
XOR $Fhi,$Ghi,$CHhi
|| XOR $Flo,$Glo,$CHlo
|| ADD KHI,$T1hi,$T1hi
|| ADDU KLO,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += K512[i]
|| SHRU $Elo,14,$t0lo
|| SHL $Elo,32-14,$t0hi
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| AND $Ehi,$CHhi,$CHhi
|| AND $Elo,$CHlo,$CHlo
|| ROTL $Ghi,0,$Hhi
|| ROTL $Glo,0,$Hlo ; h = g
|| SHRU $Ehi,18,$t0hi
|| SHL $Ehi,32-18,$t0lo
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| XOR $Ghi,$CHhi,$CHhi
|| XOR $Glo,$CHlo,$CHlo ; Ch(e,f,g) = ((f^g)&e)^g
|| ROTL $Fhi,0,$Ghi
|| ROTL $Flo,0,$Glo ; g = f
|| SHRU $Elo,18,$t0lo
|| SHL $Elo,32-18,$t0hi
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| OR $Ahi,$Bhi,$MAJhi
|| OR $Alo,$Blo,$MAJlo
|| ROTL $Ehi,0,$Fhi
|| ROTL $Elo,0,$Flo ; f = e
|| SHRU $Ehi,41-32,$t0lo
|| SHL $Ehi,64-41,$t0hi
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| AND $Chi,$MAJhi,$MAJhi
|| AND $Clo,$MAJlo,$MAJlo
|| ROTL $Dhi,0,$Ehi
|| ROTL $Dlo,0,$Elo ; e = d
|| SHRU $Elo,41-32,$t0hi
|| SHL $Elo,64-41,$t0lo
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo ; Sigma1(e)
|| AND $Ahi,$Bhi,$t1hi
|| AND $Alo,$Blo,$t1lo
|| ROTL $Chi,0,$Dhi
|| ROTL $Clo,0,$Dlo ; d = c
|| SHRU $Ahi,28,$S0hi
|| SHL $Ahi,32-28,$S0lo
OR $t1hi,$MAJhi,$MAJhi
|| OR $t1lo,$MAJlo,$MAJlo ; Maj(a,b,c) = ((a|b)&c)|(a&b)
|| ADD $CHhi,$T1hi,$T1hi
|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Ch(e,f,g)
|| ROTL $Bhi,0,$Chi
|| ROTL $Blo,0,$Clo ; c = b
|| SHRU $Alo,28,$t0lo
|| SHL $Alo,32-28,$t0hi
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| ADD $S1hi,$T1hi,$T1hi
|| ADDU $S1lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += Sigma1(e)
|| ROTL $Ahi,0,$Bhi
|| ROTL $Alo,0,$Blo ; b = a
|| SHRU $Ahi,34-32,$t0lo
|| SHL $Ahi,64-34,$t0hi
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| ADD $MAJhi,$T1hi,$T2hi
|| ADDU $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo ; T2 = T1+Maj(a,b,c)
|| SHRU $Alo,34-32,$t0hi
|| SHL $Alo,64-34,$t0lo
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| ADD $Ehi,$T1hi,$T1hi
|| ADDU $Elo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += e
|| [B0] BNOP loop0_15?
|| SHRU $Ahi,39-32,$t0lo
|| SHL $Ahi,64-39,$t0hi
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| [B0] LDNDW *$INP++,B11:B10 ; pre-fetch input
||[!B1] BNOP break?
|| SHRU $Alo,39-32,$t0hi
|| SHL $Alo,64-39,$t0lo
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo ; Sigma0(a)
|| ADD $T1carry,$T1hi,$Ehi
|| MV $T1lo,$Elo ; e = T1
||[!B0] LDW *${Xihi}[28],$T1hi
||[!B0] LDW *${Xilo}[28],$T1lo ; X[i+14]
ADD $S0hi,$T2hi,$T2hi
|| ADDU $S0lo,$T2carry:$T2lo,$T2carry:$T2lo ; T2 += Sigma0(a)
|| [B1] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[i]
NOP ; avoid cross-path stall
ADD $T2carry,$T2hi,$Ahi
|| MV $T2lo,$Alo ; a = T2
|| [B0] SUB B0,1,B0
;;===== branch to loop0_15? is taken here
NOP
;;===== branch to break? is taken here
LDW *${Xihi}[2],$T2hi
|| LDW *${Xilo}[2],$T2lo ; X[i+1]
|| SHRU $T1hi,19,$S1hi
|| SHL $T1hi,32-19,$S1lo
SHRU $T1lo,19,$t0lo
|| SHL $T1lo,32-19,$t0hi
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| SHRU $T1hi,61-32,$t0lo
|| SHL $T1hi,64-61,$t0hi
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| SHRU $T1lo,61-32,$t0hi
|| SHL $T1lo,64-61,$t0lo
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| SHRU $T1hi,6,$t0hi
|| SHL $T1hi,32-6,$t0lo
XOR $t0hi,$S1hi,$S1hi
|| XOR $t0lo,$S1lo,$S1lo
|| SHRU $T1lo,6,$t0lo
|| LDW *${Xihi}[18],$T1hi
|| LDW *${Xilo}[18],$T1lo ; X[i+9]
XOR $t0lo,$S1lo,$S1lo ; sigma1(Xi[i+14])
|| LDW *${Xihi}[0],$CHhi
|| LDW *${Xilo}[0],$CHlo ; X[i]
|| SHRU $T2hi,1,$S0hi
|| SHL $T2hi,32-1,$S0lo
SHRU $T2lo,1,$t0lo
|| SHL $T2lo,32-1,$t0hi
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| SHRU $T2hi,8,$t0hi
|| SHL $T2hi,32-8,$t0lo
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| SHRU $T2lo,8,$t0lo
|| SHL $T2lo,32-8,$t0hi
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| ADD $S1hi,$T1hi,$T1hi
|| ADDU $S1lo,$T1lo,$T1carry:$T1lo ; T1 = X[i+9]+sigma1()
|| [B1] BNOP loop16_79?
|| SHRU $T2hi,7,$t0hi
|| SHL $T2hi,32-7,$t0lo
XOR $t0hi,$S0hi,$S0hi
|| XOR $t0lo,$S0lo,$S0lo
|| ADD $CHhi,$T1hi,$T1hi
|| ADDU $CHlo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += X[i]
|| SHRU $T2lo,7,$t0lo
XOR $t0lo,$S0lo,$S0lo ; sigma0(Xi[i+1])
ADD $S0hi,$T1hi,$T1hi
|| ADDU $S0lo,$T1carry:$T1lo,$T1carry:$T1lo ; T1 += sigma0()
|| [B1] SUB B1,1,B1
NOP ; avoid cross-path stall
ADD $T1carry,$T1hi,$T1hi
;;===== branch to loop16_79? is taken here
break?:
ADD $Ahi,$Actxhi,$Ahi ; accumulate ctx
|| ADDU $Alo,$Actxlo,$Actxlo:$Alo
|| [A0] LDNDW *$INP++,B11:B10 ; pre-fetch input
|| [A0] ADDK -640,$K512 ; rewind pointer to K512
ADD $Bhi,$Bctxhi,$Bhi
|| ADDU $Blo,$Bctxlo,$Bctxlo:$Blo
|| [A0] LDDW *$K512++,$Khi:$Klo ; pre-fetch K512[0]
ADD $Chi,$Cctxhi,$Chi
|| ADDU $Clo,$Cctxlo,$Cctxlo:$Clo
|| ADD $Actxlo,$Ahi,$Ahi
||[!A0] MV $CTXA,$CTXB
ADD $Dhi,$Dctxhi,$Dhi
|| ADDU $Dlo,$Dctxlo,$Dctxlo:$Dlo
|| ADD $Bctxlo,$Bhi,$Bhi
||[!A0] STW $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx
||[!A0] STW $Alo,*${CTXB}[1^.LITTLE_ENDIAN]
ADD $Ehi,$Ectxhi,$Ehi
|| ADDU $Elo,$Ectxlo,$Ectxlo:$Elo
|| ADD $Cctxlo,$Chi,$Chi
|| [A0] BNOP outerloop?
||[!A0] STW $Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
||[!A0] STW $Blo,*${CTXB}[3^.LITTLE_ENDIAN]
ADD $Fhi,$Fctxhi,$Fhi
|| ADDU $Flo,$Fctxlo,$Fctxlo:$Flo
|| ADD $Dctxlo,$Dhi,$Dhi
||[!A0] STW $Chi,*${CTXA}[4^.LITTLE_ENDIAN]
||[!A0] STW $Clo,*${CTXB}[5^.LITTLE_ENDIAN]
ADD $Ghi,$Gctxhi,$Ghi
|| ADDU $Glo,$Gctxlo,$Gctxlo:$Glo
|| ADD $Ectxlo,$Ehi,$Ehi
||[!A0] STW $Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
||[!A0] STW $Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
ADD $Hhi,$Hctxhi,$Hhi
|| ADDU $Hlo,$Hctxlo,$Hctxlo:$Hlo
|| ADD $Fctxlo,$Fhi,$Fhi
||[!A0] STW $Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
||[!A0] STW $Elo,*${CTXB}[9^.LITTLE_ENDIAN]
ADD $Gctxlo,$Ghi,$Ghi
||[!A0] STW $Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
||[!A0] STW $Flo,*${CTXB}[11^.LITTLE_ENDIAN]
ADD $Hctxlo,$Hhi,$Hhi
||[!A0] STW $Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
||[!A0] STW $Glo,*${CTXB}[13^.LITTLE_ENDIAN]
;;===== branch to outerloop? is taken here
STW $Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
|| STW $Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
|| MVK -40,B0
ADD FP,B0,SP ; destroy circular buffer
|| LDDW *FP[-4],A11:A10
LDDW *SP[2],A13:A12
|| LDDW *FP[-2],B11:B10
LDDW *SP[4],B13:B12
|| BNOP RA
LDW *++SP(40),FP ; restore frame pointer
MVK 0,B0
MVC B0,AMR ; clear AMR
NOP 2 ; wait till FP is committed
.endasmfunc
.if __TI_EABI__
.sect ".text:sha_asm.const"
.else
.sect ".const:sha_asm"
.endif
.align 128
K512:
.uword 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.uword 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.uword 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.uword 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.uword 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.uword 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.uword 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.uword 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.uword 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.uword 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.uword 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.uword 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.uword 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.uword 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.uword 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.uword 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.uword 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.uword 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.uword 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.uword 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.uword 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.uword 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.uword 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.uword 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.uword 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.uword 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.uword 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.uword 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.uword 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.uword 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.uword 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.uword 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.uword 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.uword 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.uword 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.uword 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.uword 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.uword 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.uword 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.uword 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
.cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
print $code;
close STDOUT;

View file

@ -0,0 +1,692 @@
#! /usr/bin/env perl
# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512_Transform for Itanium.
#
# sha512_block runs in 1003 cycles on Itanium 2, which is almost 50%
# faster than gcc and >60%(!) faster than code generated by HP-UX
# compiler (yes, HP-UX is generating slower code, because unlike gcc,
# it failed to deploy "shift right pair," 'shrp' instruction, which
# substitutes for 64-bit rotate).
#
# The 924-cycle sha256_block outperforms gcc by over a factor of 2(!)
# and the HP-UX compiler by >40% (yes, gcc won sha512_block, but lost
# this one big time). Note that "formally" 924 is about 100 cycles
# too much. I mean it's 64 32-bit rounds vs. 80 virtually identical
# 64-bit ones, and 1003*64/80 gives 802. The extra cycles, 2 per round,
# are spent on extra work to provide for 32-bit rotations. 32-bit
# rotations are still handled by the 'shrp' instruction, and for this
# reason the lower 32 bits are deposited into the upper half of the
# 64-bit register prior to 'shrp' issue. In order to minimize the
# amount of such operations, X[16] values are *maintained* with copies
# of their lower halves in their upper halves, which is why you'll spot
# such instructions as the custom 'mux2', "parallel 32-bit add" 'padd4'
# and "parallel 32-bit unsigned right shift" 'pshr4.u' here.
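#
# As a hedged sketch only (not part of the generated code; register
# names are placeholders), the substitution described above looks
# roughly like this:
#
#	shrp	r8=r32,r32,n	// same source twice: 64-bit ROTR(r32,n)
#	mux2	r9=r33,0x44	// copy lower 32 bits into upper half...
#	shrp	r10=r9,r9,n	// ...so the same shrp yields a 32-bit rotate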
#
# Rules of engagement.
#
# There is only one integer shifter meaning that if I have two rotate,
# deposit or extract instructions in adjacent bundles, they shall
# split [at run-time if they have to]. But note that variable and
# parallel shifts are performed by multi-media ALU and *are* pairable
# with rotates [and alike]. On the backside MMALU is rather slow: it
# takes 2 extra cycles before the result of integer operation is
# available *to* MMALU and 2(*) extra cycles before the result of MM
# operation is available "back" *to* integer ALU, not to mention that
# MMALU itself has 2 cycles latency. However! I explicitly scheduled
# these MM instructions to avoid MM stalls, so that all these extra
# latencies get "hidden" in instruction-level parallelism.
#
# (*) 2 cycles on Itanium 1 and 1 cycle on Itanium 2. But I schedule
# for 2 in order to provide for best *overall* performance,
# because on Itanium 1 stall on MM result is accompanied by
# pipeline flush, which takes 6 cycles:-(
#
# June 2012
#
# Improve performance by 15-20%. Note about "rules of engagement"
# above. Contemporary cores are equipped with additional shifter,
# so that they should perform even better than below, presumably
# by ~10%.
#
######################################################################
# Current performance in cycles per processed byte for Itanium 2
# pre-9000 series [little-endian] system:
#
# SHA1(*) 5.7
# SHA256 12.6
# SHA512 6.7
#
# (*) SHA1 result is presented purely for reference purposes.
#
# To generate code, pass the file name with either 256 or 512 in its
# name and compiler flags.
$output=pop;
if ($output =~ /512.*\.[s|asm]/) {
$SZ=8;
$BITS=8*$SZ;
$LDW="ld8";
$STW="st8";
$ADD="add";
$SHRU="shr.u";
$TABLE="K512";
$func="sha512_block_data_order";
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
} elsif ($output =~ /256.*\.[s|asm]/) {
$SZ=4;
$BITS=8*$SZ;
$LDW="ld4";
$STW="st4";
$ADD="padd4";
$SHRU="pshr4.u";
$TABLE="K256";
$func="sha256_block_data_order";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
} else { die "nonsense $output"; }
open STDOUT,">$output" || die "can't open $output: $!";
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
$big_endian=0 if (/\-DL_ENDIAN/); }
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
$code=<<___;
.ident \"$output, version 2.0\"
.ident \"IA-64 ISA artwork by Andy Polyakov <appro\@openssl.org>\"
.explicit
.text
pfssave=r2;
lcsave=r3;
prsave=r14;
K=r15;
A_=r16; B_=r17; C_=r18; D_=r19;
E_=r20; F_=r21; G_=r22; H_=r23;
T1=r24; T2=r25;
s0=r26; s1=r27; t0=r28; t1=r29;
Ktbl=r30;
ctx=r31; // 1st arg
input=r56; // 2nd arg
num=r57; // 3rd arg
sgm0=r58; sgm1=r59; // small constants
// void $func (SHA_CTX *ctx, const void *in,size_t num[,int host])
.global $func#
.proc $func#
.align 32
.skip 16
$func:
.prologue
.save ar.pfs,pfssave
{ .mmi; alloc pfssave=ar.pfs,3,25,0,24
$ADDP ctx=0,r32 // 1st arg
.save ar.lc,lcsave
mov lcsave=ar.lc }
{ .mmi; $ADDP input=0,r33 // 2nd arg
mov num=r34 // 3rd arg
.save pr,prsave
mov prsave=pr };;
.body
{ .mib; add r8=0*$SZ,ctx
add r9=1*$SZ,ctx }
{ .mib; add r10=2*$SZ,ctx
add r11=3*$SZ,ctx };;
// load A-H
.Lpic_point:
{ .mmi; $LDW A_=[r8],4*$SZ
$LDW B_=[r9],4*$SZ
mov Ktbl=ip }
{ .mmi; $LDW C_=[r10],4*$SZ
$LDW D_=[r11],4*$SZ
mov sgm0=$sigma0[2] };;
{ .mmi; $LDW E_=[r8]
$LDW F_=[r9]
add Ktbl=($TABLE#-.Lpic_point),Ktbl }
{ .mmi; $LDW G_=[r10]
$LDW H_=[r11]
cmp.ne p0,p16=0,r0 };;
___
$code.=<<___ if ($BITS==64);
{ .mii; and r8=7,input
and input=~7,input;;
cmp.eq p9,p0=1,r8 }
{ .mmi; cmp.eq p10,p0=2,r8
cmp.eq p11,p0=3,r8
cmp.eq p12,p0=4,r8 }
{ .mmi; cmp.eq p13,p0=5,r8
cmp.eq p14,p0=6,r8
cmp.eq p15,p0=7,r8 };;
___
$code.=<<___;
.L_outer:
.rotr R[8],X[16]
A=R[0]; B=R[1]; C=R[2]; D=R[3]; E=R[4]; F=R[5]; G=R[6]; H=R[7]
{ .mmi; ld1 X[15]=[input],$SZ // eliminated in sha512
mov A=A_
mov ar.lc=14 }
{ .mmi; mov B=B_
mov C=C_
mov D=D_ }
{ .mmi; mov E=E_
mov F=F_
mov ar.ec=2 };;
{ .mmi; mov G=G_
mov H=H_
mov sgm1=$sigma1[2] }
{ .mib; mov r8=0
add r9=1-$SZ,input
brp.loop.imp .L_first16,.L_first16_end-16 };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
// in sha512 case I load whole X[16] at once and take care of alignment...
{ .mmi; add r8=1*$SZ,input
add r9=2*$SZ,input
add r10=3*$SZ,input };;
{ .mmb; $LDW X[15]=[input],4*$SZ
$LDW X[14]=[r8],4*$SZ
(p9) br.cond.dpnt.many .L1byte };;
{ .mmb; $LDW X[13]=[r9],4*$SZ
$LDW X[12]=[r10],4*$SZ
(p10) br.cond.dpnt.many .L2byte };;
{ .mmb; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
(p11) br.cond.dpnt.many .L3byte };;
{ .mmb; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
(p12) br.cond.dpnt.many .L4byte };;
{ .mmb; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
(p13) br.cond.dpnt.many .L5byte };;
{ .mmb; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
(p14) br.cond.dpnt.many .L6byte };;
{ .mmb; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
(p15) br.cond.dpnt.many .L7byte };;
{ .mmb; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L1byte:
{ .mmi; $LDW X[13]=[r9],4*$SZ
$LDW X[12]=[r10],4*$SZ
shrp X[15]=X[15],X[14],56 };;
{ .mmi; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
shrp X[14]=X[14],X[13],56 }
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[13]=X[13],X[12],56 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[12]=X[12],X[11],56 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[11]=X[11],X[10],56 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[10]=X[10],X[ 9],56 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[ 9]=X[ 9],X[ 8],56 };;
{ .mii; $LDW T1=[input]
shrp X[ 8]=X[ 8],X[ 7],56
shrp X[ 7]=X[ 7],X[ 6],56 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],56
shrp X[ 5]=X[ 5],X[ 4],56 };;
{ .mii; shrp X[ 4]=X[ 4],X[ 3],56
shrp X[ 3]=X[ 3],X[ 2],56 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],56
shrp X[ 1]=X[ 1],X[ 0],56 }
{ .mib; shrp X[ 0]=X[ 0],T1,56 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L2byte:
{ .mmi; $LDW X[11]=[input],4*$SZ
$LDW X[10]=[r8],4*$SZ
shrp X[15]=X[15],X[14],48 }
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[14]=X[14],X[13],48 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[13]=X[13],X[12],48 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[12]=X[12],X[11],48 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[11]=X[11],X[10],48 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[10]=X[10],X[ 9],48 };;
{ .mii; $LDW T1=[input]
shrp X[ 9]=X[ 9],X[ 8],48
shrp X[ 8]=X[ 8],X[ 7],48 }
{ .mii; shrp X[ 7]=X[ 7],X[ 6],48
shrp X[ 6]=X[ 6],X[ 5],48 };;
{ .mii; shrp X[ 5]=X[ 5],X[ 4],48
shrp X[ 4]=X[ 4],X[ 3],48 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],48
shrp X[ 2]=X[ 2],X[ 1],48 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],48
shrp X[ 0]=X[ 0],T1,48 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L3byte:
{ .mmi; $LDW X[ 9]=[r9],4*$SZ
$LDW X[ 8]=[r10],4*$SZ
shrp X[15]=X[15],X[14],40 };;
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[14]=X[14],X[13],40 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[13]=X[13],X[12],40 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[12]=X[12],X[11],40 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[11]=X[11],X[10],40 };;
{ .mii; $LDW T1=[input]
shrp X[10]=X[10],X[ 9],40
shrp X[ 9]=X[ 9],X[ 8],40 }
{ .mii; shrp X[ 8]=X[ 8],X[ 7],40
shrp X[ 7]=X[ 7],X[ 6],40 };;
{ .mii; shrp X[ 6]=X[ 6],X[ 5],40
shrp X[ 5]=X[ 5],X[ 4],40 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],40
shrp X[ 3]=X[ 3],X[ 2],40 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],40
shrp X[ 1]=X[ 1],X[ 0],40 }
{ .mib; shrp X[ 0]=X[ 0],T1,40 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L4byte:
{ .mmi; $LDW X[ 7]=[input],4*$SZ
$LDW X[ 6]=[r8],4*$SZ
shrp X[15]=X[15],X[14],32 }
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[14]=X[14],X[13],32 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[13]=X[13],X[12],32 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[12]=X[12],X[11],32 };;
{ .mii; $LDW T1=[input]
shrp X[11]=X[11],X[10],32
shrp X[10]=X[10],X[ 9],32 }
{ .mii; shrp X[ 9]=X[ 9],X[ 8],32
shrp X[ 8]=X[ 8],X[ 7],32 };;
{ .mii; shrp X[ 7]=X[ 7],X[ 6],32
shrp X[ 6]=X[ 6],X[ 5],32 }
{ .mii; shrp X[ 5]=X[ 5],X[ 4],32
shrp X[ 4]=X[ 4],X[ 3],32 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],32
shrp X[ 2]=X[ 2],X[ 1],32 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],32
shrp X[ 0]=X[ 0],T1,32 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L5byte:
{ .mmi; $LDW X[ 5]=[r9],4*$SZ
$LDW X[ 4]=[r10],4*$SZ
shrp X[15]=X[15],X[14],24 };;
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[14]=X[14],X[13],24 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[13]=X[13],X[12],24 };;
{ .mii; $LDW T1=[input]
shrp X[12]=X[12],X[11],24
shrp X[11]=X[11],X[10],24 }
{ .mii; shrp X[10]=X[10],X[ 9],24
shrp X[ 9]=X[ 9],X[ 8],24 };;
{ .mii; shrp X[ 8]=X[ 8],X[ 7],24
shrp X[ 7]=X[ 7],X[ 6],24 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],24
shrp X[ 5]=X[ 5],X[ 4],24 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],24
shrp X[ 3]=X[ 3],X[ 2],24 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],24
shrp X[ 1]=X[ 1],X[ 0],24 }
{ .mib; shrp X[ 0]=X[ 0],T1,24 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L6byte:
{ .mmi; $LDW X[ 3]=[input],4*$SZ
$LDW X[ 2]=[r8],4*$SZ
shrp X[15]=X[15],X[14],16 }
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[14]=X[14],X[13],16 };;
{ .mii; $LDW T1=[input]
shrp X[13]=X[13],X[12],16
shrp X[12]=X[12],X[11],16 }
{ .mii; shrp X[11]=X[11],X[10],16
shrp X[10]=X[10],X[ 9],16 };;
{ .mii; shrp X[ 9]=X[ 9],X[ 8],16
shrp X[ 8]=X[ 8],X[ 7],16 }
{ .mii; shrp X[ 7]=X[ 7],X[ 6],16
shrp X[ 6]=X[ 6],X[ 5],16 }
{ .mii; shrp X[ 5]=X[ 5],X[ 4],16
shrp X[ 4]=X[ 4],X[ 3],16 }
{ .mii; shrp X[ 3]=X[ 3],X[ 2],16
shrp X[ 2]=X[ 2],X[ 1],16 }
{ .mii; shrp X[ 1]=X[ 1],X[ 0],16
shrp X[ 0]=X[ 0],T1,16 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev // eliminated on big-endian
br.many .L_first16 };;
.L7byte:
{ .mmi; $LDW X[ 1]=[r9],4*$SZ
$LDW X[ 0]=[r10],4*$SZ
shrp X[15]=X[15],X[14],8 };;
{ .mii; $LDW T1=[input]
shrp X[14]=X[14],X[13],8
shrp X[13]=X[13],X[12],8 }
{ .mii; shrp X[12]=X[12],X[11],8
shrp X[11]=X[11],X[10],8 };;
{ .mii; shrp X[10]=X[10],X[ 9],8
shrp X[ 9]=X[ 9],X[ 8],8 }
{ .mii; shrp X[ 8]=X[ 8],X[ 7],8
shrp X[ 7]=X[ 7],X[ 6],8 }
{ .mii; shrp X[ 6]=X[ 6],X[ 5],8
shrp X[ 5]=X[ 5],X[ 4],8 }
{ .mii; shrp X[ 4]=X[ 4],X[ 3],8
shrp X[ 3]=X[ 3],X[ 2],8 }
{ .mii; shrp X[ 2]=X[ 2],X[ 1],8
shrp X[ 1]=X[ 1],X[ 0],8 }
{ .mib; shrp X[ 0]=X[ 0],T1,8 }
{ .mib; mov r8=0
mux1 X[15]=X[15],\@rev };; // eliminated on big-endian
.align 32
.L_first16:
{ .mmi; $LDW K=[Ktbl],$SZ
add A=A,r8 // H+=Sigma(0) from the past
_rotr r10=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mmi; and T1=F,E
andcm r8=G,E
(p16) mux1 X[14]=X[14],\@rev };; // eliminated on big-endian
{ .mmi; and T2=A,B
and r9=A,C
_rotr r11=$t1,$Sigma1[1] } // ROTR(e,18)
{ .mmi; xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
and r8=B,C };;
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
.align 32
.L_first16:
{ .mmi; add A=A,r8 // H+=Sigma(0) from the past
add r10=2-$SZ,input
add r11=3-$SZ,input };;
{ .mmi; ld1 r9=[r9]
ld1 r10=[r10]
dep.z $t1=E,32,32 }
{ .mmi; ld1 r11=[r11]
$LDW K=[Ktbl],$SZ
zxt4 E=E };;
{ .mii; or $t1=$t1,E
dep X[15]=X[15],r9,8,8
mux2 $t0=A,0x44 };; // copy lower half to upper
{ .mmi; and T1=F,E
andcm r8=G,E
dep r11=r10,r11,8,8 };;
{ .mmi; and T2=A,B
and r9=A,C
dep X[15]=X[15],r11,16,16 };;
{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch
xor T1=T1,r8 // T1=((e & f) ^ (~e & g))
_rotr r10=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mmi; and r8=B,C
_rotr r11=$t1,$Sigma1[1] };; // ROTR(e,18)
___
$code.=<<___;
{ .mmi; add T1=T1,H // T1=Ch(e,f,g)+h
xor r10=r10,r11
_rotr r11=$t1,$Sigma1[2] } // ROTR(e,41)
{ .mmi; xor T2=T2,r9
add K=K,X[15] };;
{ .mmi; add T1=T1,K // T1+=K[i]+X[i]
xor T2=T2,r8 // T2=((a & b) ^ (a & c) ^ (b & c))
_rotr r8=$t0,$Sigma0[0] } // ROTR(a,28)
{ .mmi; xor r11=r11,r10 // Sigma1(e)
_rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
{ .mmi; add T1=T1,r11 // T+=Sigma1(e)
xor r8=r8,r9
_rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39)
{ .mmi; xor r8=r8,r9 // Sigma0(a)
add D=D,T1
mux2 H=X[15],0x44 } // mov H=X[15] in sha512
{ .mib; (p16) add r9=1-$SZ,input // not used in sha512
add X[15]=T1,T2 // H=T1+Maj(a,b,c)
br.ctop.sptk .L_first16 };;
.L_first16_end:
{ .mib; mov ar.lc=$rounds-17
brp.loop.imp .L_rest,.L_rest_end-16 }
{ .mib; mov ar.ec=1
br.many .L_rest };;
.align 32
.L_rest:
{ .mmi; $LDW K=[Ktbl],$SZ
add A=A,r8 // H+=Sigma0(a) from the past
_rotr r8=X[15-1],$sigma0[0] } // ROTR(s0,1)
{ .mmi; add X[15]=X[15],X[15-9] // X[i&0xF]+=X[(i+9)&0xF]
$SHRU s0=X[15-1],sgm0 };; // s0=X[(i+1)&0xF]>>7
{ .mib; and T1=F,E
_rotr r9=X[15-1],$sigma0[1] } // ROTR(s0,8)
{ .mib; andcm r10=G,E
$SHRU s1=X[15-14],sgm1 };; // s1=X[(i+14)&0xF]>>6
// Pair of mmi; splits on Itanium 1 and prevents pipeline flush
// upon $SHRU output usage
{ .mmi; xor T1=T1,r10 // T1=((e & f) ^ (~e & g))
xor r9=r8,r9
_rotr r10=X[15-14],$sigma1[0] }// ROTR(s1,19)
{ .mmi; and T2=A,B
and r8=A,C
_rotr r11=X[15-14],$sigma1[1] };;// ROTR(s1,61)
___
$t0="t0", $t1="t1", $code.=<<___ if ($BITS==32);
{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
dep.z $t1=E,32,32 }
{ .mib; xor r10=r11,r10
zxt4 E=E };;
{ .mii; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
shrp r9=E,$t1,32+$Sigma1[0] // ROTR(e,14)
mux2 $t0=A,0x44 };; // copy lower half to upper
// Pair of mmi; splits on Itanium 1 and prevents pipeline flush
// upon mux2 output usage
{ .mmi; xor T2=T2,r8
shrp r8=E,$t1,32+$Sigma1[1]} // ROTR(e,18)
{ .mmi; and r10=B,C
add T1=T1,H // T1=Ch(e,f,g)+h
or $t1=$t1,E };;
___
$t0="A", $t1="E", $code.=<<___ if ($BITS==64);
{ .mib; xor s0=s0,r9 // s0=sigma0(X[(i+1)&0xF])
_rotr r9=$t1,$Sigma1[0] } // ROTR(e,14)
{ .mib; xor r10=r11,r10
xor T2=T2,r8 };;
{ .mib; xor s1=s1,r10 // s1=sigma1(X[(i+14)&0xF])
_rotr r8=$t1,$Sigma1[1] } // ROTR(e,18)
{ .mib; and r10=B,C
add T1=T1,H };; // T1+=H
___
$code.=<<___;
{ .mib; xor r9=r9,r8
_rotr r8=$t1,$Sigma1[2] } // ROTR(e,41)
{ .mib; xor T2=T2,r10 // T2=((a & b) ^ (a & c) ^ (b & c))
add X[15]=X[15],s0 };; // X[i]+=sigma0(X[i+1])
{ .mmi; xor r9=r9,r8 // Sigma1(e)
add X[15]=X[15],s1 // X[i]+=sigma1(X[i+14])
_rotr r8=$t0,$Sigma0[0] };; // ROTR(a,28)
{ .mmi; add K=K,X[15]
add T1=T1,r9 // T1+=Sigma1(e)
_rotr r9=$t0,$Sigma0[1] };; // ROTR(a,34)
{ .mmi; add T1=T1,K // T1+=K[i]+X[i]
xor r8=r8,r9
_rotr r9=$t0,$Sigma0[2] };; // ROTR(a,39)
{ .mib; add D=D,T1
mux2 H=X[15],0x44 } // mov H=X[15] in sha512
{ .mib; xor r8=r8,r9 // Sigma0(a)
add X[15]=T1,T2 // H=T1+Maj(a,b,c)
br.ctop.sptk .L_rest };;
.L_rest_end:
{ .mmi; add A=A,r8 };; // H+=Sigma0(a) from the past
{ .mmi; add A_=A_,A
add B_=B_,B
add C_=C_,C }
{ .mmi; add D_=D_,D
add E_=E_,E
cmp.ltu p16,p0=1,num };;
{ .mmi; add F_=F_,F
add G_=G_,G
add H_=H_,H }
{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl
(p16) add num=-1,num
(p16) br.dptk.many .L_outer };;
{ .mib; add r8=0*$SZ,ctx
add r9=1*$SZ,ctx }
{ .mib; add r10=2*$SZ,ctx
add r11=3*$SZ,ctx };;
{ .mmi; $STW [r8]=A_,4*$SZ
$STW [r9]=B_,4*$SZ
mov ar.lc=lcsave }
{ .mmi; $STW [r10]=C_,4*$SZ
$STW [r11]=D_,4*$SZ
mov pr=prsave,0x1ffff };;
{ .mmb; $STW [r8]=E_
$STW [r9]=F_ }
{ .mmb; $STW [r10]=G_
$STW [r11]=H_
br.ret.sptk.many b0 };;
.endp $func#
___
foreach(split($/,$code)) {
s/\`([^\`]*)\`/eval $1/gem;
s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm;
if ($BITS==64) {
s/mux2(\s+)([^=]+)=([^,]+),\S+/mov$1 $2=$3/gm;
s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian);
s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm
if (!$big_endian);
s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm;
}
print $_,"\n";
}
print<<___ if ($BITS==32);
.align 64
.type K256#,\@object
K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
data4 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
data4 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
data4 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
data4 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
data4 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
data4 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
data4 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
data4 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
data4 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
data4 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
data4 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
data4 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
data4 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size K256#,$SZ*$rounds
stringz "SHA256 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___
print<<___ if ($BITS==64);
.align 64
.type K512#,\@object
K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd
data8 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
data8 0x3956c25bf348b538,0x59f111f1b605d019
data8 0x923f82a4af194f9b,0xab1c5ed5da6d8118
data8 0xd807aa98a3030242,0x12835b0145706fbe
data8 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
data8 0x72be5d74f27b896f,0x80deb1fe3b1696b1
data8 0x9bdc06a725c71235,0xc19bf174cf692694
data8 0xe49b69c19ef14ad2,0xefbe4786384f25e3
data8 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
data8 0x2de92c6f592b0275,0x4a7484aa6ea6e483
data8 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
data8 0x983e5152ee66dfab,0xa831c66d2db43210
data8 0xb00327c898fb213f,0xbf597fc7beef0ee4
data8 0xc6e00bf33da88fc2,0xd5a79147930aa725
data8 0x06ca6351e003826f,0x142929670a0e6e70
data8 0x27b70a8546d22ffc,0x2e1b21385c26c926
data8 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
data8 0x650a73548baf63de,0x766a0abb3c77b2a8
data8 0x81c2c92e47edaee6,0x92722c851482353b
data8 0xa2bfe8a14cf10364,0xa81a664bbc423001
data8 0xc24b8b70d0f89791,0xc76c51a30654be30
data8 0xd192e819d6ef5218,0xd69906245565a910
data8 0xf40e35855771202a,0x106aa07032bbd1b8
data8 0x19a4c116b8d2d0c8,0x1e376c085141ab53
data8 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
data8 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
data8 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
data8 0x748f82ee5defb2fc,0x78a5636f43172f60
data8 0x84c87814a1f0ab72,0x8cc702081a6439ec
data8 0x90befffa23631e28,0xa4506cebde82bde9
data8 0xbef9a3f7b2c67915,0xc67178f2e372532b
data8 0xca273eceea26619c,0xd186b8c721c0c207
data8 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
data8 0x06f067aa72176fba,0x0a637dc5a2c898a6
data8 0x113f9804bef90dae,0x1b710b35131c471b
data8 0x28db77f523047d84,0x32caab7b40c72493
data8 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.size K512#,$SZ*$rounds
stringz "SHA512 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___


@ -0,0 +1,521 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA2 block procedures for MIPS.
# October 2010.
#
# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
# for now can only be compiled for MIPS64 ISA] improvement is a modest
# ~17%, but it comes for free, because it's the same instruction sequence.
# Improvement coefficients are for aligned input.
# September 2012.
#
# Add MIPS[32|64]R2 code (>25% fewer instructions).
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange the code in an
# ABI-neutral manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
# excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_LA="dla";
$PTR_ADD="daddu"; # incidentally works even on n32
$PTR_SUB="dsubu"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
$PTR_LA="la";
$PTR_ADD="addu";
$PTR_SUB="subu";
$REG_S="sw";
$REG_L="lw";
$PTR_SLL="sll";
$SZREG=4;
}
$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
#
# <appro@openssl.org>
#
######################################################################
$big_endian=(`echo MIPSEB | $ENV{CC} -E -`=~/MIPSEB/)?0:1 if ($ENV{CC});
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
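#
# Note on the fallback: pack('N',1) always produces the big-endian byte
# sequence 00 00 00 01, while unpack('L',...) reads those four bytes back in
# the *build host's* native order, so the round trip yields 1 only on a
# big-endian machine. A minimal stand-alone sketch of the same probe
# (illustrative name only):
#
#	my $host_is_big_endian = (unpack('L', pack('N', 1)) == 1);
#
# This describes the build host rather than the target, which is why the
# compiler-based MIPSEB probe above takes precedence when $ENV{CC} is set.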
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="ld"; # load from memory
$ST="sd"; # store to memory
$SLL="dsll"; # shift left logical
$SRL="dsrl"; # shift right logical
$ADDU="daddu";
$ROTR="drotr";
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=( 7, 1, 8); # right shift first
@sigma1=( 6,19,61); # right shift first
$lastK=0x817;
$rounds=80;
} else {
$label="256";
$SZ=4;
$LD="lw"; # load from memory
$ST="sw"; # store to memory
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
$ADDU="addu";
$ROTR="rotr";
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 3, 7,18); # right shift first
@sigma1=(10,17,19); # right shift first
$lastK=0x8f2;
$rounds=64;
}
$MSB = $big_endian ? 0 : ($SZ-1);
$LSB = ($SZ-1)&~$MSB;
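#
# $MSB/$LSB are the in-memory byte offsets of the most- and least-significant
# byte of a $SZ-byte word; they parametrize the unaligned ${LD}l/${LD}r load
# pairs below. Worked values:
#
#	big-endian,    $SZ=4:	$MSB=0, $LSB=3	(lwl at +0, lwr at +3)
#	little-endian, $SZ=4:	$MSB=3, $LSB=0	(lwl at +3, lwr at +0)
#	big-endian,    $SZ=8:	$MSB=0, $LSB=7	(ldl at +0, ldr at +7)
#	little-endian, $SZ=8:	$MSB=7, $LSB=0	(ldl at +7, ldr at +0)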
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
@X=map("\$$_",(8..23));
$ctx=$a0;
$inp=$a1;
$len=$a2; $Ktbl=$len;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___ if ($i<15);
#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
${LD} @X[1],`($i+1)*$SZ`($inp)
#else
${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp)
${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp)
#endif
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==4);
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
wsbh @X[0],@X[0] # byte swap($i)
rotr @X[0],@X[0],16
#else
srl $tmp0,@X[0],24 # byte swap($i)
srl $tmp1,@X[0],8
andi $tmp2,@X[0],0xFF00
sll @X[0],@X[0],24
andi $tmp1,0xFF00
sll $tmp2,$tmp2,8
or @X[0],$tmp0
or $tmp1,$tmp2
or @X[0],$tmp1
#endif
___
$code.=<<___ if (!$big_endian && $i<16 && $SZ==8);
#if defined(_MIPS_ARCH_MIPS64R2)
dsbh @X[0],@X[0] # byte swap($i)
dshd @X[0],@X[0]
#else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
and $tmp1,@X[0],$tmp0 # byte swap($i)
dsrl $tmp2,@X[0],24
dsll $tmp1,24
and $tmp2,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
and $tmp2,@X[0],$tmp0
dsrl @X[0],8
dsll $tmp2,8
and @X[0],$tmp0
or $tmp1,$tmp2
or @X[0],$tmp1
dsrl $tmp1,@X[0],32
dsll @X[0],32
or @X[0],$tmp1
#endif
___
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
xor $tmp2,$f,$g # $i
$ROTR $tmp0,$e,@Sigma1[0]
$ADDU $T1,$X[0],$h
$ROTR $tmp1,$e,@Sigma1[1]
and $tmp2,$e
$ROTR $h,$e,@Sigma1[2]
xor $tmp0,$tmp1
$ROTR $tmp1,$a,@Sigma0[0]
xor $tmp2,$g # Ch(e,f,g)
xor $tmp0,$h # Sigma1(e)
$ROTR $h,$a,@Sigma0[1]
$ADDU $T1,$tmp2
$LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
xor $h,$tmp1
$ROTR $tmp1,$a,@Sigma0[2]
$ADDU $T1,$tmp0
and $tmp0,$b,$c
xor $h,$tmp1 # Sigma0(a)
xor $tmp1,$b,$c
#else
$ADDU $T1,$X[0],$h # $i
$SRL $h,$e,@Sigma1[0]
xor $tmp2,$f,$g
$SLL $tmp1,$e,`$SZ*8-@Sigma1[2]`
and $tmp2,$e
$SRL $tmp0,$e,@Sigma1[1]
xor $h,$tmp1
$SLL $tmp1,$e,`$SZ*8-@Sigma1[1]`
xor $h,$tmp0
$SRL $tmp0,$e,@Sigma1[2]
xor $h,$tmp1
$SLL $tmp1,$e,`$SZ*8-@Sigma1[0]`
xor $h,$tmp0
xor $tmp2,$g # Ch(e,f,g)
xor $tmp0,$tmp1,$h # Sigma1(e)
$SRL $h,$a,@Sigma0[0]
$ADDU $T1,$tmp2
$LD $tmp2,`$i*$SZ`($Ktbl) # K[$i]
$SLL $tmp1,$a,`$SZ*8-@Sigma0[2]`
$ADDU $T1,$tmp0
$SRL $tmp0,$a,@Sigma0[1]
xor $h,$tmp1
$SLL $tmp1,$a,`$SZ*8-@Sigma0[1]`
xor $h,$tmp0
$SRL $tmp0,$a,@Sigma0[2]
xor $h,$tmp1
$SLL $tmp1,$a,`$SZ*8-@Sigma0[0]`
xor $h,$tmp0
and $tmp0,$b,$c
xor $h,$tmp1 # Sigma0(a)
xor $tmp1,$b,$c
#endif
$ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer
$ADDU $h,$tmp0
and $tmp1,$a
$ADDU $T1,$tmp2 # +=K[$i]
$ADDU $h,$tmp1 # +=Maj(a,b,c)
$ADDU $d,$T1
$ADDU $h,$T1
___
$code.=<<___ if ($i>=13);
$LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer
___
}
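#
# Reference sketch: the two paths above compute the same FIPS-180 functions;
# the pre-R2 branch merely synthesizes each rotate from an SRL/SLL pair, since
# ROTR(x,n) == (x>>n)|(x<<(32-n)) on 32-bit words. In Perl, SHA-256's
# Sigma1(e) with @Sigma1=(6,11,25) could be cross-checked as (illustrative
# code only, not used by the generator):
#
#	my $rotr = sub { my ($x,$n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff };
#	my $Sigma1_rot = sub { my $e = shift;
#		$rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25) };
#	my $Sigma1_shift = sub { my $e = shift;		# mirrors the #else branch
#		((($e >> 6) | ($e << 26)) ^ (($e >> 11) | ($e << 21)) ^
#		 (($e >> 25) | ($e <<  7))) & 0xffffffff };
#	# both subs agree for any 32-bit $e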
sub BODY_16_XX {
my $i=@_[0];
my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
$code.=<<___;
#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
$SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
$ROTR $tmp0,@X[1],@sigma0[1]
$ADDU @X[0],@X[9] # +=X[i+9]
xor $tmp2,$tmp0
$ROTR $tmp0,@X[1],@sigma0[2]
$SRL $tmp3,@X[14],@sigma1[0]
$ROTR $tmp1,@X[14],@sigma1[1]
xor $tmp2,$tmp0 # sigma0(X[i+1])
$ROTR $tmp0,@X[14],@sigma1[2]
xor $tmp3,$tmp1
$ADDU @X[0],$tmp2
#else
$SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i)
$ADDU @X[0],@X[9] # +=X[i+9]
$SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]`
$SRL $tmp0,@X[1],@sigma0[1]
xor $tmp2,$tmp1
$SLL $tmp1,`@sigma0[2]-@sigma0[1]`
xor $tmp2,$tmp0
$SRL $tmp0,@X[1],@sigma0[2]
xor $tmp2,$tmp1
$SRL $tmp3,@X[14],@sigma1[0]
xor $tmp2,$tmp0 # sigma0(X[i+1])
$SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]`
$ADDU @X[0],$tmp2
$SRL $tmp0,@X[14],@sigma1[1]
xor $tmp3,$tmp1
$SLL $tmp1,`@sigma1[2]-@sigma1[1]`
xor $tmp3,$tmp0
$SRL $tmp0,@X[14],@sigma1[2]
xor $tmp3,$tmp1
#endif
xor $tmp3,$tmp0 # sigma1(X[i+14])
$ADDU @X[0],$tmp3
___
&BODY_00_15(@_);
}
$FRAMESIZE=16*$SZ+16*$SZREG;
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000";
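#
# The frame holds the 16-entry message-schedule ring buffer plus 16
# register-sized slots (callee-saved registers and the saved end-of-input
# pointer). For example:
#
#	SHA-256, o32 ($SZ=4, $SZREG=4):	$FRAMESIZE = 16*4 + 16*4 = 128 bytes
#	SHA-512, n64 ($SZ=8, $SZREG=8):	$FRAMESIZE = 16*8 + 16*8 = 256 bytes
#
# $SAVED_REGS_MASK is the .mask bit vector that advertises which registers
# the prologue below actually stores.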
$code.=<<___;
#include "mips_arch.h"
.text
.set noat
#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
.option pic2
#endif
.align 5
.globl sha${label}_block_data_order
.ent sha${label}_block_data_order
sha${label}_block_data_order:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
___
$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
.cpload $pf
___
$code.=<<___;
$PTR_SUB $sp,$FRAMESIZE
$REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
$REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
$REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
$REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
$REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
$REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
$REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
$REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
$REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
$REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
$REG_S $s3,$FRAMESIZE-11*$SZREG($sp)
$REG_S $s2,$FRAMESIZE-12*$SZREG($sp)
$REG_S $s1,$FRAMESIZE-13*$SZREG($sp)
$REG_S $s0,$FRAMESIZE-14*$SZREG($sp)
$REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
$PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Ktbl
.cpsetup $pf,$zero,sha${label}_block_data_order
___
$code.=<<___;
.set reorder
$PTR_LA $Ktbl,K${label} # PIC-ified 'load address'
$LD $A,0*$SZ($ctx) # load context
$LD $B,1*$SZ($ctx)
$LD $C,2*$SZ($ctx)
$LD $D,3*$SZ($ctx)
$LD $E,4*$SZ($ctx)
$LD $F,5*$SZ($ctx)
$LD $G,6*$SZ($ctx)
$LD $H,7*$SZ($ctx)
$PTR_ADD @X[15],$inp # pointer to the end of input
$REG_S @X[15],16*$SZ($sp)
b .Loop
.align 5
.Loop:
#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
${LD} @X[0],($inp)
#else
${LD}l @X[0],$MSB($inp)
${LD}r @X[0],$LSB($inp)
#endif
___
for ($i=0;$i<16;$i++)
{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
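# Instead of shuffling data between registers at the end of each round, the
# generator rotates the *name* lists: unshift(@V,pop(@V)) makes the register
# that held $h (and now carries the freshly computed working value) next
# round's $a, and push(@X,shift(@X)) advances the 16-entry message-schedule
# window so @X[0] always names the current word.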
$code.=<<___;
b .L16_xx
.align 4
.L16_xx:
___
for (;$i<32;$i++)
{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
$code.=<<___;
and @X[6],0xfff
li @X[7],$lastK
.set noreorder
bne @X[6],@X[7],.L16_xx
$PTR_ADD $Ktbl,16*$SZ # Ktbl+=16
$REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input
$LD @X[0],0*$SZ($ctx)
$LD @X[1],1*$SZ($ctx)
$LD @X[2],2*$SZ($ctx)
$PTR_ADD $inp,16*$SZ
$LD @X[3],3*$SZ($ctx)
$ADDU $A,@X[0]
$LD @X[4],4*$SZ($ctx)
$ADDU $B,@X[1]
$LD @X[5],5*$SZ($ctx)
$ADDU $C,@X[2]
$LD @X[6],6*$SZ($ctx)
$ADDU $D,@X[3]
$LD @X[7],7*$SZ($ctx)
$ADDU $E,@X[4]
$ST $A,0*$SZ($ctx)
$ADDU $F,@X[5]
$ST $B,1*$SZ($ctx)
$ADDU $G,@X[6]
$ST $C,2*$SZ($ctx)
$ADDU $H,@X[7]
$ST $D,3*$SZ($ctx)
$ST $E,4*$SZ($ctx)
$ST $F,5*$SZ($ctx)
$ST $G,6*$SZ($ctx)
$ST $H,7*$SZ($ctx)
bne $inp,@X[15],.Loop
$PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl
$REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
$REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
$REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
$REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
$REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
$REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
$REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
$REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
$REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
$REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,$FRAMESIZE-11*$SZREG($sp)
$REG_L $s2,$FRAMESIZE-12*$SZREG($sp)
$REG_L $s1,$FRAMESIZE-13*$SZREG($sp)
$REG_L $s0,$FRAMESIZE-14*$SZREG($sp)
$REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
.end sha${label}_block_data_order
.rdata
.align 5
K${label}:
___
if ($SZ==4) {
$code.=<<___;
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
.dword 0x428a2f98d728ae22, 0x7137449123ef65cd
.dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
.dword 0x3956c25bf348b538, 0x59f111f1b605d019
.dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
.dword 0xd807aa98a3030242, 0x12835b0145706fbe
.dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
.dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
.dword 0x9bdc06a725c71235, 0xc19bf174cf692694
.dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
.dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
.dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
.dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
.dword 0x983e5152ee66dfab, 0xa831c66d2db43210
.dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4
.dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725
.dword 0x06ca6351e003826f, 0x142929670a0e6e70
.dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926
.dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
.dword 0x650a73548baf63de, 0x766a0abb3c77b2a8
.dword 0x81c2c92e47edaee6, 0x92722c851482353b
.dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001
.dword 0xc24b8b70d0f89791, 0xc76c51a30654be30
.dword 0xd192e819d6ef5218, 0xd69906245565a910
.dword 0xf40e35855771202a, 0x106aa07032bbd1b8
.dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
.dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
.dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
.dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
.dword 0x748f82ee5defb2fc, 0x78a5636f43172f60
.dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec
.dword 0x90befffa23631e28, 0xa4506cebde82bde9
.dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b
.dword 0xca273eceea26619c, 0xd186b8c721c0c207
.dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
.dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6
.dword 0x113f9804bef90dae, 0x1b710b35131c471b
.dword 0x28db77f523047d84, 0x32caab7b40c72493
.dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
.dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
.dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
___
}
$code.=<<___;
.asciiz "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;


@ -0,0 +1,807 @@
#! /usr/bin/env perl
# Copyright 2009-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 block procedure for PA-RISC.
# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by the vendor compiler, this
# implementation is almost 70% faster in a 64-bit build, but delivers
# virtually the same performance in a 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects whether
# the code is executed on a PA-RISC 2.0 processor and switches to a 64-bit
# code path, delivering adequate performance even in a "blended" 32-bit
# build. Though the 64-bit code is not any faster than code generated by
# the vendor compiler on PA-8600...
#
# Special thanks to polarhome.com for providing HP-UX account.
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
} else {
$LEVEL ="1.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
}
if ($output =~ /512/) {
$func="sha512_block_data_order";
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$LAST10BITS=0x017;
$LD="ldd";
$LDM="ldd,ma";
$ST="std";
} else {
$func="sha256_block_data_order";
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$LAST10BITS=0x0f2;
$LD="ldw";
$LDM="ldwm";
$ST="stw";
}
$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
# [+ argument transfer]
$XOFF=16*$SZ+32; # local variables
$FRAME+=$XOFF;
$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
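#
# Worked numbers for the two most common configurations:
#	64-bit SHA-512:	$FRAME = (16*8+80) + (16*8+32) = 368,	$XOFF = 160+80 = 240
#	32-bit SHA-256:	$FRAME = (16*4+48) + (16*4+32) = 208,	$XOFF =  96+48 = 144
# i.e. the frame holds the 16 saved registers, the ABI frame marker and the
# local X[] storage, with $XOFF giving the offset from %sp back to that
# storage.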
$ctx="%r26"; # zapped by $a0
$inp="%r25"; # zapped by $a1
$num="%r24"; # zapped by $t0
$a0 ="%r26";
$a1 ="%r25";
$t0 ="%r24";
$t1 ="%r29";
$Tbl="%r31";
@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
_ror $e,$Sigma1[0],$a0
and $f,$e,$t0
_ror $e,$Sigma1[1],$a1
addl $t1,$h,$h
andcm $g,$e,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
or $t0,$t1,$t1 ; Ch(e,f,g)
addl @X[$i%16],$h,$h
xor $a0,$a1,$a1 ; Sigma1(e)
addl $t1,$h,$h
_ror $a,$Sigma0[0],$a0
addl $a1,$h,$h
_ror $a,$Sigma0[1],$a1
and $a,$b,$t0
and $a,$c,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
xor $t1,$t0,$t0
and $b,$c,$t1
xor $a0,$a1,$a1 ; Sigma0(a)
addl $h,$d,$d
xor $t1,$t0,$t0 ; Maj(a,b,c)
`"$LDM $SZ($Tbl),$t1" if ($i<15)`
addl $a1,$h,$h
addl $t0,$h,$h
___
}
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
_ror @X[($i+1)%16],$sigma0[0],$a0
_ror @X[($i+1)%16],$sigma0[1],$a1
addl @X[($i+9)%16],@X[$i],@X[$i]
_ror @X[($i+14)%16],$sigma1[0],$t0
_ror @X[($i+14)%16],$sigma1[1],$t1
xor $a1,$a0,$a0
_shr @X[($i+1)%16],$sigma0[2],$a1
xor $t1,$t0,$t0
_shr @X[($i+14)%16],$sigma1[2],$t1
xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
$LDM $SZ($Tbl),$t1
addl $a0,@X[$i],@X[$i]
addl $t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
extru $t1,31,10,$a1
comiclr,<> $LAST10BITS,$a1,%r0
ldo 1($Tbl),$Tbl ; signal end of $Tbl
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.ALIGN 64
L\$table
___
$code.=<<___ if ($SZ==8);
.WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
.WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
.WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
.WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
.WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
.WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
.WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
.WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
.WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
.WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
.WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
.WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
.WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
.WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
.WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
.WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
.WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
.WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
.WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
.WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
.WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
.WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
.WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
.WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
.WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
.WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
.WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
.WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
.WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
.WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
.WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
.WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
.WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
.WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
.WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
.WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
.WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
.WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
.WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
.WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
$code.=<<___ if ($SZ==4);
.WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code.=<<___;
.EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
.ALIGN 64
$func
.PROC
.CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
$PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
$PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
_shl $num,`log(16*$SZ)/log(2)`,$num
addl $inp,$num,$num ; $num to point at the end of $inp
$PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
$PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
blr %r0,$Tbl
ldi 3,$t1
L\$pic
andcm $Tbl,$t1,$Tbl ; wipe privilege level
ldo L\$table-L\$pic($Tbl),$Tbl
___
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
ldi 31,$t1
mtctl $t1,%cr11
extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
b L\$parisc1
nop
___
$code.=<<___;
$LD `0*$SZ`($ctx),$A ; load context
$LD `1*$SZ`($ctx),$B
$LD `2*$SZ`($ctx),$C
$LD `3*$SZ`($ctx),$D
$LD `4*$SZ`($ctx),$E
$LD `5*$SZ`($ctx),$F
$LD `6*$SZ`($ctx),$G
$LD `7*$SZ`($ctx),$H
extru $inp,31,`log($SZ)/log(2)`,$t0
sh3addl $t0,%r0,$t0
subi `8*$SZ`,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop
ldi `$SZ-1`,$t0
$LDM $SZ($Tbl),$t1
andcm $inp,$t0,$t0 ; align $inp
___
for ($i=0;$i<15;$i++) { # load input block
$code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
$LD `$SZ*15`($t0),@X[15]
$LD `$SZ*16`($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align data
$code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
$code.=<<___;
L\$aligned
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
___
for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
___
for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
nop
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
$LD `0*$SZ`($ctx),@X[0] ; load context
$LD `1*$SZ`($ctx),@X[1]
$LD `2*$SZ`($ctx),@X[2]
$LD `3*$SZ`($ctx),@X[3]
$LD `4*$SZ`($ctx),@X[4]
$LD `5*$SZ`($ctx),@X[5]
addl @X[0],$A,$A
$LD `6*$SZ`($ctx),@X[6]
addl @X[1],$B,$B
$LD `7*$SZ`($ctx),@X[7]
ldo `16*$SZ`($inp),$inp ; advance $inp
$ST $A,`0*$SZ`($ctx) ; save context
addl @X[2],$C,$C
$ST $B,`1*$SZ`($ctx)
addl @X[3],$D,$D
$ST $C,`2*$SZ`($ctx)
addl @X[4],$E,$E
$ST $D,`3*$SZ`($ctx)
addl @X[5],$F,$F
$ST $E,`4*$SZ`($ctx)
addl @X[6],$G,$G
$ST $F,`5*$SZ`($ctx)
addl @X[7],$H,$H
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
cmpb,*<>,n $inp,$num,L\$oop
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
___
if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
{{
$code.=<<___;
b L\$done
nop
.ALIGN 64
L\$parisc1
___
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";
@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
$code.=<<___ if (!$flag);
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
___
$code.=<<___;
shd $ehi,$elo,$Sigma1[0],$t0
add $Xlo,$hlo,$hlo
shd $elo,$ehi,$Sigma1[0],$t1
addc $Xhi,$hhi,$hhi ; h += X[i]
shd $ehi,$elo,$Sigma1[1],$t2
ldwm 8($Tbl),$Xhi
shd $elo,$ehi,$Sigma1[1],$t3
ldw -4($Tbl),$Xlo ; load K[i]
xor $t2,$t0,$t0
xor $t3,$t1,$t1
and $flo,$elo,$a0
and $fhi,$ehi,$a1
shd $ehi,$elo,$Sigma1[2],$t2
andcm $glo,$elo,$a2
shd $elo,$ehi,$Sigma1[2],$t3
andcm $ghi,$ehi,$a3
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma1(e)
add $Xlo,$hlo,$hlo
xor $a2,$a0,$a0
addc $Xhi,$hhi,$hhi ; h += K[i]
xor $a3,$a1,$a1 ; Ch(e,f,g)
add $t0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[0],$t0
addc $t1,$hhi,$hhi ; h += Sigma1(e)
shd $alo,$ahi,$Sigma0[0],$t1
add $a0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[1],$t2
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
shd $alo,$ahi,$Sigma0[1],$t3
xor $t2,$t0,$t0
xor $t3,$t1,$t1
shd $ahi,$alo,$Sigma0[2],$t2
and $alo,$blo,$a0
shd $alo,$ahi,$Sigma0[2],$t3
and $ahi,$bhi,$a1
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma0(a)
and $alo,$clo,$a2
and $ahi,$chi,$a3
xor $a2,$a0,$a0
add $hlo,$dlo,$dlo
xor $a3,$a1,$a1
addc $hhi,$dhi,$dhi ; d += h
and $blo,$clo,$a2
add $t0,$hlo,$hlo
and $bhi,$chi,$a3
addc $t1,$hhi,$hhi ; h += Sigma0(a)
xor $a2,$a0,$a0
add $a0,$hlo,$hlo
xor $a3,$a1,$a1 ; Maj(a,b,c)
addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
___
$code.=<<___ if ($i==15 && $flag);
extru $Xlo,31,10,$Xlo
comiclr,= $LAST10BITS,$Xlo,%r0
b L\$rounds_pa1
nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
$code.=<<___;
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
shd $Xnhi,$Xnlo,$sigma0[0],$t0
shd $Xnlo,$Xnhi,$sigma0[0],$t1
add $a0,$Xlo,$Xlo
shd $Xnhi,$Xnlo,$sigma0[1],$t2
addc $a1,$Xhi,$Xhi
shd $Xnlo,$Xnhi,$sigma0[1],$t3
xor $t2,$t0,$t0
shd $Xnhi,$Xnlo,$sigma0[2],$t2
xor $t3,$t1,$t1
extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
xor $t2,$t0,$t0
shd $a3,$a2,$sigma1[0],$a0
xor $t3,$t1,$t1 ; sigma0(X[(i+1)&0x0f])
shd $a2,$a3,$sigma1[0],$a1
add $t0,$Xlo,$Xlo
shd $a3,$a2,$sigma1[1],$t2
addc $t1,$Xhi,$Xhi
shd $a2,$a3,$sigma1[1],$t3
xor $t2,$a0,$a0
shd $a3,$a2,$sigma1[2],$t2
xor $t3,$a1,$a1
extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
xor $t2,$a0,$a0
xor $t3,$a1,$a1 ; sigma1(X[(i+14)&0x0f])
add $a0,$Xlo,$Xlo
addc $a1,$Xhi,$Xhi
stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
$code.=<<___;
ldw `0*4`($ctx),$Ahi ; load context
ldw `1*4`($ctx),$Alo
ldw `2*4`($ctx),$Bhi
ldw `3*4`($ctx),$Blo
ldw `4*4`($ctx),$Chi
ldw `5*4`($ctx),$Clo
ldw `6*4`($ctx),$Dhi
ldw `7*4`($ctx),$Dlo
ldw `8*4`($ctx),$Ehi
ldw `9*4`($ctx),$Elo
ldw `10*4`($ctx),$Fhi
ldw `11*4`($ctx),$Flo
ldw `12*4`($ctx),$Ghi
ldw `13*4`($ctx),$Glo
ldw `14*4`($ctx),$Hhi
ldw `15*4`($ctx),$Hlo
extru $inp,31,2,$t0
sh3addl $t0,%r0,$t0
subi 32,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop_pa1
extru $inp,31,2,$a3
comib,= 0,$a3,L\$aligned_pa1
sub $inp,$a3,$inp
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
vshd $X[0],$X[1],$X[0]
vshd $X[1],$t2,$X[1]
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
vshd $t2,$t3,$t2
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
vshd $t3,$a0,$t3
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
for (;$i<(128/4-1);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
$code.=<<___;
b L\$collected_pa1
stw $t[0],`-$XOFF+$i*4`(%sp)
___
}
$code.=<<___;
L\$aligned_pa1
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
for (;$i<128/4;$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
$code.="L\$collected_pa1\n";
}
for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.=<<___;
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
ldw `0*4`($ctx),$t1 ; update context
ldw `1*4`($ctx),$t0
ldw `2*4`($ctx),$t3
ldw `3*4`($ctx),$t2
ldw `4*4`($ctx),$a1
ldw `5*4`($ctx),$a0
ldw `6*4`($ctx),$a3
add $t0,$Alo,$Alo
ldw `7*4`($ctx),$a2
addc $t1,$Ahi,$Ahi
ldw `8*4`($ctx),$t1
add $t2,$Blo,$Blo
ldw `9*4`($ctx),$t0
addc $t3,$Bhi,$Bhi
ldw `10*4`($ctx),$t3
add $a0,$Clo,$Clo
ldw `11*4`($ctx),$t2
addc $a1,$Chi,$Chi
ldw `12*4`($ctx),$a1
add $a2,$Dlo,$Dlo
ldw `13*4`($ctx),$a0
addc $a3,$Dhi,$Dhi
ldw `14*4`($ctx),$a3
add $t0,$Elo,$Elo
ldw `15*4`($ctx),$a2
addc $t1,$Ehi,$Ehi
stw $Ahi,`0*4`($ctx)
add $t2,$Flo,$Flo
stw $Alo,`1*4`($ctx)
addc $t3,$Fhi,$Fhi
stw $Bhi,`2*4`($ctx)
add $a0,$Glo,$Glo
stw $Blo,`3*4`($ctx)
addc $a1,$Ghi,$Ghi
stw $Chi,`4*4`($ctx)
add $a2,$Hlo,$Hlo
stw $Clo,`5*4`($ctx)
addc $a3,$Hhi,$Hhi
stw $Dhi,`6*4`($ctx)
ldo `16*$SZ`($inp),$inp ; advance $inp
stw $Dlo,`7*4`($ctx)
stw $Ehi,`8*4`($ctx)
stw $Elo,`9*4`($ctx)
stw $Fhi,`10*4`($ctx)
stw $Flo,`11*4`($ctx)
stw $Ghi,`12*4`($ctx)
stw $Glo,`13*4`($ctx)
stw $Hhi,`14*4`($ctx)
comb,= $inp,$num,L\$done
stw $Hlo,`15*4`($ctx)
b L\$oop_pa1
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
L\$done
___
}}
$code.=<<___;
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
$POP `-$FRAME+14*$SIZE_T`(%sp),%r17
$POP `-$FRAME+15*$SIZE_T`(%sp),%r18
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode the PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this if the GNU assembler understood the .ALLOW 2.0
# directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
{ my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
$opcode|=(1<<3) if ($mod =~ /^,m/);
$opcode|=(1<<2) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
{ sprintf "\t.WORD\t0x%08x\t; %s",
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
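#
# assemble() looks the mnemonic up among the closures above ($ldd, $std,
# $extrd, $shrpd); if one exists, the instruction is emitted as a raw .WORD so
# the module still assembles under .LEVEL 1.0, otherwise the line is passed
# through untouched. Illustrative call (hypothetical operands), per the
# format-3 encoder above:
#
#	assemble("ldd", "", "0(%r26),%r8");
#	# returns "\t.WORD\t0x53480000\t; ldd\t0(%r26),%r8"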
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler/) {
$gnuas = 1;
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
: sprintf("shd\t%$1,%$2,%d",$3)/e or
# translate made up instructions: _ror, _shr, _align, _shl
s/_ror(\s+)(%r[0-9]+),/
($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
s/_shr(\s+%r[0-9]+),([0-9]+),/
$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
s/_align(\s+%r[0-9]+,%r[0-9]+),/
($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
s/_shl(\s+%r[0-9]+),([0-9]+),/
$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
s/cmpb,\*/comb,/ if ($SIZE_T==4);
s/\bbv\b/bve/ if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,799 @@
#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# I let hardware handle unaligned input, except on page boundaries
# (see below for details). Otherwise it's a straightforward implementation
# with the X vector in the register bank.
# sha256 | sha512
# -m64 -m32 | -m64 -m32
# --------------------------------------+-----------------------
# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
# Power6,xlc-7 +150% +90% | +100% +430%(*)
#
# (*) 64-bit code in 32-bit application context, which actually is
# on TODO list. It should be noted that for safe deployment in
# 32-bit *multi-threaded* context asynchronous signals should be
# blocked upon entry to SHA512 block routine. This is because
# 32-bit signaling procedure invalidates upper halves of GPRs.
# Context switch procedure preserves them, but not signaling:-(
# The second version is truly multi-thread safe. The trouble with the original
# version was that it used the thread-local storage pointer register.
# It scrupulously preserved it, but a problem would arise the moment an
# asynchronous signal was delivered and the signal handler dereferenced the
# TLS pointer. While this never happens in the openssl application or test
# suite, we have to respect this scenario and not use the TLS pointer
# register. The alternative would be to require the caller to block signals
# prior to calling this routine. For the record, in 32-bit context R2 serves
# as the TLS pointer, while in 64-bit context it is R13.
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$UCMP="cmpld";
$SHL="sldi";
$POP="ld";
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$UCMP="cmplw";
$SHL="slwi";
$POP="lwz";
$PUSH="stw";
} else { die "nonsense $flavour"; }
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
if ($output =~ /512/) {
$func="sha512_block_ppc";
$SZ=8;
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=(1, 8, 7);
@sigma1=(19,61, 6);
$rounds=80;
$LD="ld";
$ST="std";
$ROR="rotrdi";
$SHR="srdi";
} else {
$func="sha256_block_ppc";
$SZ=4;
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
$LD="lwz";
$ST="stw";
$ROR="rotrwi";
$SHR="srwi";
}
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;
$sp ="r1";
$toc="r2";
$ctx="r3"; # zapped by $a0
$inp="r4"; # zapped by $a1
$num="r5"; # zapped by $t0
$T ="r0";
$a0 ="r3";
$a1 ="r4";
$t0 ="r5";
$t1 ="r6";
$Tbl="r7";
$A ="r8";
$B ="r9";
$C ="r10";
$D ="r11";
$E ="r12";
$F =$t1; $t1 = "r0"; # stay away from "r13";
$G ="r14";
$H ="r15";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
"r24","r25","r26","r27","r28","r29","r30","r31");
$inp="r31" if($SZ==4 || $SIZE_T==8); # reassigned $inp! aliases with @X[15]
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
$ROR $a0,$e,$Sigma1[0]
$ROR $a1,$e,$Sigma1[1]
and $t0,$f,$e
xor $a0,$a0,$a1
add $h,$h,$t1
andc $t1,$g,$e
$ROR $a1,$a1,`$Sigma1[2]-$Sigma1[1]`
or $t0,$t0,$t1 ; Ch(e,f,g)
add $h,$h,@X[$i%16]
xor $a0,$a0,$a1 ; Sigma1(e)
add $h,$h,$t0
add $h,$h,$a0
$ROR $a0,$a,$Sigma0[0]
$ROR $a1,$a,$Sigma0[1]
and $t0,$a,$b
and $t1,$a,$c
xor $a0,$a0,$a1
$ROR $a1,$a1,`$Sigma0[2]-$Sigma0[1]`
xor $t0,$t0,$t1
and $t1,$b,$c
xor $a0,$a0,$a1 ; Sigma0(a)
add $d,$d,$h
xor $t0,$t0,$t1 ; Maj(a,b,c)
___
$code.=<<___ if ($i<15);
$LD $t1,`($i+1)*$SZ`($Tbl)
___
$code.=<<___;
add $h,$h,$a0
add $h,$h,$t0
___
}
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
$ROR $a0,@X[($i+1)%16],$sigma0[0]
$ROR $a1,@X[($i+1)%16],$sigma0[1]
$ROR $t0,@X[($i+14)%16],$sigma1[0]
$ROR $t1,@X[($i+14)%16],$sigma1[1]
xor $a0,$a0,$a1
$SHR $a1,@X[($i+1)%16],$sigma0[2]
xor $t0,$t0,$t1
$SHR $t1,@X[($i+14)%16],$sigma1[2]
add @X[$i],@X[$i],@X[($i+9)%16]
xor $a0,$a0,$a1 ; sigma0(X[(i+1)&0x0f])
xor $t0,$t0,$t1 ; sigma1(X[(i+14)&0x0f])
$LD $t1,`$i*$SZ`($Tbl)
add @X[$i],@X[$i],$a0
add @X[$i],@X[$i],$t0
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
$code=<<___;
.machine "any"
.text
.globl $func
.align 6
$func:
$STU $sp,-$FRAME($sp)
mflr r0
$SHL $num,$num,`log(16*$SZ)/log(2)`
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
___
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
$LD $D,`3*$SZ`($ctx)
$LD $E,`4*$SZ`($ctx)
$LD $F,`5*$SZ`($ctx)
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
___
} else {
for ($i=16;$i<32;$i++) {
$code.=<<___;
lwz r$i,`$LITTLE_ENDIAN^(4*($i-16))`($ctx)
___
}
}
$code.=<<___;
bl LPICmeup
LPICedup:
andi. r0,$inp,3
bne Lunaligned
Laligned:
add $num,$inp,$num
$PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
b Ldone
; The PowerPC specification allows an implementation to be ill-behaved
; upon an unaligned access which crosses a page boundary. The "better
; safe than sorry" principle makes me treat it specially. I don't look
; for the particular offending word, but rather for the input block
; which crosses the boundary. Once found, that block is aligned and
; hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
beq Lcross_page
$UCMP $num,$t1
ble Laligned ; didn't cross the page boundary
subfc $num,$t1,$num
add $t1,$inp,$t1
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
; $inp equals to the intermediate end pointer here
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
___
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
___
} else {
$code.=<<___;
addi r12,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r8,0($inp)
lbz r9,1($inp)
lbz r10,2($inp)
lbz r11,3($inp)
addi $inp,$inp,4
stb r8,0(r12)
stb r9,1(r12)
stb r10,2(r12)
stb r11,3(r12)
addi r12,r12,4
bdnz Lmemcpy
___
}
$code.=<<___;
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$LOCALS ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
$POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne Lunaligned
Ldone:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
if ($SZ==4 || $SIZE_T==8) {
$code.=<<___;
.align 4
Lsha2_block_private:
$LD $t1,0($Tbl)
___
for($i=0;$i<16;$i++) {
$code.=<<___ if ($SZ==4 && !$LITTLE_ENDIAN);
lwz @X[$i],`$i*$SZ`($inp)
___
$code.=<<___ if ($SZ==4 && $LITTLE_ENDIAN);
lwz $a0,`$i*$SZ`($inp)
rotlwi @X[$i],$a0,8
rlwimi @X[$i],$a0,24,0,7
rlwimi @X[$i],$a0,24,16,23
___
# 64-bit loads are split to 2x32-bit ones, as CPU can't handle
# unaligned 64-bit loads, only 32-bit ones...
$code.=<<___ if ($SZ==8 && !$LITTLE_ENDIAN);
lwz $t0,`$i*$SZ`($inp)
lwz @X[$i],`$i*$SZ+4`($inp)
insrdi @X[$i],$t0,32,0
___
$code.=<<___ if ($SZ==8 && $LITTLE_ENDIAN);
lwz $a0,`$i*$SZ`($inp)
lwz $a1,`$i*$SZ+4`($inp)
rotlwi $t0,$a0,8
rotlwi @X[$i],$a1,8
rlwimi $t0,$a0,24,0,7
rlwimi @X[$i],$a1,24,0,7
rlwimi $t0,$a0,24,16,23
rlwimi @X[$i],$a1,24,16,23
insrdi @X[$i],$t0,32,0
___
&ROUND_00_15($i,@V);
unshift(@V,pop(@V));
}
$code.=<<___;
li $t0,`$rounds/16-1`
mtctr $t0
.align 4
Lrounds:
addi $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
&ROUND_16_xx($i,@V);
unshift(@V,pop(@V));
}
$code.=<<___;
bdnz Lrounds
$POP $ctx,`$FRAME-$SIZE_T*22`($sp)
$POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
$POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
$LD r16,`0*$SZ`($ctx)
$LD r17,`1*$SZ`($ctx)
$LD r18,`2*$SZ`($ctx)
$LD r19,`3*$SZ`($ctx)
$LD r20,`4*$SZ`($ctx)
$LD r21,`5*$SZ`($ctx)
$LD r22,`6*$SZ`($ctx)
addi $inp,$inp,`16*$SZ` ; advance inp
$LD r23,`7*$SZ`($ctx)
add $A,$A,r16
add $B,$B,r17
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
add $C,$C,r18
$ST $A,`0*$SZ`($ctx)
add $D,$D,r19
$ST $B,`1*$SZ`($ctx)
add $E,$E,r20
$ST $C,`2*$SZ`($ctx)
add $F,$F,r21
$ST $D,`3*$SZ`($ctx)
add $G,$G,r22
$ST $E,`4*$SZ`($ctx)
add $H,$H,r23
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$UCMP $inp,$num
$ST $H,`7*$SZ`($ctx)
bne Lsha2_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size $func,.-$func
___
} else {
########################################################################
# SHA512 for PPC32, X vector is off-loaded to stack...
#
# | sha512
# | -m32
# ----------------------+-----------------------
# PPC74x0,gcc-4.0.1 | +48%
# POWER6,gcc-4.4.6 | +124%(*)
# POWER7,gcc-4.4.6 | +79%(*)
# e300,gcc-4.1.0 | +167%
#
# (*) ~1/3 of -m64 result [and ~20% better than -m32 code generated
# by xlc-12.1]
my $XOFF=$LOCALS;
my @V=map("r$_",(16..31)); # A..H
my ($s0,$s1,$t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("r$_",(0,5,6,8..12,14,15));
my ($x0,$x1)=("r3","r4"); # zaps $ctx and $inp
sub ROUND_00_15_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
$code.=<<___;
lwz $t2,`$SZ*($i%16)+($LITTLE_ENDIAN^4)`($Tbl)
xor $a0,$flo,$glo
lwz $t3,`$SZ*($i%16)+($LITTLE_ENDIAN^0)`($Tbl)
xor $a1,$fhi,$ghi
addc $hlo,$hlo,$t0 ; h+=x[i]
stw $t0,`$XOFF+0+$SZ*($i%16)`($sp) ; save x[i]
srwi $s0,$elo,$Sigma1[0]
srwi $s1,$ehi,$Sigma1[0]
and $a0,$a0,$elo
adde $hhi,$hhi,$t1
and $a1,$a1,$ehi
stw $t1,`$XOFF+4+$SZ*($i%16)`($sp)
srwi $t0,$elo,$Sigma1[1]
srwi $t1,$ehi,$Sigma1[1]
addc $hlo,$hlo,$t2 ; h+=K512[i]
insrwi $s0,$ehi,$Sigma1[0],0
insrwi $s1,$elo,$Sigma1[0],0
xor $a0,$a0,$glo ; Ch(e,f,g)
adde $hhi,$hhi,$t3
xor $a1,$a1,$ghi
insrwi $t0,$ehi,$Sigma1[1],0
insrwi $t1,$elo,$Sigma1[1],0
addc $hlo,$hlo,$a0 ; h+=Ch(e,f,g)
srwi $t2,$ehi,$Sigma1[2]-32
srwi $t3,$elo,$Sigma1[2]-32
xor $s0,$s0,$t0
xor $s1,$s1,$t1
insrwi $t2,$elo,$Sigma1[2]-32,0
insrwi $t3,$ehi,$Sigma1[2]-32,0
xor $a0,$alo,$blo ; a^b, b^c in next round
adde $hhi,$hhi,$a1
xor $a1,$ahi,$bhi
xor $s0,$s0,$t2 ; Sigma1(e)
xor $s1,$s1,$t3
srwi $t0,$alo,$Sigma0[0]
and $a2,$a2,$a0
addc $hlo,$hlo,$s0 ; h+=Sigma1(e)
and $a3,$a3,$a1
srwi $t1,$ahi,$Sigma0[0]
srwi $s0,$ahi,$Sigma0[1]-32
adde $hhi,$hhi,$s1
srwi $s1,$alo,$Sigma0[1]-32
insrwi $t0,$ahi,$Sigma0[0],0
insrwi $t1,$alo,$Sigma0[0],0
xor $a2,$a2,$blo ; Maj(a,b,c)
addc $dlo,$dlo,$hlo ; d+=h
xor $a3,$a3,$bhi
insrwi $s0,$alo,$Sigma0[1]-32,0
insrwi $s1,$ahi,$Sigma0[1]-32,0
adde $dhi,$dhi,$hhi
srwi $t2,$ahi,$Sigma0[2]-32
srwi $t3,$alo,$Sigma0[2]-32
xor $s0,$s0,$t0
addc $hlo,$hlo,$a2 ; h+=Maj(a,b,c)
xor $s1,$s1,$t1
insrwi $t2,$alo,$Sigma0[2]-32,0
insrwi $t3,$ahi,$Sigma0[2]-32,0
adde $hhi,$hhi,$a3
___
$code.=<<___ if ($i>=15);
lwz $t0,`$XOFF+0+$SZ*(($i+2)%16)`($sp)
lwz $t1,`$XOFF+4+$SZ*(($i+2)%16)`($sp)
___
$code.=<<___ if ($i<15 && !$LITTLE_ENDIAN);
lwz $t1,`$SZ*($i+1)+0`($inp)
lwz $t0,`$SZ*($i+1)+4`($inp)
___
$code.=<<___ if ($i<15 && $LITTLE_ENDIAN);
lwz $a2,`$SZ*($i+1)+0`($inp)
lwz $a3,`$SZ*($i+1)+4`($inp)
rotlwi $t1,$a2,8
rotlwi $t0,$a3,8
rlwimi $t1,$a2,24,0,7
rlwimi $t0,$a3,24,0,7
rlwimi $t1,$a2,24,16,23
rlwimi $t0,$a3,24,16,23
___
$code.=<<___;
xor $s0,$s0,$t2 ; Sigma0(a)
xor $s1,$s1,$t3
addc $hlo,$hlo,$s0 ; h+=Sigma0(a)
adde $hhi,$hhi,$s1
___
$code.=<<___ if ($i==15);
lwz $x0,`$XOFF+0+$SZ*(($i+1)%16)`($sp)
lwz $x1,`$XOFF+4+$SZ*(($i+1)%16)`($sp)
___
}
sub ROUND_16_xx_ppc32 {
my ($i, $ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo)=@_;
$code.=<<___;
srwi $s0,$t0,$sigma0[0]
srwi $s1,$t1,$sigma0[0]
srwi $t2,$t0,$sigma0[1]
srwi $t3,$t1,$sigma0[1]
insrwi $s0,$t1,$sigma0[0],0
insrwi $s1,$t0,$sigma0[0],0
srwi $a0,$t0,$sigma0[2]
insrwi $t2,$t1,$sigma0[1],0
insrwi $t3,$t0,$sigma0[1],0
insrwi $a0,$t1,$sigma0[2],0
xor $s0,$s0,$t2
lwz $t2,`$XOFF+0+$SZ*(($i+14)%16)`($sp)
srwi $a1,$t1,$sigma0[2]
xor $s1,$s1,$t3
lwz $t3,`$XOFF+4+$SZ*(($i+14)%16)`($sp)
xor $a0,$a0,$s0
srwi $s0,$t2,$sigma1[0]
xor $a1,$a1,$s1
srwi $s1,$t3,$sigma1[0]
addc $x0,$x0,$a0 ; x[i]+=sigma0(x[i+1])
srwi $a0,$t3,$sigma1[1]-32
insrwi $s0,$t3,$sigma1[0],0
insrwi $s1,$t2,$sigma1[0],0
adde $x1,$x1,$a1
srwi $a1,$t2,$sigma1[1]-32
insrwi $a0,$t2,$sigma1[1]-32,0
srwi $t2,$t2,$sigma1[2]
insrwi $a1,$t3,$sigma1[1]-32,0
insrwi $t2,$t3,$sigma1[2],0
xor $s0,$s0,$a0
lwz $a0,`$XOFF+0+$SZ*(($i+9)%16)`($sp)
srwi $t3,$t3,$sigma1[2]
xor $s1,$s1,$a1
lwz $a1,`$XOFF+4+$SZ*(($i+9)%16)`($sp)
xor $s0,$s0,$t2
addc $x0,$x0,$a0 ; x[i]+=x[i+9]
xor $s1,$s1,$t3
adde $x1,$x1,$a1
addc $x0,$x0,$s0 ; x[i]+=sigma1(x[i+14])
adde $x1,$x1,$s1
___
($t0,$t1,$x0,$x1) = ($x0,$x1,$t0,$t1);
&ROUND_00_15_ppc32(@_);
}
$code.=<<___;
.align 4
Lsha2_block_private:
___
$code.=<<___ if (!$LITTLE_ENDIAN);
lwz $t1,0($inp)
xor $a2,@V[3],@V[5] ; B^C, magic seed
lwz $t0,4($inp)
xor $a3,@V[2],@V[4]
___
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $a1,0($inp)
xor $a2,@V[3],@V[5] ; B^C, magic seed
lwz $a0,4($inp)
xor $a3,@V[2],@V[4]
rotlwi $t1,$a1,8
rotlwi $t0,$a0,8
rlwimi $t1,$a1,24,0,7
rlwimi $t0,$a0,24,0,7
rlwimi $t1,$a1,24,16,23
rlwimi $t0,$a0,24,16,23
___
for($i=0;$i<16;$i++) {
&ROUND_00_15_ppc32($i,@V);
unshift(@V,pop(@V)); unshift(@V,pop(@V));
($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
li $a0,`$rounds/16-1`
mtctr $a0
.align 4
Lrounds:
addi $Tbl,$Tbl,`16*$SZ`
___
for(;$i<32;$i++) {
&ROUND_16_xx_ppc32($i,@V);
unshift(@V,pop(@V)); unshift(@V,pop(@V));
($a0,$a1,$a2,$a3) = ($a2,$a3,$a0,$a1);
}
$code.=<<___;
bdnz Lrounds
$POP $ctx,`$FRAME-$SIZE_T*22`($sp)
$POP $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
$POP $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
subi $Tbl,$Tbl,`($rounds-16)*$SZ` ; rewind Tbl
lwz $t0,`$LITTLE_ENDIAN^0`($ctx)
lwz $t1,`$LITTLE_ENDIAN^4`($ctx)
lwz $t2,`$LITTLE_ENDIAN^8`($ctx)
lwz $t3,`$LITTLE_ENDIAN^12`($ctx)
lwz $a0,`$LITTLE_ENDIAN^16`($ctx)
lwz $a1,`$LITTLE_ENDIAN^20`($ctx)
lwz $a2,`$LITTLE_ENDIAN^24`($ctx)
addc @V[1],@V[1],$t1
lwz $a3,`$LITTLE_ENDIAN^28`($ctx)
adde @V[0],@V[0],$t0
lwz $t0,`$LITTLE_ENDIAN^32`($ctx)
addc @V[3],@V[3],$t3
lwz $t1,`$LITTLE_ENDIAN^36`($ctx)
adde @V[2],@V[2],$t2
lwz $t2,`$LITTLE_ENDIAN^40`($ctx)
addc @V[5],@V[5],$a1
lwz $t3,`$LITTLE_ENDIAN^44`($ctx)
adde @V[4],@V[4],$a0
lwz $a0,`$LITTLE_ENDIAN^48`($ctx)
addc @V[7],@V[7],$a3
lwz $a1,`$LITTLE_ENDIAN^52`($ctx)
adde @V[6],@V[6],$a2
lwz $a2,`$LITTLE_ENDIAN^56`($ctx)
addc @V[9],@V[9],$t1
lwz $a3,`$LITTLE_ENDIAN^60`($ctx)
adde @V[8],@V[8],$t0
stw @V[0],`$LITTLE_ENDIAN^0`($ctx)
stw @V[1],`$LITTLE_ENDIAN^4`($ctx)
addc @V[11],@V[11],$t3
stw @V[2],`$LITTLE_ENDIAN^8`($ctx)
stw @V[3],`$LITTLE_ENDIAN^12`($ctx)
adde @V[10],@V[10],$t2
stw @V[4],`$LITTLE_ENDIAN^16`($ctx)
stw @V[5],`$LITTLE_ENDIAN^20`($ctx)
addc @V[13],@V[13],$a1
stw @V[6],`$LITTLE_ENDIAN^24`($ctx)
stw @V[7],`$LITTLE_ENDIAN^28`($ctx)
adde @V[12],@V[12],$a0
stw @V[8],`$LITTLE_ENDIAN^32`($ctx)
stw @V[9],`$LITTLE_ENDIAN^36`($ctx)
addc @V[15],@V[15],$a3
stw @V[10],`$LITTLE_ENDIAN^40`($ctx)
stw @V[11],`$LITTLE_ENDIAN^44`($ctx)
adde @V[14],@V[14],$a2
stw @V[12],`$LITTLE_ENDIAN^48`($ctx)
stw @V[13],`$LITTLE_ENDIAN^52`($ctx)
stw @V[14],`$LITTLE_ENDIAN^56`($ctx)
stw @V[15],`$LITTLE_ENDIAN^60`($ctx)
addi $inp,$inp,`16*$SZ` ; advance inp
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp)
$UCMP $inp,$num
bne Lsha2_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.size $func,.-$func
___
}
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
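# LPICmeup obtains the address of the constant table position-independently:
# "bcl 20,31,$+4" is an always-taken branch-and-link to the very next
# instruction, used only to capture the current address in the link register;
# mflr copies it out, and the `64-8` below is the distance from the captured
# address (offset 8 within the 64-byte-aligned stub) to the first table entry
# at offset 64.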
$code.=<<___;
.align 6
LPICmeup:
mflr r0
bcl 20,31,\$+4
mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
addi $Tbl,$Tbl,`64-8`
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;


@ -0,0 +1,324 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 block procedures for s390x.
# April 2007.
#
# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
# generated code (must be a bug in compiler, as improvement is
# "pathologically" high, in particular in comparison to other SHA
# modules). But the real twist is that it detects if hardware support
# for SHA256 is available and in such case utilizes it. Then the
# performance can reach >6.5x of assembler one for larger chunks.
#
# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
# January 2009.
#
# Add support for hardware SHA512 and reschedule instructions to
# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
# than software.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". The latter implies that the code
# remains z/Architecture specific. On z990 SHA256 was measured to
# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
$t0="%r0";
$t1="%r1";
$ctx="%r2"; $t2="%r2";
$inp="%r3";
$len="%r4"; # used as index in inner loop
$A="%r5";
$B="%r6";
$C="%r7";
$D="%r8";
$E="%r9";
$F="%r10";
$G="%r11";
$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
$tbl="%r13";
$T1="%r14";
$sp="%r15";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="lg"; # load from memory
$ST="stg"; # store to memory
$ADD="alg"; # add with memory operand
$ROT="rllg"; # rotate left
$SHR="srlg"; # logical right shift [see even at the end]
@Sigma0=(25,30,36);
@Sigma1=(23,46,50);
@sigma0=(56,63, 7);
@sigma1=( 3,45, 6);
$rounds=80;
$kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
} else {
$label="256";
$SZ=4;
$LD="llgf"; # load from memory
$ST="st"; # store to memory
$ADD="al"; # add with memory operand
$ROT="rll"; # rotate left
$SHR="srl"; # logical right shift
@Sigma0=(10,19,30);
@Sigma1=( 7,21,26);
@sigma0=(14,25, 3);
@sigma1=(13,15,10);
$rounds=64;
$kimdfunc=2; # magic function code for kimd instruction
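# (z/Architecture KIMD function codes: 1=SHA-1, 2=SHA-256, 3=SHA-512)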
}
$Func="sha${label}_block_data_order";
$Table="K${label}";
$stdframe=16*$SIZE_T+4*8;
$frame=$stdframe+16*$SZ;
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
$LD $T1,`$i*$SZ`($inp) ### $i
___
$code.=<<___;
$ROT $t0,$e,$Sigma1[0]
$ROT $t1,$e,$Sigma1[1]
lgr $t2,$f
xgr $t0,$t1
$ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
xgr $t2,$g
$ST $T1,`$stdframe+$SZ*($i%16)`($sp)
xgr $t0,$t1 # Sigma1(e)
algr $T1,$h # T1+=h
ngr $t2,$e
lgr $t1,$a
algr $T1,$t0 # T1+=Sigma1(e)
$ROT $h,$a,$Sigma0[0]
xgr $t2,$g # Ch(e,f,g)
$ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
$ROT $t0,$a,$Sigma0[1]
algr $T1,$t2 # T1+=Ch(e,f,g)
ogr $t1,$b
xgr $h,$t0
lgr $t2,$a
ngr $t1,$c
$ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
xgr $h,$t0 # h=Sigma0(a)
ngr $t2,$b
algr $h,$T1 # h+=T1
ogr $t2,$t1 # Maj(a,b,c)
algr $d,$T1 # d+=T1
algr $h,$t2 # h+=Maj(a,b,c)
___
}
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
$LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
$LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
$ROT $t0,$T1,$sigma0[0]
$SHR $T1,$sigma0[2]
$ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
xgr $T1,$t0
$ROT $t0,$t1,$sigma1[0]
xgr $T1,$t2 # sigma0(X[i+1])
$SHR $t1,$sigma1[2]
$ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
xgr $t1,$t0
$ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
$ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
xgr $t1,$t0 # sigma1(X[i+14])
algr $T1,$t1 # +=sigma1(X[i+14])
___
&BODY_00_15(@_);
}
$code.=<<___;
#include "s390x_arch.h"
.text
.align 64
.type $Table,\@object
$Table:
___
$code.=<<___ if ($SZ==4);
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
___
$code.=<<___;
.size $Table,.-$Table
.globl $Func
.type $Func,\@function
$Func:
sllg $len,$len,`log(16*$SZ)/log(2)`
___
$code.=<<___ if ($kimdfunc);
larl %r1,OPENSSL_s390xcap_P
lg %r0,S390X_KIMD(%r1) # check kimd capabilities
tmhh %r0,`0x8000>>$kimdfunc`
jz .Lsoftware
lghi %r0,$kimdfunc
lgr %r1,$ctx
lgr %r2,$inp
lgr %r3,$len
.long 0xb93e0002 # kimd %r0,%r2
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 16
.Lsoftware:
___
$code.=<<___;
lghi %r1,-$frame
la $len,0($len,$inp)
stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
lgr %r0,$sp
la $sp,0(%r1,$sp)
st${g} %r0,0($sp)
larl $tbl,$Table
$LD $A,`0*$SZ`($ctx)
$LD $B,`1*$SZ`($ctx)
$LD $C,`2*$SZ`($ctx)
$LD $D,`3*$SZ`($ctx)
$LD $E,`4*$SZ`($ctx)
$LD $F,`5*$SZ`($ctx)
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
.Lloop:
lghi $len,0
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
aghi $len,`16*$SZ`
lghi $t0,`($rounds-16)*$SZ`
clgr $len,$t0
jne .Lrounds_16_xx
l${g} $ctx,`$frame+2*$SIZE_T`($sp)
la $inp,`16*$SZ`($inp)
$ADD $A,`0*$SZ`($ctx)
$ADD $B,`1*$SZ`($ctx)
$ADD $C,`2*$SZ`($ctx)
$ADD $D,`3*$SZ`($ctx)
$ADD $E,`4*$SZ`($ctx)
$ADD $F,`5*$SZ`($ctx)
$ADD $G,`6*$SZ`($ctx)
$ADD $H,`7*$SZ`($ctx)
$ST $A,`0*$SZ`($ctx)
$ST $B,`1*$SZ`($ctx)
$ST $C,`2*$SZ`($ctx)
$ST $D,`3*$SZ`($ctx)
$ST $E,`4*$SZ`($ctx)
$ST $F,`5*$SZ`($ctx)
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# unlike 32-bit shift 64-bit one takes three arguments
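# e.g. "srlg %r14,7" becomes "srlg %r14,%r14,7"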
$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
print $code;
close STDOUT;

View file

@ -0,0 +1,857 @@
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Hardware SPARC T4 support by David S. Miller
# ====================================================================
# SHA256 performance improvement over compiler generated code varies
# from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
# build]. Just like in SHA1 module I aim to ensure scalability on
# UltraSPARC T1 by packing X[16] to 8 64-bit registers.
# SHA512 on pre-T1 UltraSPARC.
#
# Performance is >75% better than 64-bit code generated by Sun C and
# over 2x than 32-bit code. X[16] resides on stack, but access to it
# is scheduled for L2 latency and staged through 32 least significant
# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
# good [optimal coefficient is 50%].
#
# SHA512 on UltraSPARC T1.
#
# It's not any faster than 64-bit code generated by Sun C 5.8. This is
# because 64-bit code generator has the advantage of using 64-bit
# loads(*) to access X[16], which I consciously traded for 32-/64-bit
# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running four times as many threads as physical cores and that it leaves gcc
# [3.4] behind by over 4x factor! If compared to SHA256, single thread
# performance is only 10% better, but overall throughput for maximum
# amount of threads for given CPU exceeds corresponding one of SHA256
# by 30% [again, optimal coefficient is 50%].
#
# (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
# in-order, i.e. load instruction has to complete prior next
# instruction in given thread is executed, even if the latter is
# not dependent on load result! This means that on T1 two 32-bit
# loads are always slower than one 64-bit load. Once again this
# is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
# 2x32-bit loads can be as fast as 1x64-bit ones.
#
# SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
# which is 9.3x/11.1x faster than software. Multi-process benchmark
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.
$output=pop;
open STDOUT,">$output";
if ($output =~ /512/) {
$label="512";
$SZ=8;
$LD="ldx"; # load from memory
$ST="stx"; # store to memory
$SLL="sllx"; # shift left logical
$SRL="srlx"; # shift right logical
@Sigma0=(28,34,39);
@Sigma1=(14,18,41);
@sigma0=( 7, 1, 8); # right shift first
@sigma1=( 6,19,61); # right shift first
$lastK=0x817;
$rounds=80;
$align=4;
$locals=16*$SZ; # X[16]
$A="%o0";
$B="%o1";
$C="%o2";
$D="%o3";
$E="%o4";
$F="%o5";
$G="%g1";
$H="%o7";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
} else {
$label="256";
$SZ=4;
$LD="ld"; # load from memory
$ST="st"; # store to memory
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 3, 7,18); # right shift first
@sigma1=(10,17,19); # right shift first
$lastK=0x8f2;
$rounds=64;
$align=8;
$locals=0; # X[16] is register resident
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$A="%l0";
$B="%l1";
$C="%l2";
$D="%l3";
$E="%l4";
$F="%l5";
$G="%l6";
$H="%l7";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
}
$T1="%g2";
$tmp0="%g3";
$tmp1="%g4";
$tmp2="%g5";
$ctx="%i0";
$inp="%i1";
$len="%i2";
$Ktbl="%i3";
$tmp31="%i4";
$tmp32="%i5";
########### SHA256
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
if ($i==0) {
$code.=<<___;
ldx [$inp+0],@X[0]
ldx [$inp+16],@X[2]
ldx [$inp+32],@X[4]
ldx [$inp+48],@X[6]
ldx [$inp+8],@X[1]
ldx [$inp+24],@X[3]
subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
ldx [$inp+40],@X[5]
bz,pt %icc,.Laligned
ldx [$inp+56],@X[7]
sllx @X[0],$tmp31,@X[0]
ldx [$inp+64],$T1
___
for($j=0;$j<7;$j++)
{ $code.=<<___;
srlx @X[$j+1],$tmp32,$tmp1
sllx @X[$j+1],$tmp31,@X[$j+1]
or $tmp1,@X[$j],@X[$j]
___
}
$code.=<<___;
srlx $T1,$tmp32,$T1
or $T1,@X[7],@X[7]
.Laligned:
___
}
if ($i&1) {
$code.="\tadd @X[$i/2],$h,$T1\n";
} else {
$code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
}
} if ($SZ==4);
########### SHA512
$Xload = sub {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
$code.=<<___ if ($i==0);
ld [$inp+0],%l0
ld [$inp+4],%l1
ld [$inp+8],%l2
ld [$inp+12],%l3
ld [$inp+16],%l4
ld [$inp+20],%l5
ld [$inp+24],%l6
cmp $tmp31,0
ld [$inp+28],%l7
___
$code.=<<___ if ($i<15);
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
sllx @pair[0],$tmp0,$tmp1
`"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
or @pair[1],$tmp2,$tmp2
`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
add $h,$tmp2,$T1
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
bnz,a,pn %icc,.+8
ld [$inp+128],%l0
___
$code.=<<___ if ($i==15);
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
sllx @pair[0],$tmp0,$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
or @pair[1],$tmp2,$tmp2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
add $h,$tmp2,$T1
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);
########### common
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
if ($i<16) {
&$Xload(@_);
} else {
$code.="\tadd $h,$T1,$T1\n";
}
$code.=<<___;
$SRL $e,@Sigma1[0],$h !! $i
xor $f,$g,$tmp2
$SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
and $e,$tmp2,$tmp2
$SRL $e,@Sigma1[1],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $e,@Sigma1[2],$tmp0
xor $tmp1,$h,$h
$SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
xor $tmp0,$h,$h
xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
xor $tmp1,$h,$tmp0 ! Sigma1(e)
$SRL $a,@Sigma0[0],$h
add $tmp2,$T1,$T1
$LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
$SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
add $tmp0,$T1,$T1
$SRL $a,@Sigma0[1],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $a,@Sigma0[2],$tmp0
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
xor $tmp0,$h,$h
xor $tmp1,$h,$h ! Sigma0(a)
or $a,$b,$tmp0
and $a,$b,$tmp1
and $c,$tmp0,$tmp0
or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
add $tmp2,$T1,$T1 ! +=K[$i]
add $tmp1,$h,$h
add $T1,$d,$d
add $T1,$h,$h
___
}
########### SHA256
$BODY_16_XX = sub {
my $i=@_[0];
my $xi;
if ($i&1) {
$xi=$tmp32;
$code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
} else {
$xi=@X[(($i+1)/2)%8];
}
$code.=<<___;
srl $xi,@sigma0[0],$T1 !! Xupdate($i)
sll $xi,`32-@sigma0[2]`,$tmp1
srl $xi,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srl $xi,@sigma0[2],$tmp0
xor $tmp1,$T1,$T1
___
if ($i&1) {
$xi=@X[(($i+14)/2)%8];
} else {
$xi=$tmp32;
$code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
}
$code.=<<___;
srl $xi,@sigma1[0],$tmp2
xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
sll $xi,`32-@sigma1[2]`,$tmp1
srl $xi,@sigma1[1],$tmp0
xor $tmp1,$tmp2,$tmp2
sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
xor $tmp0,$tmp2,$tmp2
srl $xi,@sigma1[2],$tmp0
xor $tmp1,$tmp2,$tmp2
___
if ($i&1) {
$xi=@X[($i/2)%8];
$code.=<<___;
srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
srl @X[($i/2)%8],0,$tmp0
add $tmp2,$tmp1,$tmp1
add $xi,$T1,$T1 ! +=X[i]
xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
add $tmp1,$T1,$T1
srl $T1,0,$T1
or $T1,@X[($i/2)%8],@X[($i/2)%8]
___
} else {
$xi=@X[(($i+9)/2)%8];
$code.=<<___;
srlx @X[($i/2)%8],32,$tmp1 ! X[i]
xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
add $xi,$T1,$T1 ! +=X[i+9]
add $tmp2,$tmp1,$tmp1
srl @X[($i/2)%8],0,@X[($i/2)%8]
add $tmp1,$T1,$T1
sllx $T1,32,$tmp0
or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
___
}
&BODY_00_15(@_);
} if ($SZ==4);
########### SHA512
$BODY_16_XX = sub {
my $i=@_[0];
my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
$code.=<<___;
sllx %l2,32,$tmp0 !! Xupdate($i)
or %l3,$tmp0,$tmp0
srlx $tmp0,@sigma0[0],$T1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx $tmp0,`64-@sigma0[2]`,$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
srlx $tmp0,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
xor $tmp0,$T1,$T1
srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
xor $tmp1,$T1,$T1
sllx %l6,32,$tmp2
xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
or %l7,$tmp2,$tmp2
srlx $tmp2,@sigma1[0],$tmp1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
sllx $tmp2,`64-@sigma1[2]`,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
srlx $tmp2,@sigma1[1],$tmp2
xor $tmp0,$tmp1,$tmp1
sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
xor $tmp2,$tmp1,$tmp1
srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
xor $tmp0,$tmp1,$tmp1
sllx %l4,32,$tmp0
xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
or %l5,$tmp0,$tmp0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
sllx %l0,32,$tmp2
add $tmp1,$T1,$T1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
or %l1,$tmp2,$tmp2
add $tmp0,$T1,$T1 ! +=X[$i+9]
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
add $tmp2,$T1,$T1 ! +=X[$i]
$ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
&BODY_00_15(@_);
} if ($SZ==8);
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 64
K${label}:
.type K${label},#object
___
if ($SZ==4) {
$code.=<<___;
.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___
} else {
$code.=<<___;
.long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
}
$code.=<<___;
.size K${label},.-K${label}
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl sha${label}_block_data_order
.align 32
sha${label}_block_data_order:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
andcc %g1, CFR_SHA${label}, %g0
be .Lsoftware
nop
___
$code.=<<___ if ($SZ==8); # SHA512
ldd [%o0 + 0x00], %f0 ! load context
ldd [%o0 + 0x08], %f2
ldd [%o0 + 0x10], %f4
ldd [%o0 + 0x18], %f6
ldd [%o0 + 0x20], %f8
ldd [%o0 + 0x28], %f10
andcc %o1, 0x7, %g0
ldd [%o0 + 0x30], %f12
bne,pn %icc, .Lhwunaligned
ldd [%o0 + 0x38], %f14
.Lhwaligned_loop:
ldd [%o1 + 0x00], %f16
ldd [%o1 + 0x08], %f18
ldd [%o1 + 0x10], %f20
ldd [%o1 + 0x18], %f22
ldd [%o1 + 0x20], %f24
ldd [%o1 + 0x28], %f26
ldd [%o1 + 0x30], %f28
ldd [%o1 + 0x38], %f30
ldd [%o1 + 0x40], %f32
ldd [%o1 + 0x48], %f34
ldd [%o1 + 0x50], %f36
ldd [%o1 + 0x58], %f38
ldd [%o1 + 0x60], %f40
ldd [%o1 + 0x68], %f42
ldd [%o1 + 0x70], %f44
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x78], %f46
add %o1, 0x80, %o1
prefetch [%o1 + 63], 20
prefetch [%o1 + 64+63], 20
.word 0x81b02860 ! SHA512
bne,pt SIZE_T_CC, .Lhwaligned_loop
nop
.Lhwfinish:
std %f0, [%o0 + 0x00] ! store context
std %f2, [%o0 + 0x08]
std %f4, [%o0 + 0x10]
std %f6, [%o0 + 0x18]
std %f8, [%o0 + 0x20]
std %f10, [%o0 + 0x28]
std %f12, [%o0 + 0x30]
retl
std %f14, [%o0 + 0x38]
.align 16
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f18
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f20
ldd [%o1 + 0x10], %f22
ldd [%o1 + 0x18], %f24
ldd [%o1 + 0x20], %f26
ldd [%o1 + 0x28], %f28
ldd [%o1 + 0x30], %f30
ldd [%o1 + 0x38], %f32
ldd [%o1 + 0x40], %f34
ldd [%o1 + 0x48], %f36
ldd [%o1 + 0x50], %f38
ldd [%o1 + 0x58], %f40
ldd [%o1 + 0x60], %f42
ldd [%o1 + 0x68], %f44
ldd [%o1 + 0x70], %f46
ldd [%o1 + 0x78], %f48
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x80], %f50
add %o1, 0x80, %o1
prefetch [%o1 + 63], 20
prefetch [%o1 + 64+63], 20
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
faligndata %f26, %f28, %f24
faligndata %f28, %f30, %f26
faligndata %f30, %f32, %f28
faligndata %f32, %f34, %f30
faligndata %f34, %f36, %f32
faligndata %f36, %f38, %f34
faligndata %f38, %f40, %f36
faligndata %f40, %f42, %f38
faligndata %f42, %f44, %f40
faligndata %f44, %f46, %f42
faligndata %f46, %f48, %f44
faligndata %f48, %f50, %f46
.word 0x81b02860 ! SHA512
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f50, %f50, %f18 ! %f18=%f50
ba .Lhwfinish
nop
___
$code.=<<___ if ($SZ==4); # SHA256
ld [%o0 + 0x00], %f0
ld [%o0 + 0x04], %f1
ld [%o0 + 0x08], %f2
ld [%o0 + 0x0c], %f3
ld [%o0 + 0x10], %f4
ld [%o0 + 0x14], %f5
andcc %o1, 0x7, %g0
ld [%o0 + 0x18], %f6
bne,pn %icc, .Lhwunaligned
ld [%o0 + 0x1c], %f7
.Lhwloop:
ldd [%o1 + 0x00], %f8
ldd [%o1 + 0x08], %f10
ldd [%o1 + 0x10], %f12
ldd [%o1 + 0x18], %f14
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
.word 0x81b02840 ! SHA256
bne,pt SIZE_T_CC, .Lhwloop
nop
.Lhwfinish:
st %f0, [%o0 + 0x00] ! store context
st %f1, [%o0 + 0x04]
st %f2, [%o0 + 0x08]
st %f3, [%o0 + 0x0c]
st %f4, [%o0 + 0x10]
st %f5, [%o0 + 0x14]
st %f6, [%o0 + 0x18]
retl
st %f7, [%o0 + 0x1c]
.align 8
.Lhwunaligned:
alignaddr %o1, %g0, %o1
ldd [%o1 + 0x00], %f10
.Lhwunaligned_loop:
ldd [%o1 + 0x08], %f12
ldd [%o1 + 0x10], %f14
ldd [%o1 + 0x18], %f16
ldd [%o1 + 0x20], %f18
ldd [%o1 + 0x28], %f20
ldd [%o1 + 0x30], %f22
ldd [%o1 + 0x38], %f24
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x40], %f26
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20
faligndata %f10, %f12, %f8
faligndata %f12, %f14, %f10
faligndata %f14, %f16, %f12
faligndata %f16, %f18, %f14
faligndata %f18, %f20, %f16
faligndata %f20, %f22, %f18
faligndata %f22, %f24, %f20
faligndata %f24, %f26, %f22
.word 0x81b02840 ! SHA256
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
___
$code.=<<___;
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
and $inp,`$align-1`,$tmp31
sllx $len,`log(16*$SZ)/log(2)`,$len
andn $inp,`$align-1`,$inp
sll $tmp31,3,$tmp31
add $inp,$len,$len
___
$code.=<<___ if ($SZ==8); # SHA512
mov 32,$tmp32
sub $tmp32,$tmp31,$tmp32
___
$code.=<<___;
.Lpic: call .+8
add %o7,K${label}-.Lpic,$Ktbl
$LD [$ctx+`0*$SZ`],$A
$LD [$ctx+`1*$SZ`],$B
$LD [$ctx+`2*$SZ`],$C
$LD [$ctx+`3*$SZ`],$D
$LD [$ctx+`4*$SZ`],$E
$LD [$ctx+`5*$SZ`],$F
$LD [$ctx+`6*$SZ`],$G
$LD [$ctx+`7*$SZ`],$H
.Lloop:
___
for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".L16_xx:\n";
for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
and $tmp2,0xfff,$tmp2
cmp $tmp2,$lastK
bne .L16_xx
add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
___
$code.=<<___ if ($SZ==4); # SHA256
$LD [$ctx+`0*$SZ`],@X[0]
$LD [$ctx+`1*$SZ`],@X[1]
$LD [$ctx+`2*$SZ`],@X[2]
$LD [$ctx+`3*$SZ`],@X[3]
$LD [$ctx+`4*$SZ`],@X[4]
$LD [$ctx+`5*$SZ`],@X[5]
$LD [$ctx+`6*$SZ`],@X[6]
$LD [$ctx+`7*$SZ`],@X[7]
add $A,@X[0],$A
$ST $A,[$ctx+`0*$SZ`]
add $B,@X[1],$B
$ST $B,[$ctx+`1*$SZ`]
add $C,@X[2],$C
$ST $C,[$ctx+`2*$SZ`]
add $D,@X[3],$D
$ST $D,[$ctx+`3*$SZ`]
add $E,@X[4],$E
$ST $E,[$ctx+`4*$SZ`]
add $F,@X[5],$F
$ST $F,[$ctx+`5*$SZ`]
add $G,@X[6],$G
$ST $G,[$ctx+`6*$SZ`]
add $H,@X[7],$H
$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___ if ($SZ==8); # SHA512
ld [$ctx+`0*$SZ+0`],%l0
ld [$ctx+`0*$SZ+4`],%l1
ld [$ctx+`1*$SZ+0`],%l2
ld [$ctx+`1*$SZ+4`],%l3
ld [$ctx+`2*$SZ+0`],%l4
ld [$ctx+`2*$SZ+4`],%l5
ld [$ctx+`3*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`3*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$A,$A
add $tmp1,$B,$B
$ST $A,[$ctx+`0*$SZ`]
sllx %l4,32,$tmp2
$ST $B,[$ctx+`1*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$C,$C
$ST $C,[$ctx+`2*$SZ`]
add $T1,$D,$D
$ST $D,[$ctx+`3*$SZ`]
ld [$ctx+`4*$SZ+0`],%l0
ld [$ctx+`4*$SZ+4`],%l1
ld [$ctx+`5*$SZ+0`],%l2
ld [$ctx+`5*$SZ+4`],%l3
ld [$ctx+`6*$SZ+0`],%l4
ld [$ctx+`6*$SZ+4`],%l5
ld [$ctx+`7*$SZ+0`],%l6
sllx %l0,32,$tmp0
ld [$ctx+`7*$SZ+4`],%l7
sllx %l2,32,$tmp1
or %l1,$tmp0,$tmp0
or %l3,$tmp1,$tmp1
add $tmp0,$E,$E
add $tmp1,$F,$F
$ST $E,[$ctx+`4*$SZ`]
sllx %l4,32,$tmp2
$ST $F,[$ctx+`5*$SZ`]
sllx %l6,32,$T1
or %l5,$tmp2,$tmp2
or %l7,$T1,$T1
add $tmp2,$G,$G
$ST $G,[$ctx+`6*$SZ`]
add $T1,$H,$H
$ST $H,[$ctx+`7*$SZ`]
___
$code.=<<___;
add $inp,`16*$SZ`,$inp ! advance inp
cmp $inp,$len
bne SIZE_T_CC,.Lloop
sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
ret
restore
.type sha${label}_block_data_order,#function
.size sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option to produce a "universal" binary and let
# the programmer detect if the current CPU is VIS capable at run-time.
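# For example, feeding "faligndata %f18,%f20,%f16" through unvis() below
# produces 0x81b00000|16<<25|18<<14|0x048<<5|20, i.e. ".word 0xa1b48914",
# so the module assembles even when the assembler knows nothing about VIS.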
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my $ref,$opf;
my %visopf = ( "faligndata" => 0x048,
"for" => 0x07c );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/);
$_=$1;
if ($1>=32) {
return $ref if ($1&1);
# re-encode for upper double register addressing
$_=($1|$1>>5)&31;
}
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
sub unalignaddr {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my $ref="$mnemonic\t$rs1,$rs2,$rd";
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; }
}
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
$ref;
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
&unvis($1,$2,$3,$4)
/ge;
s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unalignaddr($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,420 @@
#! /usr/bin/env perl
# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 for PowerISA v2.07.
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This module is
# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is the degree of computational resources' utilization. POWER8 is
# a "massively multi-threaded chip" and the difference between single- and
# maximum multi-process benchmark results tells that utilization is
# a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
# for sha1-ppc.pl - 73%. 100% means that the multi-process result equals
# the single-process one, given that all threads end up on the same
# physical core.
#
######################################################################
# Believed-to-be-accurate results in cycles per processed byte [on
# little-endian system]. Numbers in square brackets are for 64-bit
# build of sha512-ppc.pl, presented for reference.
#
# POWER8 POWER9
# SHA256 9.7 [15.8] 11.2 [12.5]
# SHA512 6.1 [10.3] 7.0 [7.9]
$flavour=shift;
$output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$POP="ld";
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$POP="lwz";
$PUSH="stw";
} else { die "nonsense $flavour"; }
$LENDIAN=($flavour=~/le/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
if ($output =~ /512/) {
$bits=512;
$SZ=8;
$sz="d";
$rounds=80;
} else {
$bits=256;
$SZ=4;
$sz="w";
$rounds=64;
}
$func="sha${bits}_block_p8";
$LOCALS=8*$SIZE_T+8*16;
$FRAME=$LOCALS+9*16+6*$SIZE_T;
$sp ="r1";
$toc="r2";
$ctx="r3";
$inp="r4";
$num="r5";
$Tbl="r6";
$idx="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31)));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..19,24..27));
($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;
my $k=($i+2)%8;
$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
lvx_u @X[$i+1],0,$inp ; load X[i] in advance
addi $inp,$inp,16
___
$code.=<<___ if ($i<16 && ($i%(16/$SZ)));
vsldoi @X[$i],@X[$i-1],@X[$i-1],$SZ
___
$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
vperm @X[$i],@X[$i],@X[$i],$lemask
___
$code.=<<___ if ($i>=15);
vshasigma${sz} $Sigma,@X[($j+1)%16],0,0
vaddu${sz}m @X[$j],@X[$j],$Sigma
vshasigma${sz} $Sigma,@X[($j+14)%16],0,15
vaddu${sz}m @X[$j],@X[$j],$Sigma
vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]
___
$code.=<<___;
vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
vsel $Func,$g,$f,$e ; Ch(e,f,g)
vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
vshasigma${sz} $Sigma,$e,1,15 ; Sigma1(e)
vaddu${sz}m $h,$h,$Sigma ; h+=Sigma1(e)
vxor $Func,$a,$b
vsel $Func,$b,$c,$Func ; Maj(a,b,c)
vaddu${sz}m $d,$d,$h ; d+=h
vshasigma${sz} $Sigma,$a,1,0 ; Sigma0(a)
vaddu${sz}m $Sigma,$Sigma,$Func ; Sigma0(a)+Maj(a,b,c)
vaddu${sz}m $h,$h,$Sigma ; h+=Sigma0(a)+Maj(a,b,c)
lvx $Ki,@I[$k],$idx ; load next K[i]
___
$code.=<<___ if ($k == 7);
addi $idx,$idx,0x80
___
}
$code=<<___;
.machine "any"
.text
.globl $func
.align 6
$func:
$STU $sp,-$FRAME($sp)
mflr $lrsave
li r10,`$LOCALS+15`
li r11,`$LOCALS+31`
stvx v24,r10,$sp # ABI says so
addi r10,r10,32
mfspr $vrsave,256
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
li r11,-4096+255 # 0xfffff0ff
stw $vrsave,`$FRAME-6*$SIZE_T-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME-6*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME-5*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME-4*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME-3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME-2*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME-1*$SIZE_T`($sp)
li $x70,0x70
$PUSH $lrsave,`$FRAME+$LRSAVE`($sp)
mtspr 256,r11
bl LPICmeup
addi $offload,$sp,`8*$SIZE_T+15`
___
$code.=<<___ if ($LENDIAN);
li $idx,8
lvsl $lemask,0,$idx
vspltisb $Ki,0x0f
vxor $lemask,$lemask,$Ki
___
$code.=<<___ if ($SZ==4);
lvx_4w $A,$x00,$ctx
lvx_4w $E,$x10,$ctx
vsldoi $B,$A,$A,4 # unpack
vsldoi $C,$A,$A,8
vsldoi $D,$A,$A,12
vsldoi $F,$E,$E,4
vsldoi $G,$E,$E,8
vsldoi $H,$E,$E,12
___
$code.=<<___ if ($SZ==8);
lvx_u $A,$x00,$ctx
lvx_u $C,$x10,$ctx
lvx_u $E,$x20,$ctx
vsldoi $B,$A,$A,8 # unpack
lvx_u $G,$x30,$ctx
vsldoi $D,$C,$C,8
vsldoi $F,$E,$E,8
vsldoi $H,$G,$G,8
___
$code.=<<___;
li r0,`($rounds-16)/16` # inner loop counter
b Loop
.align 5
Loop:
lvx $Ki,$x00,$Tbl
lvx_u @X[0],0,$inp
addi $inp,$inp,16
mr $idx,$Tbl # copy $Tbl
stvx $A,$x00,$offload # offload $A-$H
stvx $B,$x10,$offload
stvx $C,$x20,$offload
stvx $D,$x30,$offload
stvx $E,$x40,$offload
stvx $F,$x50,$offload
stvx $G,$x60,$offload
stvx $H,$x70,$offload
vaddu${sz}m $H,$H,$Ki # h+K[i]
lvx $Ki,$x10,$Tbl
___
for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mtctr r0
b L16_xx
.align 5
L16_xx:
___
for (;$i<32;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bdnz L16_xx
lvx @X[2],$x00,$offload
subic. $num,$num,1
lvx @X[3],$x10,$offload
vaddu${sz}m $A,$A,@X[2]
lvx @X[4],$x20,$offload
vaddu${sz}m $B,$B,@X[3]
lvx @X[5],$x30,$offload
vaddu${sz}m $C,$C,@X[4]
lvx @X[6],$x40,$offload
vaddu${sz}m $D,$D,@X[5]
lvx @X[7],$x50,$offload
vaddu${sz}m $E,$E,@X[6]
lvx @X[8],$x60,$offload
vaddu${sz}m $F,$F,@X[7]
lvx @X[9],$x70,$offload
vaddu${sz}m $G,$G,@X[8]
vaddu${sz}m $H,$H,@X[9]
bne Loop
___
$code.=<<___ if ($SZ==4);
lvx @X[0],$x20,$idx
vperm $A,$A,$B,$Ki # pack the answer
lvx @X[1],$x30,$idx
vperm $E,$E,$F,$Ki
vperm $A,$A,$C,@X[0]
vperm $E,$E,$G,@X[0]
vperm $A,$A,$D,@X[1]
vperm $E,$E,$H,@X[1]
stvx_4w $A,$x00,$ctx
stvx_4w $E,$x10,$ctx
___
$code.=<<___ if ($SZ==8);
vperm $A,$A,$B,$Ki # pack the answer
vperm $C,$C,$D,$Ki
vperm $E,$E,$F,$Ki
vperm $G,$G,$H,$Ki
stvx_u $A,$x00,$ctx
stvx_u $C,$x10,$ctx
stvx_u $E,$x20,$ctx
stvx_u $G,$x30,$ctx
___
$code.=<<___;
addi $offload,$sp,`$LOCALS+15`
mtlr $lrsave
mtspr 256,$vrsave
lvx v24,$x00,$offload # ABI says so
lvx v25,$x10,$offload
lvx v26,$x20,$offload
lvx v27,$x30,$offload
lvx v28,$x40,$offload
lvx v29,$x50,$offload
lvx v30,$x60,$offload
lvx v31,$x70,$offload
$POP r26,`$FRAME-6*$SIZE_T`($sp)
$POP r27,`$FRAME-5*$SIZE_T`($sp)
$POP r28,`$FRAME-4*$SIZE_T`($sp)
$POP r29,`$FRAME-3*$SIZE_T`($sp)
$POP r30,`$FRAME-2*$SIZE_T`($sp)
$POP r31,`$FRAME-1*$SIZE_T`($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,6,3,0
.long 0
.size $func,.-$func
___
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
$code.=<<___;
.align 6
LPICmeup:
mflr r0
bcl 20,31,\$+4
mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
addi $Tbl,$Tbl,`64-8`
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
___
if ($SZ==8) {
local *table = sub {
foreach(@_) { $code.=".quad $_,$_\n"; }
};
table(
"0x428a2f98d728ae22","0x7137449123ef65cd",
"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
"0x3956c25bf348b538","0x59f111f1b605d019",
"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
"0xd807aa98a3030242","0x12835b0145706fbe",
"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
"0x9bdc06a725c71235","0xc19bf174cf692694",
"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
"0x983e5152ee66dfab","0xa831c66d2db43210",
"0xb00327c898fb213f","0xbf597fc7beef0ee4",
"0xc6e00bf33da88fc2","0xd5a79147930aa725",
"0x06ca6351e003826f","0x142929670a0e6e70",
"0x27b70a8546d22ffc","0x2e1b21385c26c926",
"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
"0x650a73548baf63de","0x766a0abb3c77b2a8",
"0x81c2c92e47edaee6","0x92722c851482353b",
"0xa2bfe8a14cf10364","0xa81a664bbc423001",
"0xc24b8b70d0f89791","0xc76c51a30654be30",
"0xd192e819d6ef5218","0xd69906245565a910",
"0xf40e35855771202a","0x106aa07032bbd1b8",
"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
"0x748f82ee5defb2fc","0x78a5636f43172f60",
"0x84c87814a1f0ab72","0x8cc702081a6439ec",
"0x90befffa23631e28","0xa4506cebde82bde9",
"0xbef9a3f7b2c67915","0xc67178f2e372532b",
"0xca273eceea26619c","0xd186b8c721c0c207",
"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
"0x06f067aa72176fba","0x0a637dc5a2c898a6",
"0x113f9804bef90dae","0x1b710b35131c471b",
"0x28db77f523047d84","0x32caab7b40c72493",
"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
$code.=<<___ if (!$LENDIAN);
.quad 0x0001020304050607,0x1011121314151617
___
$code.=<<___ if ($LENDIAN); # quad-swapped
.quad 0x1011121314151617,0x0001020304050607
___
} else {
local *table = sub {
foreach(@_) { $code.=".long $_,$_,$_,$_\n"; }
};
table(
"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
$code.=<<___ if (!$LENDIAN);
.long 0x00010203,0x10111213,0x10111213,0x10111213
.long 0x00010203,0x04050607,0x10111213,0x10111213
.long 0x00010203,0x04050607,0x08090a0b,0x10111213
___
$code.=<<___ if ($LENDIAN); # word-swapped
.long 0x10111213,0x10111213,0x10111213,0x00010203
.long 0x10111213,0x10111213,0x04050607,0x00010203
.long 0x10111213,0x08090a0b,0x04050607,0x00010203
___
}
$code.=<<___;
.asciz "SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

View file

@ -0,0 +1,89 @@
LIBS=../../libcrypto
SOURCE[../../libcrypto]=\
sha1dgst.c sha1_one.c sha256.c sha512.c {- $target{sha1_asm_src} -} \
{- $target{keccak1600_asm_src} -}
GENERATE[sha1-586.s]=asm/sha1-586.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[sha1-586.s]=../perlasm/x86asm.pl
GENERATE[sha256-586.s]=asm/sha256-586.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[sha256-586.s]=../perlasm/x86asm.pl
GENERATE[sha512-586.s]=asm/sha512-586.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[sha512-586.s]=../perlasm/x86asm.pl
GENERATE[sha1-ia64.s]=asm/sha1-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
GENERATE[sha256-ia64.s]=asm/sha512-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
GENERATE[sha512-ia64.s]=asm/sha512-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
GENERATE[sha1-alpha.S]=asm/sha1-alpha.pl $(PERLASM_SCHEME)
GENERATE[sha1-x86_64.s]=asm/sha1-x86_64.pl $(PERLASM_SCHEME)
GENERATE[sha1-mb-x86_64.s]=asm/sha1-mb-x86_64.pl $(PERLASM_SCHEME)
GENERATE[sha256-x86_64.s]=asm/sha512-x86_64.pl $(PERLASM_SCHEME)
GENERATE[sha256-mb-x86_64.s]=asm/sha256-mb-x86_64.pl $(PERLASM_SCHEME)
GENERATE[sha512-x86_64.s]=asm/sha512-x86_64.pl $(PERLASM_SCHEME)
GENERATE[keccak1600-x86_64.s]=asm/keccak1600-x86_64.pl $(PERLASM_SCHEME)
GENERATE[sha1-sparcv9.S]=asm/sha1-sparcv9.pl $(PERLASM_SCHEME)
INCLUDE[sha1-sparcv9.o]=..
GENERATE[sha256-sparcv9.S]=asm/sha512-sparcv9.pl $(PERLASM_SCHEME)
INCLUDE[sha256-sparcv9.o]=..
GENERATE[sha512-sparcv9.S]=asm/sha512-sparcv9.pl $(PERLASM_SCHEME)
INCLUDE[sha512-sparcv9.o]=..
GENERATE[sha1-ppc.s]=asm/sha1-ppc.pl $(PERLASM_SCHEME)
GENERATE[sha256-ppc.s]=asm/sha512-ppc.pl $(PERLASM_SCHEME)
GENERATE[sha512-ppc.s]=asm/sha512-ppc.pl $(PERLASM_SCHEME)
GENERATE[sha256p8-ppc.s]=asm/sha512p8-ppc.pl $(PERLASM_SCHEME)
GENERATE[sha512p8-ppc.s]=asm/sha512p8-ppc.pl $(PERLASM_SCHEME)
GENERATE[keccak1600-ppc64.s]=asm/keccak1600-ppc64.pl $(PERLASM_SCHEME)
GENERATE[sha1-parisc.s]=asm/sha1-parisc.pl $(PERLASM_SCHEME)
GENERATE[sha256-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME)
GENERATE[sha512-parisc.s]=asm/sha512-parisc.pl $(PERLASM_SCHEME)
GENERATE[sha1-mips.S]=asm/sha1-mips.pl $(PERLASM_SCHEME)
INCLUDE[sha1-mips.o]=..
GENERATE[sha256-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME)
INCLUDE[sha256-mips.o]=..
GENERATE[sha512-mips.S]=asm/sha512-mips.pl $(PERLASM_SCHEME)
INCLUDE[sha512-mips.o]=..
GENERATE[sha1-armv4-large.S]=asm/sha1-armv4-large.pl $(PERLASM_SCHEME)
INCLUDE[sha1-armv4-large.o]=..
GENERATE[sha256-armv4.S]=asm/sha256-armv4.pl $(PERLASM_SCHEME)
INCLUDE[sha256-armv4.o]=..
GENERATE[sha512-armv4.S]=asm/sha512-armv4.pl $(PERLASM_SCHEME)
INCLUDE[sha512-armv4.o]=..
GENERATE[keccak1600-armv4.S]=asm/keccak1600-armv4.pl $(PERLASM_SCHEME)
INCLUDE[keccak1600-armv4.o]=..
GENERATE[sha1-armv8.S]=asm/sha1-armv8.pl $(PERLASM_SCHEME)
INCLUDE[sha1-armv8.o]=..
GENERATE[sha256-armv8.S]=asm/sha512-armv8.pl $(PERLASM_SCHEME)
INCLUDE[sha256-armv8.o]=..
GENERATE[sha512-armv8.S]=asm/sha512-armv8.pl $(PERLASM_SCHEME)
INCLUDE[sha512-armv8.o]=..
GENERATE[keccak1600-armv8.S]=asm/keccak1600-armv8.pl $(PERLASM_SCHEME)
GENERATE[sha1-s390x.S]=asm/sha1-s390x.pl $(PERLASM_SCHEME)
INCLUDE[sha1-s390x.o]=..
GENERATE[sha256-s390x.S]=asm/sha512-s390x.pl $(PERLASM_SCHEME)
INCLUDE[sha256-s390x.o]=..
GENERATE[sha512-s390x.S]=asm/sha512-s390x.pl $(PERLASM_SCHEME)
INCLUDE[sha512-s390x.o]=..
GENERATE[keccak1600-s390x.S]=asm/keccak1600-s390x.pl $(PERLASM_SCHEME)
BEGINRAW[Makefile(unix)]
##### SHA assembler implementations
# GNU make "catch all"
{- $builddir -}/sha1-%.S: {- $sourcedir -}/asm/sha1-%.pl
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
{- $builddir -}/sha256-%.S: {- $sourcedir -}/asm/sha512-%.pl
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
{- $builddir -}/sha512-%.S: {- $sourcedir -}/asm/sha512-%.pl
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
ENDRAW[Makefile(unix)]

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,28 @@
/*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/sha.h>
unsigned char *SHA1(const unsigned char *d, size_t n, unsigned char *md)
{
SHA_CTX c;
static unsigned char m[SHA_DIGEST_LENGTH];
if (md == NULL)
md = m;
if (!SHA1_Init(&c))
return NULL;
SHA1_Update(&c, d, n);
SHA1_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
return md;
}

View file

@ -0,0 +1,17 @@
/*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/crypto.h>
#include <openssl/opensslconf.h>
# include <openssl/opensslv.h>
/* The implementation is in ../md32_common.h */
# include "sha_locl.h"

View file

@ -0,0 +1,386 @@
/*
* Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/opensslconf.h>
#include <stdlib.h>
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/sha.h>
#include <openssl/opensslv.h>
int SHA224_Init(SHA256_CTX *c)
{
memset(c, 0, sizeof(*c));
c->h[0] = 0xc1059ed8UL;
c->h[1] = 0x367cd507UL;
c->h[2] = 0x3070dd17UL;
c->h[3] = 0xf70e5939UL;
c->h[4] = 0xffc00b31UL;
c->h[5] = 0x68581511UL;
c->h[6] = 0x64f98fa7UL;
c->h[7] = 0xbefa4fa4UL;
c->md_len = SHA224_DIGEST_LENGTH;
return 1;
}
int SHA256_Init(SHA256_CTX *c)
{
memset(c, 0, sizeof(*c));
c->h[0] = 0x6a09e667UL;
c->h[1] = 0xbb67ae85UL;
c->h[2] = 0x3c6ef372UL;
c->h[3] = 0xa54ff53aUL;
c->h[4] = 0x510e527fUL;
c->h[5] = 0x9b05688cUL;
c->h[6] = 0x1f83d9abUL;
c->h[7] = 0x5be0cd19UL;
c->md_len = SHA256_DIGEST_LENGTH;
return 1;
}
unsigned char *SHA224(const unsigned char *d, size_t n, unsigned char *md)
{
SHA256_CTX c;
static unsigned char m[SHA224_DIGEST_LENGTH];
if (md == NULL)
md = m;
SHA224_Init(&c);
SHA256_Update(&c, d, n);
SHA256_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
return md;
}
unsigned char *SHA256(const unsigned char *d, size_t n, unsigned char *md)
{
SHA256_CTX c;
static unsigned char m[SHA256_DIGEST_LENGTH];
if (md == NULL)
md = m;
SHA256_Init(&c);
SHA256_Update(&c, d, n);
SHA256_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
return md;
}
int SHA224_Update(SHA256_CTX *c, const void *data, size_t len)
{
return SHA256_Update(c, data, len);
}
int SHA224_Final(unsigned char *md, SHA256_CTX *c)
{
return SHA256_Final(md, c);
}
#define DATA_ORDER_IS_BIG_ENDIAN
#define HASH_LONG SHA_LONG
#define HASH_CTX SHA256_CTX
#define HASH_CBLOCK SHA_CBLOCK
/*
* Note that FIPS180-2 discusses "Truncation of the Hash Function Output."
* default: case below covers for it. It's not clear, however, if it's
* permitted to truncate to a number of bytes not divisible by 4. I bet not,
* but if it is, then the default: case shall be extended. For reference,
* the idea behind separate cases for pre-defined lengths is to let the
* compiler decide if it's appropriate to unroll small loops.
*/
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
unsigned int nn; \
switch ((c)->md_len) \
{ case SHA224_DIGEST_LENGTH: \
for (nn=0;nn<SHA224_DIGEST_LENGTH/4;nn++) \
{ ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
case SHA256_DIGEST_LENGTH: \
for (nn=0;nn<SHA256_DIGEST_LENGTH/4;nn++) \
{ ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
default: \
if ((c)->md_len > SHA256_DIGEST_LENGTH) \
return 0; \
for (nn=0;nn<(c)->md_len/4;nn++) \
{ ll=(c)->h[nn]; (void)HOST_l2c(ll,(s)); } \
break; \
} \
} while (0)
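/*
 * Worked example for the macro above: SHA-224 sets md_len to
 * SHA224_DIGEST_LENGTH (28), so the first case emits h[0]..h[6] only,
 * i.e. 28 bytes, dropping h[7] as the SHA-224 truncation requires.
 */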
#define HASH_UPDATE SHA256_Update
#define HASH_TRANSFORM SHA256_Transform
#define HASH_FINAL SHA256_Final
#define HASH_BLOCK_DATA_ORDER sha256_block_data_order
#ifndef SHA256_ASM
static
#endif
void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);
#include "internal/md32_common.h"
#ifndef SHA256_ASM
static const SHA_LONG K256[64] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
/*
* FIPS specification refers to right rotations, while our ROTATE macro
* is left one. This is why you might notice that rotation coefficients
* differ from those observed in FIPS document by 32-N...
*/
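/*
 * For example, FIPS defines Sigma0(x) as ROTR(x,2)^ROTR(x,13)^ROTR(x,22);
 * rewriting each ROTR(x,N) as the left rotation ROTATE(x,32-N) gives
 * ROTATE(x,30)^ROTATE(x,19)^ROTATE(x,10), which is the Sigma0 macro below.
 */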
# define Sigma0(x) (ROTATE((x),30) ^ ROTATE((x),19) ^ ROTATE((x),10))
# define Sigma1(x) (ROTATE((x),26) ^ ROTATE((x),21) ^ ROTATE((x),7))
# define sigma0(x) (ROTATE((x),25) ^ ROTATE((x),14) ^ ((x)>>3))
# define sigma1(x) (ROTATE((x),15) ^ ROTATE((x),13) ^ ((x)>>10))
# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
# ifdef OPENSSL_SMALL_FOOTPRINT
static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
size_t num)
{
unsigned MD32_REG_T a, b, c, d, e, f, g, h, s0, s1, T1, T2;
SHA_LONG X[16], l;
int i;
const unsigned char *data = in;
while (num--) {
a = ctx->h[0];
b = ctx->h[1];
c = ctx->h[2];
d = ctx->h[3];
e = ctx->h[4];
f = ctx->h[5];
g = ctx->h[6];
h = ctx->h[7];
for (i = 0; i < 16; i++) {
(void)HOST_c2l(data, l);
T1 = X[i] = l;
T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i];
T2 = Sigma0(a) + Maj(a, b, c);
h = g;
g = f;
f = e;
e = d + T1;
d = c;
c = b;
b = a;
a = T1 + T2;
}
for (; i < 64; i++) {
s0 = X[(i + 1) & 0x0f];
s0 = sigma0(s0);
s1 = X[(i + 14) & 0x0f];
s1 = sigma1(s1);
T1 = X[i & 0xf] += s0 + s1 + X[(i + 9) & 0xf];
T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i];
T2 = Sigma0(a) + Maj(a, b, c);
h = g;
g = f;
f = e;
e = d + T1;
d = c;
c = b;
b = a;
a = T1 + T2;
}
ctx->h[0] += a;
ctx->h[1] += b;
ctx->h[2] += c;
ctx->h[3] += d;
ctx->h[4] += e;
ctx->h[5] += f;
ctx->h[6] += g;
ctx->h[7] += h;
}
}
# else
# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; \
h = Sigma0(a) + Maj(a,b,c); \
d += T1; h += T1; } while (0)
# define ROUND_16_63(i,a,b,c,d,e,f,g,h,X) do { \
s0 = X[(i+1)&0x0f]; s0 = sigma0(s0); \
s1 = X[(i+14)&0x0f]; s1 = sigma1(s1); \
T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \
ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0)
static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
size_t num)
{
unsigned MD32_REG_T a, b, c, d, e, f, g, h, s0, s1, T1;
SHA_LONG X[16];
int i;
const unsigned char *data = in;
const union {
long one;
char little;
} is_endian = {
1
};
while (num--) {
a = ctx->h[0];
b = ctx->h[1];
c = ctx->h[2];
d = ctx->h[3];
e = ctx->h[4];
f = ctx->h[5];
g = ctx->h[6];
h = ctx->h[7];
if (!is_endian.little && sizeof(SHA_LONG) == 4
&& ((size_t)in % 4) == 0) {
const SHA_LONG *W = (const SHA_LONG *)data;
T1 = X[0] = W[0];
ROUND_00_15(0, a, b, c, d, e, f, g, h);
T1 = X[1] = W[1];
ROUND_00_15(1, h, a, b, c, d, e, f, g);
T1 = X[2] = W[2];
ROUND_00_15(2, g, h, a, b, c, d, e, f);
T1 = X[3] = W[3];
ROUND_00_15(3, f, g, h, a, b, c, d, e);
T1 = X[4] = W[4];
ROUND_00_15(4, e, f, g, h, a, b, c, d);
T1 = X[5] = W[5];
ROUND_00_15(5, d, e, f, g, h, a, b, c);
T1 = X[6] = W[6];
ROUND_00_15(6, c, d, e, f, g, h, a, b);
T1 = X[7] = W[7];
ROUND_00_15(7, b, c, d, e, f, g, h, a);
T1 = X[8] = W[8];
ROUND_00_15(8, a, b, c, d, e, f, g, h);
T1 = X[9] = W[9];
ROUND_00_15(9, h, a, b, c, d, e, f, g);
T1 = X[10] = W[10];
ROUND_00_15(10, g, h, a, b, c, d, e, f);
T1 = X[11] = W[11];
ROUND_00_15(11, f, g, h, a, b, c, d, e);
T1 = X[12] = W[12];
ROUND_00_15(12, e, f, g, h, a, b, c, d);
T1 = X[13] = W[13];
ROUND_00_15(13, d, e, f, g, h, a, b, c);
T1 = X[14] = W[14];
ROUND_00_15(14, c, d, e, f, g, h, a, b);
T1 = X[15] = W[15];
ROUND_00_15(15, b, c, d, e, f, g, h, a);
data += SHA256_CBLOCK;
} else {
SHA_LONG l;
(void)HOST_c2l(data, l);
T1 = X[0] = l;
ROUND_00_15(0, a, b, c, d, e, f, g, h);
(void)HOST_c2l(data, l);
T1 = X[1] = l;
ROUND_00_15(1, h, a, b, c, d, e, f, g);
(void)HOST_c2l(data, l);
T1 = X[2] = l;
ROUND_00_15(2, g, h, a, b, c, d, e, f);
(void)HOST_c2l(data, l);
T1 = X[3] = l;
ROUND_00_15(3, f, g, h, a, b, c, d, e);
(void)HOST_c2l(data, l);
T1 = X[4] = l;
ROUND_00_15(4, e, f, g, h, a, b, c, d);
(void)HOST_c2l(data, l);
T1 = X[5] = l;
ROUND_00_15(5, d, e, f, g, h, a, b, c);
(void)HOST_c2l(data, l);
T1 = X[6] = l;
ROUND_00_15(6, c, d, e, f, g, h, a, b);
(void)HOST_c2l(data, l);
T1 = X[7] = l;
ROUND_00_15(7, b, c, d, e, f, g, h, a);
(void)HOST_c2l(data, l);
T1 = X[8] = l;
ROUND_00_15(8, a, b, c, d, e, f, g, h);
(void)HOST_c2l(data, l);
T1 = X[9] = l;
ROUND_00_15(9, h, a, b, c, d, e, f, g);
(void)HOST_c2l(data, l);
T1 = X[10] = l;
ROUND_00_15(10, g, h, a, b, c, d, e, f);
(void)HOST_c2l(data, l);
T1 = X[11] = l;
ROUND_00_15(11, f, g, h, a, b, c, d, e);
(void)HOST_c2l(data, l);
T1 = X[12] = l;
ROUND_00_15(12, e, f, g, h, a, b, c, d);
(void)HOST_c2l(data, l);
T1 = X[13] = l;
ROUND_00_15(13, d, e, f, g, h, a, b, c);
(void)HOST_c2l(data, l);
T1 = X[14] = l;
ROUND_00_15(14, c, d, e, f, g, h, a, b);
(void)HOST_c2l(data, l);
T1 = X[15] = l;
ROUND_00_15(15, b, c, d, e, f, g, h, a);
}
for (i = 16; i < 64; i += 8) {
ROUND_16_63(i + 0, a, b, c, d, e, f, g, h, X);
ROUND_16_63(i + 1, h, a, b, c, d, e, f, g, X);
ROUND_16_63(i + 2, g, h, a, b, c, d, e, f, X);
ROUND_16_63(i + 3, f, g, h, a, b, c, d, e, X);
ROUND_16_63(i + 4, e, f, g, h, a, b, c, d, X);
ROUND_16_63(i + 5, d, e, f, g, h, a, b, c, X);
ROUND_16_63(i + 6, c, d, e, f, g, h, a, b, X);
ROUND_16_63(i + 7, b, c, d, e, f, g, h, a, X);
}
ctx->h[0] += a;
ctx->h[1] += b;
ctx->h[2] += c;
ctx->h[3] += d;
ctx->h[4] += e;
ctx->h[5] += f;
ctx->h[6] += g;
ctx->h[7] += h;
}
}
# endif
#endif /* SHA256_ASM */

View file

@ -0,0 +1,765 @@
/*
* Copyright 2004-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/opensslconf.h>
/*-
* IMPLEMENTATION NOTES.
*
* As you might have noticed 32-bit hash algorithms:
*
* - permit SHA_LONG to be wider than 32-bit
* - optimized versions implement two transform functions: one operating
* on [aligned] data in host byte order and one - on data in input
* stream byte order;
* - share common byte-order neutral collector and padding function
* implementations, ../md32_common.h;
*
* Neither of the above applies to this SHA-512 implementation. Reasons
* [in reverse order] are:
*
* - it's the only 64-bit hash algorithm for the moment of this writing,
* there is no need for common collector/padding implementation [yet];
* - by supporting only one transform function [which operates on
* *aligned* data in input stream byte order, big-endian in this case]
* we minimize burden of maintenance in two ways: a) collector/padding
* function is simpler; b) only one transform function to stare at;
* - SHA_LONG64 is required to be exactly 64-bit in order to be able to
* apply a number of optimizations to mitigate potential performance
* penalties caused by previous design decision;
*
* Caveat lector.
*
* Implementation relies on the fact that "long long" is 64-bit on
* both 32- and 64-bit platforms. If some compiler vendor comes up
* with 128-bit long long, adjustment to sha.h would be required.
* As this implementation relies on 64-bit integer type, it's totally
* inappropriate for platforms which don't support it, most notably
* 16-bit platforms.
*/
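/*
 * For reference, a minimal sketch of what the 64-bit type plumbing in
 * sha.h typically amounts to on platforms without a 64-bit "long"
 * (the exact definitions vary per platform and are chosen there, not
 * here):
 *
 *     # define SHA_LONG64 unsigned long long
 *     # define U64(C)     C##ULL
 */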
#include <stdlib.h>
#include <string.h>
#include <openssl/crypto.h>
#include <openssl/sha.h>
#include <openssl/opensslv.h>
#include "internal/cryptlib.h"
#include "internal/sha.h"
#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \
defined(__s390__) || defined(__s390x__) || \
defined(__aarch64__) || \
defined(SHA512_ASM)
# define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
#endif
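/*
 * On the architectures listed above (or whenever an assembly back end is
 * used) sha512_block_data_order may be fed input that is not 64-bit
 * aligned; on everything else SHA512_Update/SHA512_Transform first copy
 * misaligned input into the aligned context buffer.
 */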
int sha512_224_init(SHA512_CTX *c)
{
c->h[0] = U64(0x8c3d37c819544da2);
c->h[1] = U64(0x73e1996689dcd4d6);
c->h[2] = U64(0x1dfab7ae32ff9c82);
c->h[3] = U64(0x679dd514582f9fcf);
c->h[4] = U64(0x0f6d2b697bd44da8);
c->h[5] = U64(0x77e36f7304c48942);
c->h[6] = U64(0x3f9d85a86a1d36c8);
c->h[7] = U64(0x1112e6ad91d692a1);
c->Nl = 0;
c->Nh = 0;
c->num = 0;
c->md_len = SHA224_DIGEST_LENGTH;
return 1;
}
int sha512_256_init(SHA512_CTX *c)
{
c->h[0] = U64(0x22312194fc2bf72c);
c->h[1] = U64(0x9f555fa3c84c64c2);
c->h[2] = U64(0x2393b86b6f53b151);
c->h[3] = U64(0x963877195940eabd);
c->h[4] = U64(0x96283ee2a88effe3);
c->h[5] = U64(0xbe5e1e2553863992);
c->h[6] = U64(0x2b0199fc2c85b8aa);
c->h[7] = U64(0x0eb72ddc81c52ca2);
c->Nl = 0;
c->Nh = 0;
c->num = 0;
c->md_len = SHA256_DIGEST_LENGTH;
return 1;
}
int SHA384_Init(SHA512_CTX *c)
{
c->h[0] = U64(0xcbbb9d5dc1059ed8);
c->h[1] = U64(0x629a292a367cd507);
c->h[2] = U64(0x9159015a3070dd17);
c->h[3] = U64(0x152fecd8f70e5939);
c->h[4] = U64(0x67332667ffc00b31);
c->h[5] = U64(0x8eb44a8768581511);
c->h[6] = U64(0xdb0c2e0d64f98fa7);
c->h[7] = U64(0x47b5481dbefa4fa4);
c->Nl = 0;
c->Nh = 0;
c->num = 0;
c->md_len = SHA384_DIGEST_LENGTH;
return 1;
}
int SHA512_Init(SHA512_CTX *c)
{
c->h[0] = U64(0x6a09e667f3bcc908);
c->h[1] = U64(0xbb67ae8584caa73b);
c->h[2] = U64(0x3c6ef372fe94f82b);
c->h[3] = U64(0xa54ff53a5f1d36f1);
c->h[4] = U64(0x510e527fade682d1);
c->h[5] = U64(0x9b05688c2b3e6c1f);
c->h[6] = U64(0x1f83d9abfb41bd6b);
c->h[7] = U64(0x5be0cd19137e2179);
c->Nl = 0;
c->Nh = 0;
c->num = 0;
c->md_len = SHA512_DIGEST_LENGTH;
return 1;
}
#ifndef SHA512_ASM
static
#endif
void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
int SHA512_Final(unsigned char *md, SHA512_CTX *c)
{
unsigned char *p = (unsigned char *)c->u.p;
size_t n = c->num;
p[n] = 0x80; /* There is always room for one */
n++;
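/*
 * The last 16 bytes of the 128-byte block are reserved for the 128-bit
 * bit count Nh:Nl; if the 0x80 padding byte left less room than that,
 * process this block now and put the length in a fresh one.
 */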
if (n > (sizeof(c->u) - 16)) {
memset(p + n, 0, sizeof(c->u) - n);
n = 0;
sha512_block_data_order(c, p, 1);
}
memset(p + n, 0, sizeof(c->u) - 16 - n);
#ifdef B_ENDIAN
c->u.d[SHA_LBLOCK - 2] = c->Nh;
c->u.d[SHA_LBLOCK - 1] = c->Nl;
#else
p[sizeof(c->u) - 1] = (unsigned char)(c->Nl);
p[sizeof(c->u) - 2] = (unsigned char)(c->Nl >> 8);
p[sizeof(c->u) - 3] = (unsigned char)(c->Nl >> 16);
p[sizeof(c->u) - 4] = (unsigned char)(c->Nl >> 24);
p[sizeof(c->u) - 5] = (unsigned char)(c->Nl >> 32);
p[sizeof(c->u) - 6] = (unsigned char)(c->Nl >> 40);
p[sizeof(c->u) - 7] = (unsigned char)(c->Nl >> 48);
p[sizeof(c->u) - 8] = (unsigned char)(c->Nl >> 56);
p[sizeof(c->u) - 9] = (unsigned char)(c->Nh);
p[sizeof(c->u) - 10] = (unsigned char)(c->Nh >> 8);
p[sizeof(c->u) - 11] = (unsigned char)(c->Nh >> 16);
p[sizeof(c->u) - 12] = (unsigned char)(c->Nh >> 24);
p[sizeof(c->u) - 13] = (unsigned char)(c->Nh >> 32);
p[sizeof(c->u) - 14] = (unsigned char)(c->Nh >> 40);
p[sizeof(c->u) - 15] = (unsigned char)(c->Nh >> 48);
p[sizeof(c->u) - 16] = (unsigned char)(c->Nh >> 56);
#endif
sha512_block_data_order(c, p, 1);
if (md == 0)
return 0;
switch (c->md_len) {
/* Let compiler decide if it's appropriate to unroll... */
case SHA224_DIGEST_LENGTH:
for (n = 0; n < SHA224_DIGEST_LENGTH / 8; n++) {
SHA_LONG64 t = c->h[n];
*(md++) = (unsigned char)(t >> 56);
*(md++) = (unsigned char)(t >> 48);
*(md++) = (unsigned char)(t >> 40);
*(md++) = (unsigned char)(t >> 32);
*(md++) = (unsigned char)(t >> 24);
*(md++) = (unsigned char)(t >> 16);
*(md++) = (unsigned char)(t >> 8);
*(md++) = (unsigned char)(t);
}
/*
* For 224 bits, there are four bytes left over that have to be
* processed separately.
*/
{
SHA_LONG64 t = c->h[SHA224_DIGEST_LENGTH / 8];
*(md++) = (unsigned char)(t >> 56);
*(md++) = (unsigned char)(t >> 48);
*(md++) = (unsigned char)(t >> 40);
*(md++) = (unsigned char)(t >> 32);
}
break;
case SHA256_DIGEST_LENGTH:
for (n = 0; n < SHA256_DIGEST_LENGTH / 8; n++) {
SHA_LONG64 t = c->h[n];
*(md++) = (unsigned char)(t >> 56);
*(md++) = (unsigned char)(t >> 48);
*(md++) = (unsigned char)(t >> 40);
*(md++) = (unsigned char)(t >> 32);
*(md++) = (unsigned char)(t >> 24);
*(md++) = (unsigned char)(t >> 16);
*(md++) = (unsigned char)(t >> 8);
*(md++) = (unsigned char)(t);
}
break;
case SHA384_DIGEST_LENGTH:
for (n = 0; n < SHA384_DIGEST_LENGTH / 8; n++) {
SHA_LONG64 t = c->h[n];
*(md++) = (unsigned char)(t >> 56);
*(md++) = (unsigned char)(t >> 48);
*(md++) = (unsigned char)(t >> 40);
*(md++) = (unsigned char)(t >> 32);
*(md++) = (unsigned char)(t >> 24);
*(md++) = (unsigned char)(t >> 16);
*(md++) = (unsigned char)(t >> 8);
*(md++) = (unsigned char)(t);
}
break;
case SHA512_DIGEST_LENGTH:
for (n = 0; n < SHA512_DIGEST_LENGTH / 8; n++) {
SHA_LONG64 t = c->h[n];
*(md++) = (unsigned char)(t >> 56);
*(md++) = (unsigned char)(t >> 48);
*(md++) = (unsigned char)(t >> 40);
*(md++) = (unsigned char)(t >> 32);
*(md++) = (unsigned char)(t >> 24);
*(md++) = (unsigned char)(t >> 16);
*(md++) = (unsigned char)(t >> 8);
*(md++) = (unsigned char)(t);
}
break;
/* ... as well as make sure md_len is not abused. */
default:
return 0;
}
return 1;
}
int SHA384_Final(unsigned char *md, SHA512_CTX *c)
{
return SHA512_Final(md, c);
}
int SHA512_Update(SHA512_CTX *c, const void *_data, size_t len)
{
SHA_LONG64 l;
unsigned char *p = c->u.p;
const unsigned char *data = (const unsigned char *)_data;
if (len == 0)
return 1;
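/*
 * Maintain the 128-bit bit counter Nh:Nl: a carry out of the low word
 * bumps Nh, as do any bits of len that overflow 2^64 once len is
 * converted from bytes to bits (hence the shift by 61).
 */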
l = (c->Nl + (((SHA_LONG64) len) << 3)) & U64(0xffffffffffffffff);
if (l < c->Nl)
c->Nh++;
if (sizeof(len) >= 8)
c->Nh += (((SHA_LONG64) len) >> 61);
c->Nl = l;
if (c->num != 0) {
size_t n = sizeof(c->u) - c->num;
if (len < n) {
memcpy(p + c->num, data, len), c->num += (unsigned int)len;
return 1;
} else {
memcpy(p + c->num, data, n), c->num = 0;
len -= n, data += n;
sha512_block_data_order(c, p, 1);
}
}
if (len >= sizeof(c->u)) {
#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
if ((size_t)data % sizeof(c->u.d[0]) != 0)
while (len >= sizeof(c->u))
memcpy(p, data, sizeof(c->u)),
sha512_block_data_order(c, p, 1),
len -= sizeof(c->u), data += sizeof(c->u);
else
#endif
sha512_block_data_order(c, data, len / sizeof(c->u)),
data += len, len %= sizeof(c->u), data -= len;
}
if (len != 0)
memcpy(p, data, len), c->num = (int)len;
return 1;
}
int SHA384_Update(SHA512_CTX *c, const void *data, size_t len)
{
return SHA512_Update(c, data, len);
}
void SHA512_Transform(SHA512_CTX *c, const unsigned char *data)
{
#ifndef SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
if ((size_t)data % sizeof(c->u.d[0]) != 0)
memcpy(c->u.p, data, sizeof(c->u.p)), data = c->u.p;
#endif
sha512_block_data_order(c, data, 1);
}
unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md)
{
SHA512_CTX c;
static unsigned char m[SHA384_DIGEST_LENGTH];
if (md == NULL)
md = m;
SHA384_Init(&c);
SHA512_Update(&c, d, n);
SHA512_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
return md;
}
unsigned char *SHA512(const unsigned char *d, size_t n, unsigned char *md)
{
SHA512_CTX c;
static unsigned char m[SHA512_DIGEST_LENGTH];
if (md == NULL)
md = m;
SHA512_Init(&c);
SHA512_Update(&c, d, n);
SHA512_Final(md, &c);
OPENSSL_cleanse(&c, sizeof(c));
return md;
}
#ifndef SHA512_ASM
static const SHA_LONG64 K512[80] = {
U64(0x428a2f98d728ae22), U64(0x7137449123ef65cd),
U64(0xb5c0fbcfec4d3b2f), U64(0xe9b5dba58189dbbc),
U64(0x3956c25bf348b538), U64(0x59f111f1b605d019),
U64(0x923f82a4af194f9b), U64(0xab1c5ed5da6d8118),
U64(0xd807aa98a3030242), U64(0x12835b0145706fbe),
U64(0x243185be4ee4b28c), U64(0x550c7dc3d5ffb4e2),
U64(0x72be5d74f27b896f), U64(0x80deb1fe3b1696b1),
U64(0x9bdc06a725c71235), U64(0xc19bf174cf692694),
U64(0xe49b69c19ef14ad2), U64(0xefbe4786384f25e3),
U64(0x0fc19dc68b8cd5b5), U64(0x240ca1cc77ac9c65),
U64(0x2de92c6f592b0275), U64(0x4a7484aa6ea6e483),
U64(0x5cb0a9dcbd41fbd4), U64(0x76f988da831153b5),
U64(0x983e5152ee66dfab), U64(0xa831c66d2db43210),
U64(0xb00327c898fb213f), U64(0xbf597fc7beef0ee4),
U64(0xc6e00bf33da88fc2), U64(0xd5a79147930aa725),
U64(0x06ca6351e003826f), U64(0x142929670a0e6e70),
U64(0x27b70a8546d22ffc), U64(0x2e1b21385c26c926),
U64(0x4d2c6dfc5ac42aed), U64(0x53380d139d95b3df),
U64(0x650a73548baf63de), U64(0x766a0abb3c77b2a8),
U64(0x81c2c92e47edaee6), U64(0x92722c851482353b),
U64(0xa2bfe8a14cf10364), U64(0xa81a664bbc423001),
U64(0xc24b8b70d0f89791), U64(0xc76c51a30654be30),
U64(0xd192e819d6ef5218), U64(0xd69906245565a910),
U64(0xf40e35855771202a), U64(0x106aa07032bbd1b8),
U64(0x19a4c116b8d2d0c8), U64(0x1e376c085141ab53),
U64(0x2748774cdf8eeb99), U64(0x34b0bcb5e19b48a8),
U64(0x391c0cb3c5c95a63), U64(0x4ed8aa4ae3418acb),
U64(0x5b9cca4f7763e373), U64(0x682e6ff3d6b2b8a3),
U64(0x748f82ee5defb2fc), U64(0x78a5636f43172f60),
U64(0x84c87814a1f0ab72), U64(0x8cc702081a6439ec),
U64(0x90befffa23631e28), U64(0xa4506cebde82bde9),
U64(0xbef9a3f7b2c67915), U64(0xc67178f2e372532b),
U64(0xca273eceea26619c), U64(0xd186b8c721c0c207),
U64(0xeada7dd6cde0eb1e), U64(0xf57d4f7fee6ed178),
U64(0x06f067aa72176fba), U64(0x0a637dc5a2c898a6),
U64(0x113f9804bef90dae), U64(0x1b710b35131c471b),
U64(0x28db77f523047d84), U64(0x32caab7b40c72493),
U64(0x3c9ebe0a15c9bebc), U64(0x431d67c49c100d4c),
U64(0x4cc5d4becb3e42b6), U64(0x597f299cfc657e2a),
U64(0x5fcb6fab3ad6faec), U64(0x6c44198c4a475817)
};
# ifndef PEDANTIC
# if defined(__GNUC__) && __GNUC__>=2 && \
!defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
# if defined(__x86_64) || defined(__x86_64__)
# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("rorq %1,%0" \
: "=r"(ret) \
: "J"(n),"0"(a) \
: "cc"); ret; })
# if !defined(B_ENDIAN)
# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \
asm ("bswapq %0" \
: "=r"(ret) \
: "0"(ret)); ret; })
# endif
# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN)
# if defined(I386_ONLY)
# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
unsigned int hi=p[0],lo=p[1]; \
asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
"roll $16,%%eax; roll $16,%%edx; "\
"xchgb %%ah,%%al;xchgb %%dh,%%dl;"\
: "=a"(lo),"=d"(hi) \
: "0"(lo),"1"(hi) : "cc"); \
((SHA_LONG64)hi)<<32|lo; })
# else
# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\
unsigned int hi=p[0],lo=p[1]; \
asm ("bswapl %0; bswapl %1;" \
: "=r"(lo),"=r"(hi) \
: "0"(lo),"1"(hi)); \
((SHA_LONG64)hi)<<32|lo; })
# endif
# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64)
# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("rotrdi %0,%1,%2" \
: "=r"(ret) \
: "r"(a),"K"(n)); ret; })
# elif defined(__aarch64__)
# define ROTR(a,n) ({ SHA_LONG64 ret; \
asm ("ror %0,%1,%2" \
: "=r"(ret) \
: "r"(a),"I"(n)); ret; })
# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
__BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
# define PULL64(x) ({ SHA_LONG64 ret; \
asm ("rev %0,%1" \
: "=r"(ret) \
: "r"(*((const SHA_LONG64 *)(&(x))))); ret; })
# endif
# endif
# elif defined(_MSC_VER)
# if defined(_WIN64) /* applies to both IA-64 and AMD64 */
# pragma intrinsic(_rotr64)
# define ROTR(a,n) _rotr64((a),n)
# endif
# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && \
!defined(OPENSSL_NO_INLINE_ASM)
# if defined(I386_ONLY)
static SHA_LONG64 __fastcall __pull64be(const void *x)
{
_asm mov edx,[ecx + 0]
_asm mov eax,[ecx + 4]
_asm xchg dh, dl
_asm xchg ah, al
_asm rol edx, 16
_asm rol eax, 16
_asm xchg dh, dl
_asm xchg ah, al
}
# else
static SHA_LONG64 __fastcall __pull64be(const void *x)
{
_asm mov edx,[ecx + 0]
_asm mov eax,[ecx + 4]
_asm bswap edx
_asm bswap eax
}
# endif
# define PULL64(x) __pull64be(&(x))
# endif
# endif
# endif
# ifndef PULL64
# define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8))
# define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7))
# endif
# ifndef ROTR
# define ROTR(x,s) (((x)>>s) | (x)<<(64-s))
# endif
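/*
 * SHA-512 round functions as specified in FIPS 180-4: Sigma0/Sigma1 act
 * on the working variables, sigma0/sigma1 on the message schedule, Ch is
 * the "choose" function and Maj the "majority" function.
 */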
# define Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
# define Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
# define sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
# define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
# define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
# define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
/*
* This code should give better results on 32-bit CPU with less than
* ~24 registers, both size and performance wise...
*/
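/*
 * The working variables and the 16-deep message schedule live in one
 * array and are addressed through F, a pointer that is decremented every
 * round: storing A, E and the schedule word at F[0], F[4] and F[8] and
 * then moving F is what "rotates" the eight state words without copies.
 */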
static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
size_t num)
{
const SHA_LONG64 *W = in;
SHA_LONG64 A, E, T;
SHA_LONG64 X[9 + 80], *F;
int i;
while (num--) {
F = X + 80;
A = ctx->h[0];
F[1] = ctx->h[1];
F[2] = ctx->h[2];
F[3] = ctx->h[3];
E = ctx->h[4];
F[5] = ctx->h[5];
F[6] = ctx->h[6];
F[7] = ctx->h[7];
for (i = 0; i < 16; i++, F--) {
# ifdef B_ENDIAN
T = W[i];
# else
T = PULL64(W[i]);
# endif
F[0] = A;
F[4] = E;
F[8] = T;
T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
E = F[3] + T;
A = T + Sigma0(A) + Maj(A, F[1], F[2]);
}
for (; i < 80; i++, F--) {
T = sigma0(F[8 + 16 - 1]);
T += sigma1(F[8 + 16 - 14]);
T += F[8 + 16] + F[8 + 16 - 9];
F[0] = A;
F[4] = E;
F[8] = T;
T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
E = F[3] + T;
A = T + Sigma0(A) + Maj(A, F[1], F[2]);
}
ctx->h[0] += A;
ctx->h[1] += F[1];
ctx->h[2] += F[2];
ctx->h[3] += F[3];
ctx->h[4] += E;
ctx->h[5] += F[5];
ctx->h[6] += F[6];
ctx->h[7] += F[7];
W += SHA_LBLOCK;
}
}
# elif defined(OPENSSL_SMALL_FOOTPRINT)
static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
size_t num)
{
const SHA_LONG64 *W = in;
SHA_LONG64 a, b, c, d, e, f, g, h, s0, s1, T1, T2;
SHA_LONG64 X[16];
int i;
while (num--) {
a = ctx->h[0];
b = ctx->h[1];
c = ctx->h[2];
d = ctx->h[3];
e = ctx->h[4];
f = ctx->h[5];
g = ctx->h[6];
h = ctx->h[7];
for (i = 0; i < 16; i++) {
# ifdef B_ENDIAN
T1 = X[i] = W[i];
# else
T1 = X[i] = PULL64(W[i]);
# endif
T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i];
T2 = Sigma0(a) + Maj(a, b, c);
h = g;
g = f;
f = e;
e = d + T1;
d = c;
c = b;
b = a;
a = T1 + T2;
}
for (; i < 80; i++) {
s0 = X[(i + 1) & 0x0f];
s0 = sigma0(s0);
s1 = X[(i + 14) & 0x0f];
s1 = sigma1(s1);
T1 = X[i & 0xf] += s0 + s1 + X[(i + 9) & 0xf];
T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i];
T2 = Sigma0(a) + Maj(a, b, c);
h = g;
g = f;
f = e;
e = d + T1;
d = c;
c = b;
b = a;
a = T1 + T2;
}
ctx->h[0] += a;
ctx->h[1] += b;
ctx->h[2] += c;
ctx->h[3] += d;
ctx->h[4] += e;
ctx->h[5] += f;
ctx->h[6] += g;
ctx->h[7] += h;
W += SHA_LBLOCK;
}
}
# else
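/*
 * One SHA-512 round, split in two macros: ROUND_00_15 consumes a message
 * word that the caller has already placed in T1 (and in X[]), while
 * ROUND_16_80 first computes the next schedule word in the 16-entry
 * window X[] and then performs the same round.
 */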
# define ROUND_00_15(i,a,b,c,d,e,f,g,h) do { \
T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i]; \
h = Sigma0(a) + Maj(a,b,c); \
d += T1; h += T1; } while (0)
# define ROUND_16_80(i,j,a,b,c,d,e,f,g,h,X) do { \
s0 = X[(j+1)&0x0f]; s0 = sigma0(s0); \
s1 = X[(j+14)&0x0f]; s1 = sigma1(s1); \
T1 = X[(j)&0x0f] += s0 + s1 + X[(j+9)&0x0f]; \
ROUND_00_15(i+j,a,b,c,d,e,f,g,h); } while (0)
static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
size_t num)
{
const SHA_LONG64 *W = in;
SHA_LONG64 a, b, c, d, e, f, g, h, s0, s1, T1;
SHA_LONG64 X[16];
int i;
while (num--) {
a = ctx->h[0];
b = ctx->h[1];
c = ctx->h[2];
d = ctx->h[3];
e = ctx->h[4];
f = ctx->h[5];
g = ctx->h[6];
h = ctx->h[7];
# ifdef B_ENDIAN
T1 = X[0] = W[0];
ROUND_00_15(0, a, b, c, d, e, f, g, h);
T1 = X[1] = W[1];
ROUND_00_15(1, h, a, b, c, d, e, f, g);
T1 = X[2] = W[2];
ROUND_00_15(2, g, h, a, b, c, d, e, f);
T1 = X[3] = W[3];
ROUND_00_15(3, f, g, h, a, b, c, d, e);
T1 = X[4] = W[4];
ROUND_00_15(4, e, f, g, h, a, b, c, d);
T1 = X[5] = W[5];
ROUND_00_15(5, d, e, f, g, h, a, b, c);
T1 = X[6] = W[6];
ROUND_00_15(6, c, d, e, f, g, h, a, b);
T1 = X[7] = W[7];
ROUND_00_15(7, b, c, d, e, f, g, h, a);
T1 = X[8] = W[8];
ROUND_00_15(8, a, b, c, d, e, f, g, h);
T1 = X[9] = W[9];
ROUND_00_15(9, h, a, b, c, d, e, f, g);
T1 = X[10] = W[10];
ROUND_00_15(10, g, h, a, b, c, d, e, f);
T1 = X[11] = W[11];
ROUND_00_15(11, f, g, h, a, b, c, d, e);
T1 = X[12] = W[12];
ROUND_00_15(12, e, f, g, h, a, b, c, d);
T1 = X[13] = W[13];
ROUND_00_15(13, d, e, f, g, h, a, b, c);
T1 = X[14] = W[14];
ROUND_00_15(14, c, d, e, f, g, h, a, b);
T1 = X[15] = W[15];
ROUND_00_15(15, b, c, d, e, f, g, h, a);
# else
T1 = X[0] = PULL64(W[0]);
ROUND_00_15(0, a, b, c, d, e, f, g, h);
T1 = X[1] = PULL64(W[1]);
ROUND_00_15(1, h, a, b, c, d, e, f, g);
T1 = X[2] = PULL64(W[2]);
ROUND_00_15(2, g, h, a, b, c, d, e, f);
T1 = X[3] = PULL64(W[3]);
ROUND_00_15(3, f, g, h, a, b, c, d, e);
T1 = X[4] = PULL64(W[4]);
ROUND_00_15(4, e, f, g, h, a, b, c, d);
T1 = X[5] = PULL64(W[5]);
ROUND_00_15(5, d, e, f, g, h, a, b, c);
T1 = X[6] = PULL64(W[6]);
ROUND_00_15(6, c, d, e, f, g, h, a, b);
T1 = X[7] = PULL64(W[7]);
ROUND_00_15(7, b, c, d, e, f, g, h, a);
T1 = X[8] = PULL64(W[8]);
ROUND_00_15(8, a, b, c, d, e, f, g, h);
T1 = X[9] = PULL64(W[9]);
ROUND_00_15(9, h, a, b, c, d, e, f, g);
T1 = X[10] = PULL64(W[10]);
ROUND_00_15(10, g, h, a, b, c, d, e, f);
T1 = X[11] = PULL64(W[11]);
ROUND_00_15(11, f, g, h, a, b, c, d, e);
T1 = X[12] = PULL64(W[12]);
ROUND_00_15(12, e, f, g, h, a, b, c, d);
T1 = X[13] = PULL64(W[13]);
ROUND_00_15(13, d, e, f, g, h, a, b, c);
T1 = X[14] = PULL64(W[14]);
ROUND_00_15(14, c, d, e, f, g, h, a, b);
T1 = X[15] = PULL64(W[15]);
ROUND_00_15(15, b, c, d, e, f, g, h, a);
# endif
for (i = 16; i < 80; i += 16) {
ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X);
ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X);
ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X);
ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X);
ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X);
ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X);
ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X);
ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X);
ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X);
ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X);
ROUND_16_80(i, 10, g, h, a, b, c, d, e, f, X);
ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X);
ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X);
ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X);
ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X);
ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X);
}
ctx->h[0] += a;
ctx->h[1] += b;
ctx->h[2] += c;
ctx->h[3] += d;
ctx->h[4] += e;
ctx->h[5] += f;
ctx->h[6] += g;
ctx->h[7] += h;
W += SHA_LBLOCK;
}
}
# endif
#endif /* SHA512_ASM */

View file

@ -0,0 +1,424 @@
/*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdlib.h>
#include <string.h>
#include <openssl/opensslconf.h>
#include <openssl/sha.h>
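/*
 * The macros below parameterize the generic 32-bit Merkle-Damgard
 * collector in internal/md32_common.h (included further down) for SHA-1:
 * big-endian input, a 160-bit state h0..h4 and the SHA1_* entry points.
 */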
#define DATA_ORDER_IS_BIG_ENDIAN
#define HASH_LONG SHA_LONG
#define HASH_CTX SHA_CTX
#define HASH_CBLOCK SHA_CBLOCK
#define HASH_MAKE_STRING(c,s) do { \
unsigned long ll; \
ll=(c)->h0; (void)HOST_l2c(ll,(s)); \
ll=(c)->h1; (void)HOST_l2c(ll,(s)); \
ll=(c)->h2; (void)HOST_l2c(ll,(s)); \
ll=(c)->h3; (void)HOST_l2c(ll,(s)); \
ll=(c)->h4; (void)HOST_l2c(ll,(s)); \
} while (0)
#define HASH_UPDATE SHA1_Update
#define HASH_TRANSFORM SHA1_Transform
#define HASH_FINAL SHA1_Final
#define HASH_INIT SHA1_Init
#define HASH_BLOCK_DATA_ORDER sha1_block_data_order
#define Xupdate(a,ix,ia,ib,ic,id) ( (a)=(ia^ib^ic^id), \
ix=(a)=ROTATE((a),1) \
)
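/*
 * Xupdate is the SHA-1 message schedule step
 * W[i] = ROTL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]); the result is
 * written both to the schedule slot ix and to the round temporary a.
 */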
#ifndef SHA1_ASM
static void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
#else
void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
#endif
#include "internal/md32_common.h"
#define INIT_DATA_h0 0x67452301UL
#define INIT_DATA_h1 0xefcdab89UL
#define INIT_DATA_h2 0x98badcfeUL
#define INIT_DATA_h3 0x10325476UL
#define INIT_DATA_h4 0xc3d2e1f0UL
int HASH_INIT(SHA_CTX *c)
{
memset(c, 0, sizeof(*c));
c->h0 = INIT_DATA_h0;
c->h1 = INIT_DATA_h1;
c->h2 = INIT_DATA_h2;
c->h3 = INIT_DATA_h3;
c->h4 = INIT_DATA_h4;
return 1;
}
#define K_00_19 0x5a827999UL
#define K_20_39 0x6ed9eba1UL
#define K_40_59 0x8f1bbcdcUL
#define K_60_79 0xca62c1d6UL
/*
* As pointed out by Wei Dai, F() below can be simplified to the code in
* F_00_19. Wei attributes these optimizations to Peter Gutmann's SHS code,
* and he attributes it to Rich Schroeppel.
* #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
* I've just become aware of another tweak to be made, again from Wei Dai,
* in F_40_59, (x&a)|(y&a) -> (x|y)&a
*/
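/*
 * For the record: ((c ^ d) & b) ^ d picks c where b is set and d where it
 * is clear, i.e. it equals ((b) & (c)) | ((~(b)) & (d)), one boolean
 * operation cheaper.
 */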
#define F_00_19(b,c,d) ((((c) ^ (d)) & (b)) ^ (d))
#define F_20_39(b,c,d) ((b) ^ (c) ^ (d))
#define F_40_59(b,c,d) (((b) & (c)) | (((b)|(c)) & (d)))
#define F_60_79(b,c,d) F_20_39(b,c,d)
#ifndef OPENSSL_SMALL_FOOTPRINT
# define BODY_00_15(i,a,b,c,d,e,f,xi) \
(f)=xi+(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
(b)=ROTATE((b),30);
# define BODY_16_19(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
Xupdate(f,xi,xa,xb,xc,xd); \
(f)+=(e)+K_00_19+ROTATE((a),5)+F_00_19((b),(c),(d)); \
(b)=ROTATE((b),30);
# define BODY_20_31(i,a,b,c,d,e,f,xi,xa,xb,xc,xd) \
Xupdate(f,xi,xa,xb,xc,xd); \
(f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
(b)=ROTATE((b),30);
# define BODY_32_39(i,a,b,c,d,e,f,xa,xb,xc,xd) \
Xupdate(f,xa,xa,xb,xc,xd); \
(f)+=(e)+K_20_39+ROTATE((a),5)+F_20_39((b),(c),(d)); \
(b)=ROTATE((b),30);
# define BODY_40_59(i,a,b,c,d,e,f,xa,xb,xc,xd) \
Xupdate(f,xa,xa,xb,xc,xd); \
(f)+=(e)+K_40_59+ROTATE((a),5)+F_40_59((b),(c),(d)); \
(b)=ROTATE((b),30);
# define BODY_60_79(i,a,b,c,d,e,f,xa,xb,xc,xd) \
Xupdate(f,xa,xa,xb,xc,xd); \
(f)=xa+(e)+K_60_79+ROTATE((a),5)+F_60_79((b),(c),(d)); \
(b)=ROTATE((b),30);
# ifdef X
# undef X
# endif
# ifndef MD32_XARRAY
/*
* Originally X was an array. As it's automatic it's natural
* to expect a RISC compiler to accommodate at least part of it in
* the register bank, isn't it? Unfortunately not all compilers
* "find" this expectation reasonable:-( In order to make such
* compilers generate better code I replace X[] with a bunch of
* X0, X1, etc. See the function body below...
*/
# define X(i) XX##i
# else
/*
* However! Some compilers (most notably HP C) get overwhelmed by
* that many local variables, so we have to have a way to
* fall back to the original behavior.
*/
# define X(i) XX[i]
# endif
# if !defined(SHA1_ASM)
static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num)
{
const unsigned char *data = p;
register unsigned MD32_REG_T A, B, C, D, E, T, l;
# ifndef MD32_XARRAY
unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7,
XX8, XX9, XX10, XX11, XX12, XX13, XX14, XX15;
# else
SHA_LONG XX[16];
# endif
A = c->h0;
B = c->h1;
C = c->h2;
D = c->h3;
E = c->h4;
for (;;) {
const union {
long one;
char little;
} is_endian = {
1
};
if (!is_endian.little && sizeof(SHA_LONG) == 4
&& ((size_t)p % 4) == 0) {
const SHA_LONG *W = (const SHA_LONG *)data;
X(0) = W[0];
X(1) = W[1];
BODY_00_15(0, A, B, C, D, E, T, X(0));
X(2) = W[2];
BODY_00_15(1, T, A, B, C, D, E, X(1));
X(3) = W[3];
BODY_00_15(2, E, T, A, B, C, D, X(2));
X(4) = W[4];
BODY_00_15(3, D, E, T, A, B, C, X(3));
X(5) = W[5];
BODY_00_15(4, C, D, E, T, A, B, X(4));
X(6) = W[6];
BODY_00_15(5, B, C, D, E, T, A, X(5));
X(7) = W[7];
BODY_00_15(6, A, B, C, D, E, T, X(6));
X(8) = W[8];
BODY_00_15(7, T, A, B, C, D, E, X(7));
X(9) = W[9];
BODY_00_15(8, E, T, A, B, C, D, X(8));
X(10) = W[10];
BODY_00_15(9, D, E, T, A, B, C, X(9));
X(11) = W[11];
BODY_00_15(10, C, D, E, T, A, B, X(10));
X(12) = W[12];
BODY_00_15(11, B, C, D, E, T, A, X(11));
X(13) = W[13];
BODY_00_15(12, A, B, C, D, E, T, X(12));
X(14) = W[14];
BODY_00_15(13, T, A, B, C, D, E, X(13));
X(15) = W[15];
BODY_00_15(14, E, T, A, B, C, D, X(14));
BODY_00_15(15, D, E, T, A, B, C, X(15));
data += SHA_CBLOCK;
} else {
(void)HOST_c2l(data, l);
X(0) = l;
(void)HOST_c2l(data, l);
X(1) = l;
BODY_00_15(0, A, B, C, D, E, T, X(0));
(void)HOST_c2l(data, l);
X(2) = l;
BODY_00_15(1, T, A, B, C, D, E, X(1));
(void)HOST_c2l(data, l);
X(3) = l;
BODY_00_15(2, E, T, A, B, C, D, X(2));
(void)HOST_c2l(data, l);
X(4) = l;
BODY_00_15(3, D, E, T, A, B, C, X(3));
(void)HOST_c2l(data, l);
X(5) = l;
BODY_00_15(4, C, D, E, T, A, B, X(4));
(void)HOST_c2l(data, l);
X(6) = l;
BODY_00_15(5, B, C, D, E, T, A, X(5));
(void)HOST_c2l(data, l);
X(7) = l;
BODY_00_15(6, A, B, C, D, E, T, X(6));
(void)HOST_c2l(data, l);
X(8) = l;
BODY_00_15(7, T, A, B, C, D, E, X(7));
(void)HOST_c2l(data, l);
X(9) = l;
BODY_00_15(8, E, T, A, B, C, D, X(8));
(void)HOST_c2l(data, l);
X(10) = l;
BODY_00_15(9, D, E, T, A, B, C, X(9));
(void)HOST_c2l(data, l);
X(11) = l;
BODY_00_15(10, C, D, E, T, A, B, X(10));
(void)HOST_c2l(data, l);
X(12) = l;
BODY_00_15(11, B, C, D, E, T, A, X(11));
(void)HOST_c2l(data, l);
X(13) = l;
BODY_00_15(12, A, B, C, D, E, T, X(12));
(void)HOST_c2l(data, l);
X(14) = l;
BODY_00_15(13, T, A, B, C, D, E, X(13));
(void)HOST_c2l(data, l);
X(15) = l;
BODY_00_15(14, E, T, A, B, C, D, X(14));
BODY_00_15(15, D, E, T, A, B, C, X(15));
}
BODY_16_19(16, C, D, E, T, A, B, X(0), X(0), X(2), X(8), X(13));
BODY_16_19(17, B, C, D, E, T, A, X(1), X(1), X(3), X(9), X(14));
BODY_16_19(18, A, B, C, D, E, T, X(2), X(2), X(4), X(10), X(15));
BODY_16_19(19, T, A, B, C, D, E, X(3), X(3), X(5), X(11), X(0));
BODY_20_31(20, E, T, A, B, C, D, X(4), X(4), X(6), X(12), X(1));
BODY_20_31(21, D, E, T, A, B, C, X(5), X(5), X(7), X(13), X(2));
BODY_20_31(22, C, D, E, T, A, B, X(6), X(6), X(8), X(14), X(3));
BODY_20_31(23, B, C, D, E, T, A, X(7), X(7), X(9), X(15), X(4));
BODY_20_31(24, A, B, C, D, E, T, X(8), X(8), X(10), X(0), X(5));
BODY_20_31(25, T, A, B, C, D, E, X(9), X(9), X(11), X(1), X(6));
BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X(2), X(7));
BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X(3), X(8));
BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X(4), X(9));
BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X(5), X(10));
BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X(0), X(6), X(11));
BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X(1), X(7), X(12));
BODY_32_39(32, E, T, A, B, C, D, X(0), X(2), X(8), X(13));
BODY_32_39(33, D, E, T, A, B, C, X(1), X(3), X(9), X(14));
BODY_32_39(34, C, D, E, T, A, B, X(2), X(4), X(10), X(15));
BODY_32_39(35, B, C, D, E, T, A, X(3), X(5), X(11), X(0));
BODY_32_39(36, A, B, C, D, E, T, X(4), X(6), X(12), X(1));
BODY_32_39(37, T, A, B, C, D, E, X(5), X(7), X(13), X(2));
BODY_32_39(38, E, T, A, B, C, D, X(6), X(8), X(14), X(3));
BODY_32_39(39, D, E, T, A, B, C, X(7), X(9), X(15), X(4));
BODY_40_59(40, C, D, E, T, A, B, X(8), X(10), X(0), X(5));
BODY_40_59(41, B, C, D, E, T, A, X(9), X(11), X(1), X(6));
BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X(2), X(7));
BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X(3), X(8));
BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X(4), X(9));
BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), X(5), X(10));
BODY_40_59(46, C, D, E, T, A, B, X(14), X(0), X(6), X(11));
BODY_40_59(47, B, C, D, E, T, A, X(15), X(1), X(7), X(12));
BODY_40_59(48, A, B, C, D, E, T, X(0), X(2), X(8), X(13));
BODY_40_59(49, T, A, B, C, D, E, X(1), X(3), X(9), X(14));
BODY_40_59(50, E, T, A, B, C, D, X(2), X(4), X(10), X(15));
BODY_40_59(51, D, E, T, A, B, C, X(3), X(5), X(11), X(0));
BODY_40_59(52, C, D, E, T, A, B, X(4), X(6), X(12), X(1));
BODY_40_59(53, B, C, D, E, T, A, X(5), X(7), X(13), X(2));
BODY_40_59(54, A, B, C, D, E, T, X(6), X(8), X(14), X(3));
BODY_40_59(55, T, A, B, C, D, E, X(7), X(9), X(15), X(4));
BODY_40_59(56, E, T, A, B, C, D, X(8), X(10), X(0), X(5));
BODY_40_59(57, D, E, T, A, B, C, X(9), X(11), X(1), X(6));
BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X(2), X(7));
BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X(3), X(8));
BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X(4), X(9));
BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X(5), X(10));
BODY_60_79(62, E, T, A, B, C, D, X(14), X(0), X(6), X(11));
BODY_60_79(63, D, E, T, A, B, C, X(15), X(1), X(7), X(12));
BODY_60_79(64, C, D, E, T, A, B, X(0), X(2), X(8), X(13));
BODY_60_79(65, B, C, D, E, T, A, X(1), X(3), X(9), X(14));
BODY_60_79(66, A, B, C, D, E, T, X(2), X(4), X(10), X(15));
BODY_60_79(67, T, A, B, C, D, E, X(3), X(5), X(11), X(0));
BODY_60_79(68, E, T, A, B, C, D, X(4), X(6), X(12), X(1));
BODY_60_79(69, D, E, T, A, B, C, X(5), X(7), X(13), X(2));
BODY_60_79(70, C, D, E, T, A, B, X(6), X(8), X(14), X(3));
BODY_60_79(71, B, C, D, E, T, A, X(7), X(9), X(15), X(4));
BODY_60_79(72, A, B, C, D, E, T, X(8), X(10), X(0), X(5));
BODY_60_79(73, T, A, B, C, D, E, X(9), X(11), X(1), X(6));
BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X(2), X(7));
BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X(3), X(8));
BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X(4), X(9));
BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X(5), X(10));
BODY_60_79(78, A, B, C, D, E, T, X(14), X(0), X(6), X(11));
BODY_60_79(79, T, A, B, C, D, E, X(15), X(1), X(7), X(12));
c->h0 = (c->h0 + E) & 0xffffffffL;
c->h1 = (c->h1 + T) & 0xffffffffL;
c->h2 = (c->h2 + A) & 0xffffffffL;
c->h3 = (c->h3 + B) & 0xffffffffL;
c->h4 = (c->h4 + C) & 0xffffffffL;
if (--num == 0)
break;
A = c->h0;
B = c->h1;
C = c->h2;
D = c->h3;
E = c->h4;
}
}
# endif
#else /* OPENSSL_SMALL_FOOTPRINT */
# define BODY_00_15(xi) do { \
T=E+K_00_19+F_00_19(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T+xi; } while(0)
# define BODY_16_19(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T+=E+K_00_19+F_00_19(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T; } while(0)
# define BODY_20_39(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T+=E+K_20_39+F_20_39(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T; } while(0)
# define BODY_40_59(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T+=E+K_40_59+F_40_59(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T; } while(0)
# define BODY_60_79(xa,xb,xc,xd) do { \
Xupdate(T,xa,xa,xb,xc,xd); \
T=E+K_60_79+F_60_79(B,C,D); \
E=D, D=C, C=ROTATE(B,30), B=A; \
A=ROTATE(A,5)+T+xa; } while(0)
# if !defined(SHA1_ASM)
static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num)
{
const unsigned char *data = p;
register unsigned MD32_REG_T A, B, C, D, E, T, l;
int i;
SHA_LONG X[16];
A = c->h0;
B = c->h1;
C = c->h2;
D = c->h3;
E = c->h4;
for (;;) {
for (i = 0; i < 16; i++) {
(void)HOST_c2l(data, l);
X[i] = l;
BODY_00_15(X[i]);
}
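/*
 * The remaining 64 rounds reuse X[] as a 16-word circular buffer; the
 * index i is recycled across the groups, which is why the bounds read
 * 0..3, 4..23, 0..19 and 4..23 (4 + 20 + 20 + 20 = 64 rounds).
 */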
for (i = 0; i < 4; i++) {
BODY_16_19(X[i], X[i + 2], X[i + 8], X[(i + 13) & 15]);
}
for (; i < 24; i++) {
BODY_20_39(X[i & 15], X[(i + 2) & 15], X[(i + 8) & 15],
X[(i + 13) & 15]);
}
for (i = 0; i < 20; i++) {
BODY_40_59(X[(i + 8) & 15], X[(i + 10) & 15], X[i & 15],
X[(i + 5) & 15]);
}
for (i = 4; i < 24; i++) {
BODY_60_79(X[(i + 8) & 15], X[(i + 10) & 15], X[i & 15],
X[(i + 5) & 15]);
}
c->h0 = (c->h0 + A) & 0xffffffffL;
c->h1 = (c->h1 + B) & 0xffffffffL;
c->h2 = (c->h2 + C) & 0xffffffffL;
c->h3 = (c->h3 + D) & 0xffffffffL;
c->h4 = (c->h4 + E) & 0xffffffffL;
if (--num == 0)
break;
A = c->h0;
B = c->h1;
C = c->h2;
D = c->h3;
E = c->h4;
}
}
# endif
#endif