Mirror of https://github.com/ossrs/srs.git, synced 2025-03-09 15:49:59 +00:00

commit 96dbd7bced (parent 8f1c992379)
Upgrade openssl from 1.1.0e to 1.1.1b, with source code. 4.0.78

1476 changed files with 616554 additions and 4 deletions
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/aesni-gcm-x86_64.pl (vendored, new file, 1099 lines)
File diff suppressed because it is too large.
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-alpha.pl (vendored, new file, 467 lines)
@@ -0,0 +1,467 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled in respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% than code generated by vendor
# compiler.

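The "4-bit" table-driven multiplication described above can be sketched in plain C for reference, loosely following OpenSSL's generic gcm128.c. This is an editorial illustration, not part of the vendored file; the u128 struct, the pre-shifted rem_4bit constants and the function name are assumptions made for the sketch.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Reduction constants for the four bits shifted out of Z, pre-shifted into
 * the top 16 bits of the high word (same values as the rem_4bit table the
 * assembly emits further down). */
static const uint64_t rem_4bit[16] = {
    0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
    0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
    0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
    0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

/* Xi is the 16-byte hash value (stored big-endian); Htable is the 16-entry
 * per-key table built by gcm_init. */
static void gmult_4bit_sketch(uint8_t Xi[16], const u128 Htable[16])
{
    u128 Z;
    uint64_t rem;
    unsigned nlo = Xi[15], nhi = nlo >> 4;
    int cnt = 15, i;

    nlo &= 0xf;
    Z = Htable[nlo];

    for (;;) {
        /* Z >>= 4 in GF(2^128), folding the shifted-out nibble back in */
        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = Xi[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem  = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    /* Write Z back big-endian, analogous to the byte-swap epilogue in the
     * little-endian Alpha assembly. */
    for (i = 0; i < 8; i++) {
        Xi[i]     = (uint8_t)(Z.hi >> (56 - 8 * i));
        Xi[8 + i] = (uint8_t)(Z.lo >> (56 - 8 * i));
    }
}

The assembly that follows keeps this dataflow but modulo-schedules it so the table loads for one nibble overlap the shift/xor work of the previous one, as the header comment notes.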
$cnt="v0"; # $0
|
||||
$t0="t0";
|
||||
$t1="t1";
|
||||
$t2="t2";
|
||||
$Thi0="t3"; # $4
|
||||
$Tlo0="t4";
|
||||
$Thi1="t5";
|
||||
$Tlo1="t6";
|
||||
$rem="t7"; # $8
|
||||
#################
|
||||
$Xi="a0"; # $16, input argument block
|
||||
$Htbl="a1";
|
||||
$inp="a2";
|
||||
$len="a3";
|
||||
$nlo="a4"; # $20
|
||||
$nhi="a5";
|
||||
$Zhi="t8";
|
||||
$Zlo="t9";
|
||||
$Xhi="t10"; # $24
|
||||
$Xlo="t11";
|
||||
$remp="t12";
|
||||
$rem_4bit="AT"; # $28
|
||||
|
||||
{ my $N;
|
||||
sub loop() {
|
||||
|
||||
$N++;
|
||||
$code.=<<___;
|
||||
.align 4
|
||||
extbl $Xlo,7,$nlo
|
||||
and $nlo,0xf0,$nhi
|
||||
sll $nlo,4,$nlo
|
||||
and $nlo,0xf0,$nlo
|
||||
|
||||
addq $nlo,$Htbl,$nlo
|
||||
ldq $Zlo,8($nlo)
|
||||
addq $nhi,$Htbl,$nhi
|
||||
ldq $Zhi,0($nlo)
|
||||
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
lda $cnt,6(zero)
|
||||
extbl $Xlo,6,$nlo
|
||||
|
||||
ldq $Tlo1,8($nhi)
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
ldq $Thi1,0($nhi)
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
xor $t0,$Zlo,$Zlo
|
||||
and $nlo,0xf0,$nhi
|
||||
|
||||
xor $Tlo1,$Zlo,$Zlo
|
||||
sll $nlo,4,$nlo
|
||||
xor $Thi1,$Zhi,$Zhi
|
||||
and $nlo,0xf0,$nlo
|
||||
|
||||
addq $nlo,$Htbl,$nlo
|
||||
ldq $Tlo0,8($nlo)
|
||||
addq $nhi,$Htbl,$nhi
|
||||
ldq $Thi0,0($nlo)
|
||||
|
||||
.Looplo$N:
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
subq $cnt,1,$cnt
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
ldq $Tlo1,8($nhi)
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldq $Thi1,0($nhi)
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
xor $t0,$Zlo,$Zlo
|
||||
extbl $Xlo,$cnt,$nlo
|
||||
|
||||
and $nlo,0xf0,$nhi
|
||||
xor $Thi0,$Zhi,$Zhi
|
||||
xor $Tlo0,$Zlo,$Zlo
|
||||
sll $nlo,4,$nlo
|
||||
|
||||
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
and $nlo,0xf0,$nlo
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
xor $rem,$Zhi,$Zhi
|
||||
addq $nlo,$Htbl,$nlo
|
||||
addq $nhi,$Htbl,$nhi
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
ldq $Tlo0,8($nlo)
|
||||
xor $t0,$Zlo,$Zlo
|
||||
|
||||
xor $Tlo1,$Zlo,$Zlo
|
||||
xor $Thi1,$Zhi,$Zhi
|
||||
ldq $Thi0,0($nlo)
|
||||
bne $cnt,.Looplo$N
|
||||
|
||||
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
lda $cnt,7(zero)
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
ldq $Tlo1,8($nhi)
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldq $Thi1,0($nhi)
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
xor $t0,$Zlo,$Zlo
|
||||
extbl $Xhi,$cnt,$nlo
|
||||
|
||||
and $nlo,0xf0,$nhi
|
||||
xor $Thi0,$Zhi,$Zhi
|
||||
xor $Tlo0,$Zlo,$Zlo
|
||||
sll $nlo,4,$nlo
|
||||
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
and $nlo,0xf0,$nlo
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
xor $rem,$Zhi,$Zhi
|
||||
addq $nlo,$Htbl,$nlo
|
||||
addq $nhi,$Htbl,$nhi
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
ldq $Tlo0,8($nlo)
|
||||
xor $t0,$Zlo,$Zlo
|
||||
|
||||
xor $Tlo1,$Zlo,$Zlo
|
||||
xor $Thi1,$Zhi,$Zhi
|
||||
ldq $Thi0,0($nlo)
|
||||
unop
|
||||
|
||||
|
||||
.Loophi$N:
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
subq $cnt,1,$cnt
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
ldq $Tlo1,8($nhi)
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldq $Thi1,0($nhi)
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
xor $t0,$Zlo,$Zlo
|
||||
extbl $Xhi,$cnt,$nlo
|
||||
|
||||
and $nlo,0xf0,$nhi
|
||||
xor $Thi0,$Zhi,$Zhi
|
||||
xor $Tlo0,$Zlo,$Zlo
|
||||
sll $nlo,4,$nlo
|
||||
|
||||
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
and $nlo,0xf0,$nlo
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
xor $rem,$Zhi,$Zhi
|
||||
addq $nlo,$Htbl,$nlo
|
||||
addq $nhi,$Htbl,$nhi
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
ldq $Tlo0,8($nlo)
|
||||
xor $t0,$Zlo,$Zlo
|
||||
|
||||
xor $Tlo1,$Zlo,$Zlo
|
||||
xor $Thi1,$Zhi,$Zhi
|
||||
ldq $Thi0,0($nlo)
|
||||
bne $cnt,.Loophi$N
|
||||
|
||||
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
ldq $Tlo1,8($nhi)
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldq $Thi1,0($nhi)
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
xor $t0,$Zlo,$Zlo
|
||||
|
||||
xor $Tlo0,$Zlo,$Zlo
|
||||
xor $Thi0,$Zhi,$Zhi
|
||||
|
||||
and $Zlo,0x0f,$remp
|
||||
sll $Zhi,60,$t0
|
||||
srl $Zlo,4,$Zlo
|
||||
|
||||
s8addq $remp,$rem_4bit,$remp
|
||||
xor $rem,$Zhi,$Zhi
|
||||
|
||||
ldq $rem,0($remp)
|
||||
srl $Zhi,4,$Zhi
|
||||
xor $Tlo1,$Zlo,$Zlo
|
||||
xor $Thi1,$Zhi,$Zhi
|
||||
xor $t0,$Zlo,$Zlo
|
||||
xor $rem,$Zhi,$Zhi
|
||||
___
|
||||
}}
|
||||
|
||||
$code=<<___;
|
||||
#ifdef __linux__
|
||||
#include <asm/regdef.h>
|
||||
#else
|
||||
#include <asm.h>
|
||||
#include <regdef.h>
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
.set noat
|
||||
.set noreorder
|
||||
.globl gcm_gmult_4bit
|
||||
.align 4
|
||||
.ent gcm_gmult_4bit
|
||||
gcm_gmult_4bit:
|
||||
.frame sp,0,ra
|
||||
.prologue 0
|
||||
|
||||
ldq $Xlo,8($Xi)
|
||||
ldq $Xhi,0($Xi)
|
||||
|
||||
bsr $t0,picmeup
|
||||
nop
|
||||
___
|
||||
|
||||
&loop();
|
||||
|
||||
$code.=<<___;
|
||||
srl $Zlo,24,$t0 # byte swap
|
||||
srl $Zlo,8,$t1
|
||||
|
||||
sll $Zlo,8,$t2
|
||||
sll $Zlo,24,$Zlo
|
||||
zapnot $t0,0x11,$t0
|
||||
zapnot $t1,0x22,$t1
|
||||
|
||||
zapnot $Zlo,0x88,$Zlo
|
||||
or $t0,$t1,$t0
|
||||
zapnot $t2,0x44,$t2
|
||||
|
||||
or $Zlo,$t0,$Zlo
|
||||
srl $Zhi,24,$t0
|
||||
srl $Zhi,8,$t1
|
||||
|
||||
or $Zlo,$t2,$Zlo
|
||||
sll $Zhi,8,$t2
|
||||
sll $Zhi,24,$Zhi
|
||||
|
||||
srl $Zlo,32,$Xlo
|
||||
sll $Zlo,32,$Zlo
|
||||
|
||||
zapnot $t0,0x11,$t0
|
||||
zapnot $t1,0x22,$t1
|
||||
or $Zlo,$Xlo,$Xlo
|
||||
|
||||
zapnot $Zhi,0x88,$Zhi
|
||||
or $t0,$t1,$t0
|
||||
zapnot $t2,0x44,$t2
|
||||
|
||||
or $Zhi,$t0,$Zhi
|
||||
or $Zhi,$t2,$Zhi
|
||||
|
||||
srl $Zhi,32,$Xhi
|
||||
sll $Zhi,32,$Zhi
|
||||
|
||||
or $Zhi,$Xhi,$Xhi
|
||||
stq $Xlo,8($Xi)
|
||||
stq $Xhi,0($Xi)
|
||||
|
||||
ret (ra)
|
||||
.end gcm_gmult_4bit
|
||||
___
|
||||
|
||||
$inhi="s0";
|
||||
$inlo="s1";
|
||||
|
||||
$code.=<<___;
|
||||
.globl gcm_ghash_4bit
|
||||
.align 4
|
||||
.ent gcm_ghash_4bit
|
||||
gcm_ghash_4bit:
|
||||
lda sp,-32(sp)
|
||||
stq ra,0(sp)
|
||||
stq s0,8(sp)
|
||||
stq s1,16(sp)
|
||||
.mask 0x04000600,-32
|
||||
.frame sp,32,ra
|
||||
.prologue 0
|
||||
|
||||
ldq_u $inhi,0($inp)
|
||||
ldq_u $Thi0,7($inp)
|
||||
ldq_u $inlo,8($inp)
|
||||
ldq_u $Tlo0,15($inp)
|
||||
ldq $Xhi,0($Xi)
|
||||
ldq $Xlo,8($Xi)
|
||||
|
||||
bsr $t0,picmeup
|
||||
nop
|
||||
|
||||
.Louter:
|
||||
extql $inhi,$inp,$inhi
|
||||
extqh $Thi0,$inp,$Thi0
|
||||
or $inhi,$Thi0,$inhi
|
||||
lda $inp,16($inp)
|
||||
|
||||
extql $inlo,$inp,$inlo
|
||||
extqh $Tlo0,$inp,$Tlo0
|
||||
or $inlo,$Tlo0,$inlo
|
||||
subq $len,16,$len
|
||||
|
||||
xor $Xlo,$inlo,$Xlo
|
||||
xor $Xhi,$inhi,$Xhi
|
||||
___
|
||||
|
||||
&loop();
|
||||
|
||||
$code.=<<___;
|
||||
srl $Zlo,24,$t0 # byte swap
|
||||
srl $Zlo,8,$t1
|
||||
|
||||
sll $Zlo,8,$t2
|
||||
sll $Zlo,24,$Zlo
|
||||
zapnot $t0,0x11,$t0
|
||||
zapnot $t1,0x22,$t1
|
||||
|
||||
zapnot $Zlo,0x88,$Zlo
|
||||
or $t0,$t1,$t0
|
||||
zapnot $t2,0x44,$t2
|
||||
|
||||
or $Zlo,$t0,$Zlo
|
||||
srl $Zhi,24,$t0
|
||||
srl $Zhi,8,$t1
|
||||
|
||||
or $Zlo,$t2,$Zlo
|
||||
sll $Zhi,8,$t2
|
||||
sll $Zhi,24,$Zhi
|
||||
|
||||
srl $Zlo,32,$Xlo
|
||||
sll $Zlo,32,$Zlo
|
||||
beq $len,.Ldone
|
||||
|
||||
zapnot $t0,0x11,$t0
|
||||
zapnot $t1,0x22,$t1
|
||||
or $Zlo,$Xlo,$Xlo
|
||||
ldq_u $inhi,0($inp)
|
||||
|
||||
zapnot $Zhi,0x88,$Zhi
|
||||
or $t0,$t1,$t0
|
||||
zapnot $t2,0x44,$t2
|
||||
ldq_u $Thi0,7($inp)
|
||||
|
||||
or $Zhi,$t0,$Zhi
|
||||
or $Zhi,$t2,$Zhi
|
||||
ldq_u $inlo,8($inp)
|
||||
ldq_u $Tlo0,15($inp)
|
||||
|
||||
srl $Zhi,32,$Xhi
|
||||
sll $Zhi,32,$Zhi
|
||||
|
||||
or $Zhi,$Xhi,$Xhi
|
||||
br zero,.Louter
|
||||
|
||||
.Ldone:
|
||||
zapnot $t0,0x11,$t0
|
||||
zapnot $t1,0x22,$t1
|
||||
or $Zlo,$Xlo,$Xlo
|
||||
|
||||
zapnot $Zhi,0x88,$Zhi
|
||||
or $t0,$t1,$t0
|
||||
zapnot $t2,0x44,$t2
|
||||
|
||||
or $Zhi,$t0,$Zhi
|
||||
or $Zhi,$t2,$Zhi
|
||||
|
||||
srl $Zhi,32,$Xhi
|
||||
sll $Zhi,32,$Zhi
|
||||
|
||||
or $Zhi,$Xhi,$Xhi
|
||||
|
||||
stq $Xlo,8($Xi)
|
||||
stq $Xhi,0($Xi)
|
||||
|
||||
.set noreorder
|
||||
/*ldq ra,0(sp)*/
|
||||
ldq s0,8(sp)
|
||||
ldq s1,16(sp)
|
||||
lda sp,32(sp)
|
||||
ret (ra)
|
||||
.end gcm_ghash_4bit
|
||||
|
||||
.align 4
|
||||
.ent picmeup
|
||||
picmeup:
|
||||
.frame sp,0,$t0
|
||||
.prologue 0
|
||||
br $rem_4bit,.Lpic
|
||||
.Lpic: lda $rem_4bit,12($rem_4bit)
|
||||
ret ($t0)
|
||||
.end picmeup
|
||||
nop
|
||||
rem_4bit:
|
||||
.long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
|
||||
.long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
|
||||
.long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
|
||||
.long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
|
||||
.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 4
|
||||
|
||||
___
|
||||
$output=pop and open STDOUT,">$output";
|
||||
print $code;
|
||||
close STDOUT;
|
||||
|
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-armv4.pl (vendored, new file, 551 lines)
@@ -0,0 +1,551 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "538B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...

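The NEON path added in March 2011/April 2014 builds the 128x128-bit carry-less product from three 64x64-bit polynomial multiplications, the Karatsuba split that the clmul64x64 helper and the "Karatsuba pre-/post-processing" comments further down implement with vmull.p8. A plain C sketch of that split follows; it is illustrative only, not part of the vendored file, and it omits the final reduction modulo the GHASH polynomial.

#include <stdint.h>

/* Bitwise 64x64 -> 128-bit carry-less multiply; a slow stand-in for the
 * vmull.p8-based multiplication the NEON code performs. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t rh = 0, rl = 0;
    int i;
    for (i = 0; i < 64; i++)
        if ((b >> i) & 1) {
            rl ^= a << i;
            if (i) rh ^= a >> (64 - i);
        }
    *hi = rh;
    *lo = rl;
}

/* (Ah:Al) x (Bh:Bl) -> 256-bit carry-less product r[3]:r[2]:r[1]:r[0],
 * built from three 64x64 multiplies via Karatsuba. */
static void clmul128_karatsuba(uint64_t Ah, uint64_t Al,
                               uint64_t Bh, uint64_t Bl, uint64_t r[4])
{
    uint64_t ll_hi, ll_lo, hh_hi, hh_lo, mm_hi, mm_lo;

    clmul64(Al, Bl, &ll_hi, &ll_lo);            /* low  x low        */
    clmul64(Ah, Bh, &hh_hi, &hh_lo);            /* high x high       */
    clmul64(Al ^ Ah, Bl ^ Bh, &mm_hi, &mm_lo);  /* (lo+hi) x (lo+hi) */
    mm_lo ^= ll_lo ^ hh_lo;                     /* Karatsuba         */
    mm_hi ^= ll_hi ^ hh_hi;                     /* post-processing   */

    r[0] = ll_lo;
    r[1] = ll_hi ^ mm_lo;
    r[2] = hh_lo ^ mm_hi;
    r[3] = hh_hi;
}

The three partial products correspond to the H.lo·Xi.lo, (H.lo+H.hi)·(Xi.lo+Xi.hi) and H.hi·Xi.hi calls to clmul64x64 in gcm_gmult_neon/gcm_ghash_neon later in this file.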
$flavour = shift;
|
||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
|
||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
|
||||
|
||||
if ($flavour && $flavour ne "void") {
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||
} else {
|
||||
open STDOUT,">$output";
|
||||
}
|
||||
|
||||
$Xi="r0"; # argument block
|
||||
$Htbl="r1";
|
||||
$inp="r2";
|
||||
$len="r3";
|
||||
|
||||
$Zll="r4"; # variables
|
||||
$Zlh="r5";
|
||||
$Zhl="r6";
|
||||
$Zhh="r7";
|
||||
$Tll="r8";
|
||||
$Tlh="r9";
|
||||
$Thl="r10";
|
||||
$Thh="r11";
|
||||
$nlo="r12";
|
||||
################# r13 is stack pointer
|
||||
$nhi="r14";
|
||||
################# r15 is program counter
|
||||
|
||||
$rem_4bit=$inp; # used in gcm_gmult_4bit
|
||||
$cnt=$len;
|
||||
|
||||
sub Zsmash() {
|
||||
my $i=12;
|
||||
my @args=@_;
|
||||
for ($Zll,$Zlh,$Zhl,$Zhh) {
|
||||
$code.=<<___;
|
||||
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
|
||||
rev $_,$_
|
||||
str $_,[$Xi,#$i]
|
||||
#elif defined(__ARMEB__)
|
||||
str $_,[$Xi,#$i]
|
||||
#else
|
||||
mov $Tlh,$_,lsr#8
|
||||
strb $_,[$Xi,#$i+3]
|
||||
mov $Thl,$_,lsr#16
|
||||
strb $Tlh,[$Xi,#$i+2]
|
||||
mov $Thh,$_,lsr#24
|
||||
strb $Thl,[$Xi,#$i+1]
|
||||
strb $Thh,[$Xi,#$i]
|
||||
#endif
|
||||
___
|
||||
$code.="\t".shift(@args)."\n";
|
||||
$i-=4;
|
||||
}
|
||||
}
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
.text
|
||||
#if defined(__thumb2__) || defined(__clang__)
|
||||
.syntax unified
|
||||
#define ldrplb ldrbpl
|
||||
#define ldrneb ldrbne
|
||||
#endif
|
||||
#if defined(__thumb2__)
|
||||
.thumb
|
||||
#else
|
||||
.code 32
|
||||
#endif
|
||||
|
||||
.type rem_4bit,%object
|
||||
.align 5
|
||||
rem_4bit:
|
||||
.short 0x0000,0x1C20,0x3840,0x2460
|
||||
.short 0x7080,0x6CA0,0x48C0,0x54E0
|
||||
.short 0xE100,0xFD20,0xD940,0xC560
|
||||
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
|
||||
.size rem_4bit,.-rem_4bit
|
||||
|
||||
.type rem_4bit_get,%function
|
||||
rem_4bit_get:
|
||||
#if defined(__thumb2__)
|
||||
adr $rem_4bit,rem_4bit
|
||||
#else
|
||||
sub $rem_4bit,pc,#8+32 @ &rem_4bit
|
||||
#endif
|
||||
b .Lrem_4bit_got
|
||||
nop
|
||||
nop
|
||||
.size rem_4bit_get,.-rem_4bit_get
|
||||
|
||||
.global gcm_ghash_4bit
|
||||
.type gcm_ghash_4bit,%function
|
||||
.align 4
|
||||
gcm_ghash_4bit:
|
||||
#if defined(__thumb2__)
|
||||
adr r12,rem_4bit
|
||||
#else
|
||||
sub r12,pc,#8+48 @ &rem_4bit
|
||||
#endif
|
||||
add $len,$inp,$len @ $len to point at the end
|
||||
stmdb sp!,{r3-r11,lr} @ save $len/end too
|
||||
|
||||
ldmia r12,{r4-r11} @ copy rem_4bit ...
|
||||
stmdb sp!,{r4-r11} @ ... to stack
|
||||
|
||||
ldrb $nlo,[$inp,#15]
|
||||
ldrb $nhi,[$Xi,#15]
|
||||
.Louter:
|
||||
eor $nlo,$nlo,$nhi
|
||||
and $nhi,$nlo,#0xf0
|
||||
and $nlo,$nlo,#0x0f
|
||||
mov $cnt,#14
|
||||
|
||||
add $Zhh,$Htbl,$nlo,lsl#4
|
||||
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
|
||||
add $Thh,$Htbl,$nhi
|
||||
ldrb $nlo,[$inp,#14]
|
||||
|
||||
and $nhi,$Zll,#0xf @ rem
|
||||
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
||||
add $nhi,$nhi,$nhi
|
||||
eor $Zll,$Tll,$Zll,lsr#4
|
||||
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
|
||||
eor $Zll,$Zll,$Zlh,lsl#28
|
||||
ldrb $nhi,[$Xi,#14]
|
||||
eor $Zlh,$Tlh,$Zlh,lsr#4
|
||||
eor $Zlh,$Zlh,$Zhl,lsl#28
|
||||
eor $Zhl,$Thl,$Zhl,lsr#4
|
||||
eor $Zhl,$Zhl,$Zhh,lsl#28
|
||||
eor $Zhh,$Thh,$Zhh,lsr#4
|
||||
eor $nlo,$nlo,$nhi
|
||||
and $nhi,$nlo,#0xf0
|
||||
and $nlo,$nlo,#0x0f
|
||||
eor $Zhh,$Zhh,$Tll,lsl#16
|
||||
|
||||
.Linner:
|
||||
add $Thh,$Htbl,$nlo,lsl#4
|
||||
and $nlo,$Zll,#0xf @ rem
|
||||
subs $cnt,$cnt,#1
|
||||
add $nlo,$nlo,$nlo
|
||||
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
|
||||
eor $Zll,$Tll,$Zll,lsr#4
|
||||
eor $Zll,$Zll,$Zlh,lsl#28
|
||||
eor $Zlh,$Tlh,$Zlh,lsr#4
|
||||
eor $Zlh,$Zlh,$Zhl,lsl#28
|
||||
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
|
||||
eor $Zhl,$Thl,$Zhl,lsr#4
|
||||
#ifdef __thumb2__
|
||||
it pl
|
||||
#endif
|
||||
ldrplb $nlo,[$inp,$cnt]
|
||||
eor $Zhl,$Zhl,$Zhh,lsl#28
|
||||
eor $Zhh,$Thh,$Zhh,lsr#4
|
||||
|
||||
add $Thh,$Htbl,$nhi
|
||||
and $nhi,$Zll,#0xf @ rem
|
||||
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
|
||||
add $nhi,$nhi,$nhi
|
||||
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
||||
eor $Zll,$Tll,$Zll,lsr#4
|
||||
#ifdef __thumb2__
|
||||
it pl
|
||||
#endif
|
||||
ldrplb $Tll,[$Xi,$cnt]
|
||||
eor $Zll,$Zll,$Zlh,lsl#28
|
||||
eor $Zlh,$Tlh,$Zlh,lsr#4
|
||||
ldrh $Tlh,[sp,$nhi]
|
||||
eor $Zlh,$Zlh,$Zhl,lsl#28
|
||||
eor $Zhl,$Thl,$Zhl,lsr#4
|
||||
eor $Zhl,$Zhl,$Zhh,lsl#28
|
||||
#ifdef __thumb2__
|
||||
it pl
|
||||
#endif
|
||||
eorpl $nlo,$nlo,$Tll
|
||||
eor $Zhh,$Thh,$Zhh,lsr#4
|
||||
#ifdef __thumb2__
|
||||
itt pl
|
||||
#endif
|
||||
andpl $nhi,$nlo,#0xf0
|
||||
andpl $nlo,$nlo,#0x0f
|
||||
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
|
||||
bpl .Linner
|
||||
|
||||
ldr $len,[sp,#32] @ re-load $len/end
|
||||
add $inp,$inp,#16
|
||||
mov $nhi,$Zll
|
||||
___
|
||||
&Zsmash("cmp\t$inp,$len","\n".
|
||||
"#ifdef __thumb2__\n".
|
||||
" it ne\n".
|
||||
"#endif\n".
|
||||
" ldrneb $nlo,[$inp,#15]");
|
||||
$code.=<<___;
|
||||
bne .Louter
|
||||
|
||||
add sp,sp,#36
|
||||
#if __ARM_ARCH__>=5
|
||||
ldmia sp!,{r4-r11,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r11,lr}
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size gcm_ghash_4bit,.-gcm_ghash_4bit
|
||||
|
||||
.global gcm_gmult_4bit
|
||||
.type gcm_gmult_4bit,%function
|
||||
gcm_gmult_4bit:
|
||||
stmdb sp!,{r4-r11,lr}
|
||||
ldrb $nlo,[$Xi,#15]
|
||||
b rem_4bit_get
|
||||
.Lrem_4bit_got:
|
||||
and $nhi,$nlo,#0xf0
|
||||
and $nlo,$nlo,#0x0f
|
||||
mov $cnt,#14
|
||||
|
||||
add $Zhh,$Htbl,$nlo,lsl#4
|
||||
ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
|
||||
ldrb $nlo,[$Xi,#14]
|
||||
|
||||
add $Thh,$Htbl,$nhi
|
||||
and $nhi,$Zll,#0xf @ rem
|
||||
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
||||
add $nhi,$nhi,$nhi
|
||||
eor $Zll,$Tll,$Zll,lsr#4
|
||||
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
|
||||
eor $Zll,$Zll,$Zlh,lsl#28
|
||||
eor $Zlh,$Tlh,$Zlh,lsr#4
|
||||
eor $Zlh,$Zlh,$Zhl,lsl#28
|
||||
eor $Zhl,$Thl,$Zhl,lsr#4
|
||||
eor $Zhl,$Zhl,$Zhh,lsl#28
|
||||
eor $Zhh,$Thh,$Zhh,lsr#4
|
||||
and $nhi,$nlo,#0xf0
|
||||
eor $Zhh,$Zhh,$Tll,lsl#16
|
||||
and $nlo,$nlo,#0x0f
|
||||
|
||||
.Loop:
|
||||
add $Thh,$Htbl,$nlo,lsl#4
|
||||
and $nlo,$Zll,#0xf @ rem
|
||||
subs $cnt,$cnt,#1
|
||||
add $nlo,$nlo,$nlo
|
||||
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
|
||||
eor $Zll,$Tll,$Zll,lsr#4
|
||||
eor $Zll,$Zll,$Zlh,lsl#28
|
||||
eor $Zlh,$Tlh,$Zlh,lsr#4
|
||||
eor $Zlh,$Zlh,$Zhl,lsl#28
|
||||
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
|
||||
eor $Zhl,$Thl,$Zhl,lsr#4
|
||||
#ifdef __thumb2__
|
||||
it pl
|
||||
#endif
|
||||
ldrplb $nlo,[$Xi,$cnt]
|
||||
eor $Zhl,$Zhl,$Zhh,lsl#28
|
||||
eor $Zhh,$Thh,$Zhh,lsr#4
|
||||
|
||||
add $Thh,$Htbl,$nhi
|
||||
and $nhi,$Zll,#0xf @ rem
|
||||
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
|
||||
add $nhi,$nhi,$nhi
|
||||
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
|
||||
eor $Zll,$Tll,$Zll,lsr#4
|
||||
eor $Zll,$Zll,$Zlh,lsl#28
|
||||
eor $Zlh,$Tlh,$Zlh,lsr#4
|
||||
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
|
||||
eor $Zlh,$Zlh,$Zhl,lsl#28
|
||||
eor $Zhl,$Thl,$Zhl,lsr#4
|
||||
eor $Zhl,$Zhl,$Zhh,lsl#28
|
||||
eor $Zhh,$Thh,$Zhh,lsr#4
|
||||
#ifdef __thumb2__
|
||||
itt pl
|
||||
#endif
|
||||
andpl $nhi,$nlo,#0xf0
|
||||
andpl $nlo,$nlo,#0x0f
|
||||
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
|
||||
bpl .Loop
|
||||
___
|
||||
&Zsmash();
|
||||
$code.=<<___;
|
||||
#if __ARM_ARCH__>=5
|
||||
ldmia sp!,{r4-r11,pc}
|
||||
#else
|
||||
ldmia sp!,{r4-r11,lr}
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size gcm_gmult_4bit,.-gcm_gmult_4bit
|
||||
___
|
||||
{
|
||||
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
||||
my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
|
||||
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
|
||||
|
||||
sub clmul64x64 {
|
||||
my ($r,$a,$b)=@_;
|
||||
$code.=<<___;
|
||||
vext.8 $t0#lo, $a, $a, #1 @ A1
|
||||
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
|
||||
vext.8 $r#lo, $b, $b, #1 @ B1
|
||||
vmull.p8 $r, $a, $r#lo @ E = A*B1
|
||||
vext.8 $t1#lo, $a, $a, #2 @ A2
|
||||
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
|
||||
vext.8 $t3#lo, $b, $b, #2 @ B2
|
||||
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
|
||||
vext.8 $t2#lo, $a, $a, #3 @ A3
|
||||
veor $t0, $t0, $r @ L = E + F
|
||||
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
|
||||
vext.8 $r#lo, $b, $b, #3 @ B3
|
||||
veor $t1, $t1, $t3 @ M = G + H
|
||||
vmull.p8 $r, $a, $r#lo @ I = A*B3
|
||||
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
|
||||
vand $t0#hi, $t0#hi, $k48
|
||||
vext.8 $t3#lo, $b, $b, #4 @ B4
|
||||
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
|
||||
vand $t1#hi, $t1#hi, $k32
|
||||
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
|
||||
veor $t2, $t2, $r @ N = I + J
|
||||
veor $t0#lo, $t0#lo, $t0#hi
|
||||
veor $t1#lo, $t1#lo, $t1#hi
|
||||
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
|
||||
vand $t2#hi, $t2#hi, $k16
|
||||
vext.8 $t0, $t0, $t0, #15
|
||||
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
|
||||
vmov.i64 $t3#hi, #0
|
||||
vext.8 $t1, $t1, $t1, #14
|
||||
veor $t2#lo, $t2#lo, $t2#hi
|
||||
vmull.p8 $r, $a, $b @ D = A*B
|
||||
vext.8 $t3, $t3, $t3, #12
|
||||
vext.8 $t2, $t2, $t2, #13
|
||||
veor $t0, $t0, $t1
|
||||
veor $t2, $t2, $t3
|
||||
veor $r, $r, $t0
|
||||
veor $r, $r, $t2
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
|
||||
.global gcm_init_neon
|
||||
.type gcm_init_neon,%function
|
||||
.align 4
|
||||
gcm_init_neon:
|
||||
vld1.64 $IN#hi,[r1]! @ load H
|
||||
vmov.i8 $t0,#0xe1
|
||||
vld1.64 $IN#lo,[r1]
|
||||
vshl.i64 $t0#hi,#57
|
||||
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
|
||||
vdup.8 $t1,$IN#hi[7]
|
||||
vshr.u64 $Hlo,$IN#lo,#63
|
||||
vshr.s8 $t1,#7 @ broadcast carry bit
|
||||
vshl.i64 $IN,$IN,#1
|
||||
vand $t0,$t0,$t1
|
||||
vorr $IN#hi,$Hlo @ H<<<=1
|
||||
veor $IN,$IN,$t0 @ twisted H
|
||||
vstmia r0,{$IN}
|
||||
|
||||
ret @ bx lr
|
||||
.size gcm_init_neon,.-gcm_init_neon
|
||||
|
||||
.global gcm_gmult_neon
|
||||
.type gcm_gmult_neon,%function
|
||||
.align 4
|
||||
gcm_gmult_neon:
|
||||
vld1.64 $IN#hi,[$Xi]! @ load Xi
|
||||
vld1.64 $IN#lo,[$Xi]!
|
||||
vmov.i64 $k48,#0x0000ffffffffffff
|
||||
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
||||
vmov.i64 $k32,#0x00000000ffffffff
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $IN,$IN
|
||||
#endif
|
||||
vmov.i64 $k16,#0x000000000000ffff
|
||||
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
||||
mov $len,#16
|
||||
b .Lgmult_neon
|
||||
.size gcm_gmult_neon,.-gcm_gmult_neon
|
||||
|
||||
.global gcm_ghash_neon
|
||||
.type gcm_ghash_neon,%function
|
||||
.align 4
|
||||
gcm_ghash_neon:
|
||||
vld1.64 $Xl#hi,[$Xi]! @ load Xi
|
||||
vld1.64 $Xl#lo,[$Xi]!
|
||||
vmov.i64 $k48,#0x0000ffffffffffff
|
||||
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
|
||||
vmov.i64 $k32,#0x00000000ffffffff
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vmov.i64 $k16,#0x000000000000ffff
|
||||
veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
|
||||
|
||||
.Loop_neon:
|
||||
vld1.64 $IN#hi,[$inp]! @ load inp
|
||||
vld1.64 $IN#lo,[$inp]!
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $IN,$IN
|
||||
#endif
|
||||
veor $IN,$Xl @ inp^=Xi
|
||||
.Lgmult_neon:
|
||||
___
|
||||
&clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
|
||||
$code.=<<___;
|
||||
veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
|
||||
___
|
||||
&clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
&clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
|
||||
$code.=<<___;
|
||||
veor $Xm,$Xm,$Xl @ Karatsuba post-processing
|
||||
veor $Xm,$Xm,$Xh
|
||||
veor $Xl#hi,$Xl#hi,$Xm#lo
|
||||
veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result
|
||||
|
||||
@ equivalent of reduction_avx from ghash-x86_64.pl
|
||||
vshl.i64 $t1,$Xl,#57 @ 1st phase
|
||||
vshl.i64 $t2,$Xl,#62
|
||||
veor $t2,$t2,$t1 @
|
||||
vshl.i64 $t1,$Xl,#63
|
||||
veor $t2, $t2, $t1 @
|
||||
veor $Xl#hi,$Xl#hi,$t2#lo @
|
||||
veor $Xh#lo,$Xh#lo,$t2#hi
|
||||
|
||||
vshr.u64 $t2,$Xl,#1 @ 2nd phase
|
||||
veor $Xh,$Xh,$Xl
|
||||
veor $Xl,$Xl,$t2 @
|
||||
vshr.u64 $t2,$t2,#6
|
||||
vshr.u64 $Xl,$Xl,#1 @
|
||||
veor $Xl,$Xl,$Xh @
|
||||
veor $Xl,$Xl,$t2 @
|
||||
|
||||
subs $len,#16
|
||||
bne .Loop_neon
|
||||
|
||||
#ifdef __ARMEL__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
sub $Xi,#16
|
||||
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
|
||||
vst1.64 $Xl#lo,[$Xi]
|
||||
|
||||
ret @ bx lr
|
||||
.size gcm_ghash_neon,.-gcm_ghash_neon
|
||||
#endif
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/\bret\b/bx lr/go or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT; # enforce flush
|
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-c64xplus.pl (vendored, new file, 247 lines)
@@ -0,0 +1,247 @@
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2011
#
# The module implements GCM GHASH function and underlying single
# multiplication operation in GF(2^128). Even though subroutines
# have _4bit suffix, they are not using any tables, but rely on
# hardware Galois Field Multiply support. Streamed GHASH processes
# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
# comparing apples vs. oranges, but compiler surely could have done
# better, because theoretical [though not necessarily achievable]
# estimate for "4-bit" table-driven implementation is ~12 cycles.

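Whatever the implementation strategy (lookup tables or, as here, the hardware Galois Field multiplier), the operation being computed is the GF(2^128) product that GHASH is defined over in NIST SP 800-38D. A direct, slow bit-by-bit rendering in C for reference; this is an illustrative sketch only, not part of the vendored file and not how this module computes it, and the u128 hi/lo struct is an assumption.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Bit-by-bit GF(2^128) multiply with GCM's reduction polynomial
 * x^128 + x^7 + x^2 + x + 1, in the bit-reflected representation. */
static u128 gf128_mul_ref(u128 X, u128 Y)
{
    u128 Z = { 0, 0 }, V = Y;
    int i;

    for (i = 0; i < 128; i++) {
        /* bit i of X, most significant bit of X.hi first */
        uint64_t bit = (i < 64) ? (X.hi >> (63 - i)) & 1
                                : (X.lo >> (127 - i)) & 1;
        uint64_t lsb;

        if (bit) {
            Z.hi ^= V.hi;
            Z.lo ^= V.lo;
        }
        /* V = V * x: shift right, conditionally fold in 0xE1 << 120 */
        lsb  = V.lo & 1;
        V.lo = (V.hi << 63) | (V.lo >> 1);
        V.hi = (V.hi >> 1) ^ (lsb ? 0xe100000000000000ULL : 0);
    }
    return Z;
}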
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6"); # arguments
|
||||
|
||||
($Z0,$Z1,$Z2,$Z3, $H0, $H1, $H2, $H3,
|
||||
$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
|
||||
($H01u,$H01y,$H2u,$H3u, $H0y,$H1y,$H2y,$H3y,
|
||||
$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
|
||||
($FF000000,$E10000)=("B30","B31");
|
||||
($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len
|
||||
$xia="A9";
|
||||
($rem,$res)=("B4","B5"); # $rem zaps $Htable
|
||||
|
||||
$code.=<<___;
|
||||
.text
|
||||
|
||||
.if .ASSEMBLER_VERSION<7000000
|
||||
.asg 0,__TI_EABI__
|
||||
.endif
|
||||
.if __TI_EABI__
|
||||
.asg gcm_gmult_1bit,_gcm_gmult_1bit
|
||||
.asg gcm_gmult_4bit,_gcm_gmult_4bit
|
||||
.asg gcm_ghash_4bit,_gcm_ghash_4bit
|
||||
.endif
|
||||
|
||||
.asg B3,RA
|
||||
|
||||
.if 0
|
||||
.global _gcm_gmult_1bit
|
||||
_gcm_gmult_1bit:
|
||||
ADDAD $Htable,2,$Htable
|
||||
.endif
|
||||
.global _gcm_gmult_4bit
|
||||
_gcm_gmult_4bit:
|
||||
.asmfunc
|
||||
LDDW *${Htable}[-1],$H1:$H0 ; H.lo
|
||||
LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|
||||
|| MV $Xip,${xip} ; reassign Xi
|
||||
|| MVK 15,B1 ; SPLOOPD constant
|
||||
|
||||
MVK 0xE1,$E10000
|
||||
|| LDBU *++${xip}[15],$x1 ; Xi[15]
|
||||
MVK 0xFF,$FF000000
|
||||
|| LDBU *--${xip},$x0 ; Xi[14]
|
||||
SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
|
||||
SHL $FF000000,24,$FF000000 ; upper byte mask
|
||||
|| BNOP ghash_loop?
|
||||
|| MVK 1,B0 ; take a single spin
|
||||
|
||||
PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
|
||||
AND $H2,$FF000000,$H2u ; H2's upper byte
|
||||
AND $H3,$FF000000,$H3u ; H3's upper byte
|
||||
|| SHRU $H2u,8,$H2u
|
||||
SHRU $H3u,8,$H3u
|
||||
|| ZERO $Z1:$Z0
|
||||
SHRU2 $xia,8,$H01u
|
||||
|| ZERO $Z3:$Z2
|
||||
.endasmfunc
|
||||
|
||||
.global _gcm_ghash_4bit
|
||||
_gcm_ghash_4bit:
|
||||
.asmfunc
|
||||
LDDW *${Htable}[-1],$H1:$H0 ; H.lo
|
||||
|| SHRU $len,4,B0 ; reassign len
|
||||
LDDW *${Htable}[-2],$H3:$H2 ; H.hi
|
||||
|| MV $Xip,${xip} ; reassign Xi
|
||||
|| MVK 15,B1 ; SPLOOPD constant
|
||||
|
||||
MVK 0xE1,$E10000
|
||||
|| [B0] LDNDW *${inp}[1],$H1x:$H0x
|
||||
MVK 0xFF,$FF000000
|
||||
|| [B0] LDNDW *${inp}++[2],$H3x:$H2x
|
||||
SHL $E10000,16,$E10000 ; [pre-shifted] reduction polynomial
|
||||
|| LDDW *${xip}[1],$Z1:$Z0
|
||||
SHL $FF000000,24,$FF000000 ; upper byte mask
|
||||
|| LDDW *${xip}[0],$Z3:$Z2
|
||||
|
||||
PACKH2 $H0,$H1,$xia ; pack H0' and H1's upper bytes
|
||||
AND $H2,$FF000000,$H2u ; H2's upper byte
|
||||
AND $H3,$FF000000,$H3u ; H3's upper byte
|
||||
|| SHRU $H2u,8,$H2u
|
||||
SHRU $H3u,8,$H3u
|
||||
SHRU2 $xia,8,$H01u
|
||||
|
||||
|| [B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|
||||
|| [B0] XOR $H1x,$Z1,$Z1
|
||||
.if .LITTLE_ENDIAN
|
||||
[B0] XOR $H2x,$Z2,$Z2
|
||||
|| [B0] XOR $H3x,$Z3,$Z3
|
||||
|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
|
||||
STDW $Z1:$Z0,*${xip}[1]
|
||||
|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
|
||||
|| [B0] ZERO $Z1:$Z0
|
||||
.else
|
||||
[B0] XOR $H2x,$Z2,$Z2
|
||||
|| [B0] XOR $H3x,$Z3,$Z3
|
||||
|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
|
||||
STDW $Z1:$Z0,*${xip}[1]
|
||||
|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
|
||||
|| [B0] ZERO $Z1:$Z0
|
||||
.endif
|
||||
STDW $Z3:$Z2,*${xip}[0]
|
||||
|| [B0] ZERO $Z3:$Z2
|
||||
|| [B0] MV $xia,$x1
|
||||
[B0] ADDK 14,${xip}
|
||||
|
||||
ghash_loop?:
|
||||
SPLOOPD 6 ; 6*16+7
|
||||
|| MVC B1,ILC
|
||||
|| [B0] SUB B0,1,B0
|
||||
|| ZERO A0
|
||||
|| ADD $x1,$x1,$xib ; SHL $x1,1,$xib
|
||||
|| SHL $x1,1,$xia
|
||||
___
|
||||
|
||||
########____________________________
|
||||
# 0 D2. M1 M2 |
|
||||
# 1 M1 |
|
||||
# 2 M1 M2 |
|
||||
# 3 D1. M1 M2 |
|
||||
# 4 S1. L1 |
|
||||
# 5 S2 S1x L1 D2 L2 |____________________________
|
||||
# 6/0 L1 S1 L2 S2x |D2. M1 M2 |
|
||||
# 7/1 L1 S1 D1x S2 M2 | M1 |
|
||||
# 8/2 S1 L1x S2 | M1 M2 |
|
||||
# 9/3 S1 L1x | D1. M1 M2 |
|
||||
# 10/4 D1x | S1. L1 |
|
||||
# 11/5 |S2 S1x L1 D2 L2 |____________
|
||||
# 12/6/0 D1x __| L1 S1 L2 S2x |D2. ....
|
||||
# 7/1 L1 S1 D1x S2 M2 | ....
|
||||
# 8/2 S1 L1x S2 | ....
|
||||
#####... ................|............
|
||||
$code.=<<___;
|
||||
XORMPY $H0,$xia,$H0x ; 0 ; H·(Xi[i]<<1)
|
||||
|| XORMPY $H01u,$xib,$H01y
|
||||
|| [A0] LDBU *--${xip},$x0
|
||||
XORMPY $H1,$xia,$H1x ; 1
|
||||
XORMPY $H2,$xia,$H2x ; 2
|
||||
|| XORMPY $H2u,$xib,$H2y
|
||||
XORMPY $H3,$xia,$H3x ; 3
|
||||
|| XORMPY $H3u,$xib,$H3y
|
||||
||[!A0] MVK.D 15,A0 ; *--${xip} counter
|
||||
XOR.L $H0x,$Z0,$Z0 ; 4 ; Z^=H·(Xi[i]<<1)
|
||||
|| [A0] SUB.S A0,1,A0
|
||||
XOR.L $H1x,$Z1,$Z1 ; 5
|
||||
|| AND.D $H01y,$FF000000,$H0z
|
||||
|| SWAP2.L $H01y,$H1y ; ; SHL $H01y,16,$H1y
|
||||
|| SHL $x0,1,$xib
|
||||
|| SHL $x0,1,$xia
|
||||
|
||||
XOR.L $H2x,$Z2,$Z2 ; 6/0 ; [0,0] in epilogue
|
||||
|| SHL $Z0,1,$rem ; ; rem=Z<<1
|
||||
|| SHRMB.S $Z1,$Z0,$Z0 ; ; Z>>=8
|
||||
|| AND.L $H1y,$FF000000,$H1z
|
||||
XOR.L $H3x,$Z3,$Z3 ; 7/1
|
||||
|| SHRMB.S $Z2,$Z1,$Z1
|
||||
|| XOR.D $H0z,$Z0,$Z0 ; merge upper byte products
|
||||
|| AND.S $H2y,$FF000000,$H2z
|
||||
|| XORMPY $E10000,$rem,$res ; ; implicit rem&0x1FE
|
||||
XOR.L $H1z,$Z1,$Z1 ; 8/2
|
||||
|| SHRMB.S $Z3,$Z2,$Z2
|
||||
|| AND.S $H3y,$FF000000,$H3z
|
||||
XOR.L $H2z,$Z2,$Z2 ; 9/3
|
||||
|| SHRU $Z3,8,$Z3
|
||||
XOR.D $H3z,$Z3,$Z3 ; 10/4
|
||||
NOP ; 11/5
|
||||
|
||||
SPKERNEL 0,2
|
||||
|| XOR.D $res,$Z3,$Z3 ; 12/6/0; Z^=res
|
||||
|
||||
; input pre-fetch is possible where D1 slot is available...
|
||||
[B0] LDNDW *${inp}[1],$H1x:$H0x ; 8/-
|
||||
[B0] LDNDW *${inp}++[2],$H3x:$H2x ; 9/-
|
||||
NOP ; 10/-
|
||||
.if .LITTLE_ENDIAN
|
||||
SWAP2 $Z0,$Z1 ; 11/-
|
||||
|| SWAP4 $Z1,$Z0
|
||||
SWAP4 $Z1,$Z1 ; 12/-
|
||||
|| SWAP2 $Z0,$Z0
|
||||
SWAP2 $Z2,$Z3
|
||||
|| SWAP4 $Z3,$Z2
|
||||
||[!B0] BNOP RA
|
||||
SWAP4 $Z3,$Z3
|
||||
|| SWAP2 $Z2,$Z2
|
||||
|| [B0] BNOP ghash_loop?
|
||||
[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|
||||
|| [B0] XOR $H1x,$Z1,$Z1
|
||||
[B0] XOR $H2x,$Z2,$Z2
|
||||
|| [B0] XOR $H3x,$Z3,$Z3
|
||||
|| [B0] SHRU $Z1,24,$xia ; Xi[15], avoid cross-path stall
|
||||
STDW $Z1:$Z0,*${xip}[1]
|
||||
|| [B0] SHRU $Z1,16,$x0 ; Xi[14]
|
||||
|| [B0] ZERO $Z1:$Z0
|
||||
.else
|
||||
[!B0] BNOP RA ; 11/-
|
||||
[B0] BNOP ghash_loop? ; 12/-
|
||||
[B0] XOR $H0x,$Z0,$Z0 ; Xi^=inp
|
||||
|| [B0] XOR $H1x,$Z1,$Z1
|
||||
[B0] XOR $H2x,$Z2,$Z2
|
||||
|| [B0] XOR $H3x,$Z3,$Z3
|
||||
|| [B0] MV $Z0,$xia ; Xi[15], avoid cross-path stall
|
||||
STDW $Z1:$Z0,*${xip}[1]
|
||||
|| [B0] SHRU $Z0,8,$x0 ; Xi[14]
|
||||
|| [B0] ZERO $Z1:$Z0
|
||||
.endif
|
||||
STDW $Z3:$Z2,*${xip}[0]
|
||||
|| [B0] ZERO $Z3:$Z2
|
||||
|| [B0] MV $xia,$x1
|
||||
[B0] ADDK 14,${xip}
|
||||
.endasmfunc
|
||||
|
||||
.sect .const
|
||||
.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 4
|
||||
___
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-ia64.pl (vendored, new executable file, 470 lines)
@@ -0,0 +1,470 @@
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor to something else sha1-ia64.pl module processes one
# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
# byte.

# September 2010
#
# It was originally thought that it makes lesser sense to implement
# "528B" variant on Itanium 2 for following reason. Because number of
# functional units is naturally limited, it appeared impossible to
# implement "528B" loop in 4 cycles, only in 5. This would mean that
# theoretically performance improvement couldn't be more than 20%.
# But occasionally you prove yourself wrong:-) I figured out a way to
# fold couple of instructions and having freed yet another instruction
# slot by unrolling the loop... Resulting performance is 4.45 cycles
# per processed byte and 50% better than "256B" version. On original
# Itanium performance should remain the same as the "256B" version,
# i.e. ~8.5 cycles.

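The 256-byte per-key table mentioned above is the 16-entry Htable that gcm_init builds from the hash key H; the assembly here only consumes it (and additionally derives the shared Hshr4 table on the fly). A minimal C sketch of the basic Htable construction, assuming the u128 hi/lo layout used by OpenSSL's gcm128.c; illustrative only, not part of the vendored file.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Multiply by x in GCM's bit-reflected representation: shift right by one
 * and, if a bit fell off, fold in the reduction constant 0xE1 << 120. */
static u128 halve(u128 V)
{
    uint64_t carry = V.lo & 1;
    V.lo = (V.hi << 63) | (V.lo >> 1);
    V.hi = (V.hi >> 1) ^ (carry ? 0xe100000000000000ULL : 0);
    return V;
}

/* Build the 16-entry table used by the 4-bit routines: Htable holds the
 * product of H with each 4-bit value, in the reflected bit order the
 * table-driven code expects. */
static void init_4bit_sketch(u128 Htable[16], u128 H)
{
    int i, j;

    Htable[0].hi = Htable[0].lo = 0;
    Htable[8] = H;
    Htable[4] = H = halve(H);
    Htable[2] = H = halve(H);
    Htable[1] = H = halve(H);
    for (i = 2; i < 16; i <<= 1)        /* remaining entries are XOR sums */
        for (j = 1; j < i; j++) {
            Htable[i + j].hi = Htable[i].hi ^ Htable[j].hi;
            Htable[i + j].lo = Htable[i].lo ^ Htable[j].lo;
        }
}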
$output=pop and (open STDOUT,">$output" or die "can't open $output: $!");
|
||||
|
||||
if ($^O eq "hpux") {
|
||||
$ADDP="addp4";
|
||||
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
|
||||
} else { $ADDP="add"; }
|
||||
for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
|
||||
$big_endian=0 if (/\-DL_ENDIAN/); }
|
||||
if (!defined($big_endian))
|
||||
{ $big_endian=(unpack('L',pack('N',1))==1); }
|
||||
|
||||
sub loop() {
|
||||
my $label=shift;
|
||||
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
|
||||
|
||||
# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
|
||||
# in scalable manner;-) Naturally assuming data in L1 cache...
|
||||
# Special note about 'dep' instruction, which is used to construct
|
||||
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
|
||||
# bytes boundary and lower 7 bits of its address are guaranteed to
|
||||
# be zero.
|
||||
$code.=<<___;
|
||||
$label:
|
||||
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
|
||||
(p19) dep rem=Zlo,rem_4bitp,3,4 }
|
||||
{ .mfi; (p19) xor Zhi=Zhi,Hhi
|
||||
($p17) xor xi[1]=xi[1],in[1] };;
|
||||
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
|
||||
(p19) shrp Zlo=Zhi,Zlo,4 }
|
||||
{ .mfi; (p19) ld8 rem=[rem]
|
||||
(p18) and Hi[1]=mask0xf0,xi[2] };;
|
||||
{ .mmi; ($p16) ld1 in[0]=[inp],-1
|
||||
(p18) xor Zlo=Zlo,Hlo
|
||||
(p19) shr.u Zhi=Zhi,4 }
|
||||
{ .mib; (p19) xor Hhi=Hhi,rem
|
||||
(p18) add Hi[1]=Htbl,Hi[1] };;
|
||||
|
||||
{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
|
||||
(p18) dep rem=Zlo,rem_4bitp,3,4 }
|
||||
{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
|
||||
(p18) xor Zhi=Zhi,Hhi };;
|
||||
{ .mfi; (p18) ld8 Hhi=[Hi[1]]
|
||||
(p18) shrp Zlo=Zhi,Zlo,4 }
|
||||
{ .mfi; (p18) ld8 rem=[rem]
|
||||
(p17) and Hi[0]=mask0xf0,Hi[0] };;
|
||||
{ .mmi; (p16) ld1 xi[0]=[Xi],-1
|
||||
(p18) xor Zlo=Zlo,Hlo
|
||||
(p18) shr.u Zhi=Zhi,4 }
|
||||
{ .mib; (p18) xor Hhi=Hhi,rem
|
||||
(p17) add Hi[0]=Htbl,Hi[0]
|
||||
br.ctop.sptk $label };;
|
||||
___
|
||||
}
|
||||
|
||||
$code=<<___;
|
||||
.explicit
|
||||
.text
|
||||
|
||||
prevfs=r2; prevlc=r3; prevpr=r8;
|
||||
mask0xf0=r21;
|
||||
rem=r22; rem_4bitp=r23;
|
||||
Xi=r24; Htbl=r25;
|
||||
inp=r26; end=r27;
|
||||
Hhi=r28; Hlo=r29;
|
||||
Zhi=r30; Zlo=r31;
|
||||
|
||||
.align 128
|
||||
.skip 16 // aligns loop body
|
||||
.global gcm_gmult_4bit#
|
||||
.proc gcm_gmult_4bit#
|
||||
gcm_gmult_4bit:
|
||||
.prologue
|
||||
{ .mmi; .save ar.pfs,prevfs
|
||||
alloc prevfs=ar.pfs,2,6,0,8
|
||||
$ADDP Xi=15,in0 // &Xi[15]
|
||||
mov rem_4bitp=ip }
|
||||
{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
|
||||
.save ar.lc,prevlc
|
||||
mov prevlc=ar.lc
|
||||
.save pr,prevpr
|
||||
mov prevpr=pr };;
|
||||
|
||||
.body
|
||||
.rotr in[3],xi[3],Hi[2]
|
||||
|
||||
{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
|
||||
mov mask0xf0=0xf0
|
||||
brp.loop.imp .Loop1,.Lend1-16};;
|
||||
{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
|
||||
};;
|
||||
{ .mii; shladd Hi[1]=xi[2],4,r0
|
||||
mov pr.rot=0x7<<16
|
||||
mov ar.lc=13 };;
|
||||
{ .mii; and Hi[1]=mask0xf0,Hi[1]
|
||||
mov ar.ec=3
|
||||
xor Zlo=Zlo,Zlo };;
|
||||
{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
|
||||
add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
|
||||
xor Zhi=Zhi,Zhi };;
|
||||
___
|
||||
&loop (".Loop1",1);
|
||||
$code.=<<___;
|
||||
.Lend1:
|
||||
{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
|
||||
{ .mib; mux1 Zlo=Zlo,\@rev };;
|
||||
{ .mib; mux1 Zhi=Zhi,\@rev };;
|
||||
{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
|
||||
add Hhi=1,Xi };; // pipeline flush on Itanium
|
||||
{ .mib; st8 [Hlo]=Zlo
|
||||
mov pr=prevpr,0x1ffff };;
|
||||
{ .mib; st8 [Hhi]=Zhi
|
||||
mov ar.lc=prevlc
|
||||
br.ret.sptk.many b0 };;
|
||||
.endp gcm_gmult_4bit#
|
||||
___
|
||||
|
||||
######################################################################
|
||||
# "528B" (well, "512B" actually) streamed GHASH
|
||||
#
|
||||
$Xip="in0";
|
||||
$Htbl="in1";
|
||||
$inp="in2";
|
||||
$len="in3";
|
||||
$rem_8bit="loc0";
|
||||
$mask0xff="loc1";
|
||||
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
|
||||
|
||||
sub load_htable() {
|
||||
for (my $i=0;$i<8;$i++) {
|
||||
$code.=<<___;
|
||||
{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
|
||||
ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
|
||||
{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
|
||||
ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
|
||||
___
|
||||
$code.=shift if (($i+$#_)==7);
|
||||
$code.="\t};;\n"
|
||||
}
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
prevsp=r3;
|
||||
|
||||
.align 32
|
||||
.skip 16 // aligns loop body
|
||||
.global gcm_ghash_4bit#
|
||||
.proc gcm_ghash_4bit#
|
||||
gcm_ghash_4bit:
|
||||
.prologue
|
||||
{ .mmi; .save ar.pfs,prevfs
|
||||
alloc prevfs=ar.pfs,4,2,0,0
|
||||
.vframe prevsp
|
||||
mov prevsp=sp
|
||||
mov $rem_8bit=ip };;
|
||||
.body
|
||||
{ .mfi; $ADDP r8=0+0,$Htbl
|
||||
$ADDP r9=0+8,$Htbl }
|
||||
{ .mfi; $ADDP r10=128+0,$Htbl
|
||||
$ADDP r11=128+8,$Htbl };;
|
||||
___
|
||||
&load_htable(
|
||||
" $ADDP $Xip=15,$Xip", # &Xi[15]
|
||||
" $ADDP $len=$len,$inp", # &inp[len]
|
||||
" $ADDP $inp=15,$inp", # &inp[15]
|
||||
" mov $mask0xff=0xff",
|
||||
" add sp=-512,sp",
|
||||
" andcm sp=sp,$mask0xff", # align stack frame
|
||||
" add r14=0,sp",
|
||||
" add r15=8,sp");
|
||||
$code.=<<___;
|
||||
{ .mmi; $sum 1<<1 // go big-endian
|
||||
add r8=256+0,sp
|
||||
add r9=256+8,sp }
|
||||
{ .mmi; add r10=256+128+0,sp
|
||||
add r11=256+128+8,sp
|
||||
add $len=-17,$len };;
|
||||
___
|
||||
for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
|
||||
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
|
||||
$code.=<<___;
|
||||
{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
|
||||
st8 [r9]=$rhi,16 // Htable[$i].hi
|
||||
shrp $rlo=$rhi,$rlo,4 }//;;
|
||||
{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
|
||||
stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
|
||||
shr.u $rhi=$rhi,4 };;
|
||||
{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
|
||||
st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
|
||||
ld8 r17=[r9],16 };; // Htable[8].hi
|
||||
{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
|
||||
ld8 r19=[r9],16 } // Htable[9].hi
|
||||
{ .mmi; rum 1<<5 // clear um.mfh
|
||||
shrp r16=r17,r16,4 };;
|
||||
___
|
||||
for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
|
||||
$code.=<<___;
|
||||
{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
|
||||
ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
|
||||
shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
|
||||
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
|
||||
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
|
||||
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
|
||||
{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
|
||||
st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
|
||||
shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
|
||||
{ .mmi; add $Htbl=256,sp // &Htable[0]
|
||||
add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
|
||||
shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
|
||||
{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
|
||||
st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
|
||||
___
|
||||
|
||||
$in="r15";
|
||||
@xi=("r16","r17");
|
||||
@rem=("r18","r19");
|
||||
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
|
||||
($Atbl,$Btbl)=("r26","r27");
|
||||
|
||||
$code.=<<___; # (p16)
|
||||
{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
|
||||
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
|
||||
cmp.eq p0,p6=r0,r0 };; // clear p6
|
||||
___
|
||||
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
|
||||
|
||||
$code.=<<___; # (p16),(p17)
|
||||
{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
|
||||
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
|
||||
{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
|
||||
dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
|
||||
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
|
||||
.align 32
|
||||
.LOOP:
|
||||
{ .mmi;
|
||||
(p6) st8 [$Xip]=$Zhi,13
|
||||
xor $Zlo=$Zlo,$Zlo
|
||||
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
|
||||
___
|
||||
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
|
||||
|
||||
$code.=<<___; # (p16),(p17),(p18)
|
||||
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
|
||||
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
|
||||
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
|
||||
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
|
||||
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
|
||||
{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
|
||||
xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
|
||||
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
|
||||
ld1 $in=[$inp],-1 } //(p16) *inp--
|
||||
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
|
||||
mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
|
||||
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
|
||||
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
|
||||
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
|
||||
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
|
||||
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
|
||||
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
|
||||
___
|
||||
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
|
||||
|
||||
for ($i=1;$i<14;$i++) {
|
||||
# Above and below fragments are derived from this one by removing
|
||||
# unsuitable (p??) instructions.
|
||||
$code.=<<___; # (p16),(p17),(p18),(p19)
|
||||
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
|
||||
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
|
||||
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
|
||||
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
|
||||
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
|
||||
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
|
||||
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
|
||||
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
|
||||
dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
|
||||
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
|
||||
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
|
||||
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
|
||||
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
|
||||
ld1 $in=[$inp],-1 //(p16) *inp--
|
||||
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
|
||||
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
|
||||
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
|
||||
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
|
||||
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
|
||||
ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
|
||||
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
|
||||
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
|
||||
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
|
||||
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
|
||||
___
|
||||
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
|
||||
}
|
||||
|
||||
$code.=<<___; # (p17),(p18),(p19)
|
||||
{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
|
||||
ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
|
||||
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
|
||||
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
|
||||
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
|
||||
xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
|
||||
{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
|
||||
ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
|
||||
dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
|
||||
{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
|
||||
xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
|
||||
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
|
||||
{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
|
||||
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
|
||||
{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
|
||||
xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
|
||||
and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
|
||||
{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
|
||||
shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
|
||||
{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
|
||||
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
|
||||
add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
|
||||
___
|
||||
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
|
||||
|
||||
$code.=<<___; # (p18),(p19)
|
||||
{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
|
||||
shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
|
||||
{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
|
||||
xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
|
||||
{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
|
||||
xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
|
||||
{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
|
||||
xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
|
||||
{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
|
||||
shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
|
||||
{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
|
||||
xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
|
||||
{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
|
||||
shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
|
||||
{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
|
||||
xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
|
||||
___
|
||||
push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
|
||||
|
||||
$code.=<<___; # (p19)
|
||||
{ .mmi; cmp.ltu p6,p0=$inp,$len
|
||||
add $inp=32,$inp
|
||||
shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
|
||||
{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
|
||||
xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
|
||||
add $Xip=9,$Xip };; // &Xi.lo
|
||||
{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
|
||||
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
|
||||
(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
|
||||
{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
|
||||
(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
|
||||
{ .mmi; st8 [$Xip]=$Zlo,-8
|
||||
(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
|
||||
shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
|
||||
{ .mmi;
|
||||
(p6) ld1 $in=[$inp],-1 //[p16] *inp--
|
||||
xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
|
||||
(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
|
||||
{ .mib;
|
||||
(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
|
||||
(p6) br.cond.dptk.many .LOOP };;
|
||||
|
||||
{ .mib; st8 [$Xip]=$Zhi };;
|
||||
{ .mib; $rum 1<<1 // return to little-endian
|
||||
.restore sp
|
||||
mov sp=prevsp
|
||||
br.ret.sptk.many b0 };;
|
||||
.endp gcm_ghash_4bit#
|
||||
___
|
||||
$code.=<<___;
|
||||
.align 128
|
||||
.type rem_4bit#,\@object
|
||||
rem_4bit:
|
||||
data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
|
||||
data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
|
||||
data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
|
||||
data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
|
||||
.size rem_4bit#,128
|
||||
.type rem_8bit#,\@object
|
||||
rem_8bit:
|
||||
data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
|
||||
data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
|
||||
data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
|
||||
data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
|
||||
data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
|
||||
data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
|
||||
data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
|
||||
data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
|
||||
data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
|
||||
data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
|
||||
data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
|
||||
data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
|
||||
data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
|
||||
data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
|
||||
data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
|
||||
data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
|
||||
data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
|
||||
data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
|
||||
data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
|
||||
data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
|
||||
data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
|
||||
data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
|
||||
data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
|
||||
data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
|
||||
data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
|
||||
data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
|
||||
data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
|
||||
data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
|
||||
data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
|
||||
data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
|
||||
data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
|
||||
data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
|
||||
.size rem_8bit#,512
|
||||
stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
|
||||
print $code;
|
||||
close STDOUT;
|
748
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-parisc.pl
vendored
Normal file
@ -0,0 +1,748 @@
#! /usr/bin/env perl
|
||||
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.

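# Illustrative only, never called by this module: the shared rem_4bit table
# used below can be reproduced with a few lines of Perl.  Entry i is the
# carry-less (GF(2)) product of the 4-bit index with 0xE1<<5, i.e. the
# constant folded in whenever Z is shifted right by four bits; the
# L\$rem_4bit data further down stores these values in the top bits of
# 64-bit words.  The sub name is made up for this sketch.
sub _rem_4bit_reference {
    my @rem_4bit;
    for my $i (0 .. 15) {
        my $acc = 0;
        for my $bit (0 .. 3) {
            $acc ^= (0xE1 << 5) << $bit if ($i >> $bit) & 1;
        }
        push @rem_4bit, $acc;   # 0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, ...
    }
    return @rem_4bit;
}
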
$flavour = shift;
|
||||
$output = shift;
|
||||
open STDOUT,">$output";
|
||||
|
||||
if ($flavour =~ /64/) {
|
||||
$LEVEL ="2.0W";
|
||||
$SIZE_T =8;
|
||||
$FRAME_MARKER =80;
|
||||
$SAVED_RP =16;
|
||||
$PUSH ="std";
|
||||
$PUSHMA ="std,ma";
|
||||
$POP ="ldd";
|
||||
$POPMB ="ldd,mb";
|
||||
$NREGS =6;
|
||||
} else {
|
||||
$LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
|
||||
$SIZE_T =4;
|
||||
$FRAME_MARKER =48;
|
||||
$SAVED_RP =20;
|
||||
$PUSH ="stw";
|
||||
$PUSHMA ="stwm";
|
||||
$POP ="ldw";
|
||||
$POPMB ="ldwm";
|
||||
$NREGS =11;
|
||||
}
|
||||
|
||||
$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
|
||||
# [+ argument transfer]
|
||||
|
||||
################# volatile registers
|
||||
$Xi="%r26"; # argument block
|
||||
$Htbl="%r25";
|
||||
$inp="%r24";
|
||||
$len="%r23";
|
||||
$Hhh=$Htbl; # variables
|
||||
$Hll="%r22";
|
||||
$Zhh="%r21";
|
||||
$Zll="%r20";
|
||||
$cnt="%r19";
|
||||
$rem_4bit="%r28";
|
||||
$rem="%r29";
|
||||
$mask0xf0="%r31";
|
||||
|
||||
################# preserved registers
|
||||
$Thh="%r1";
|
||||
$Tll="%r2";
|
||||
$nlo="%r3";
|
||||
$nhi="%r4";
|
||||
$byte="%r5";
|
||||
if ($SIZE_T==4) {
|
||||
$Zhl="%r6";
|
||||
$Zlh="%r7";
|
||||
$Hhl="%r8";
|
||||
$Hlh="%r9";
|
||||
$Thl="%r10";
|
||||
$Tlh="%r11";
|
||||
}
|
||||
$rem2="%r6"; # used in PA-RISC 2.0 code
|
||||
|
||||
$code.=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
|
||||
.ALIGN 64
|
||||
gcm_gmult_4bit
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
blr %r0,$rem_4bit
|
||||
ldi 3,$rem
|
||||
L\$pic_gmult
|
||||
andcm $rem_4bit,$rem,$rem_4bit
|
||||
addl $inp,$len,$len
|
||||
ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
|
||||
ldi 0xf0,$mask0xf0
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
ldi 31,$rem
|
||||
mtctl $rem,%cr11
|
||||
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
|
||||
b L\$parisc1_gmult
|
||||
nop
|
||||
___
|
||||
|
||||
$code.=<<___;
|
||||
ldb 15($Xi),$nlo
|
||||
ldo 8($Htbl),$Hll
|
||||
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
|
||||
ldd $nlo($Hll),$Zll
|
||||
ldd $nlo($Hhh),$Zhh
|
||||
|
||||
depd,z $Zll,60,4,$rem
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldb 14($Xi),$nlo
|
||||
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
b L\$oop_gmult_pa2
|
||||
ldi 13,$cnt
|
||||
|
||||
.ALIGN 8
|
||||
L\$oop_gmult_pa2
|
||||
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
|
||||
depd,z $Zll,60,4,$rem
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nlo($Hll),$Tll
|
||||
ldd $nlo($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
depd,z $Zll,60,4,$rem
|
||||
ldbx $cnt($Xi),$nlo
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
addib,uv -1,$cnt,L\$oop_gmult_pa2
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
depd,z $Zll,60,4,$rem
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nlo($Hll),$Tll
|
||||
ldd $nlo($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
depd,z $Zll,60,4,$rem
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
std $Zll,8($Xi)
|
||||
std $Zhh,0($Xi)
|
||||
___
|
||||
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
b L\$done_gmult
|
||||
nop
|
||||
|
||||
L\$parisc1_gmult
|
||||
ldb 15($Xi),$nlo
|
||||
ldo 12($Htbl),$Hll
|
||||
ldo 8($Htbl),$Hlh
|
||||
ldo 4($Htbl),$Hhl
|
||||
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
zdep $nlo,27,4,$nlo
|
||||
|
||||
ldwx $nlo($Hll),$Zll
|
||||
ldwx $nlo($Hlh),$Zlh
|
||||
ldwx $nlo($Hhl),$Zhl
|
||||
ldwx $nlo($Hhh),$Zhh
|
||||
zdep $Zll,28,4,$rem
|
||||
ldb 14($Xi),$nlo
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $nhi($Hll),$Tll
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
ldwx $nhi($Hlh),$Tlh
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
ldwx $nhi($Hhl),$Thl
|
||||
extru $Zhh,27,28,$Zhh
|
||||
ldwx $nhi($Hhh),$Thh
|
||||
xor $rem,$Zhh,$Zhh
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
zdep $nlo,27,4,$nlo
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nlo($Hll),$Tll
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nlo($Hlh),$Tlh
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
b L\$oop_gmult_pa1
|
||||
ldi 13,$cnt
|
||||
|
||||
.ALIGN 8
|
||||
L\$oop_gmult_pa1
|
||||
zdep $Zll,28,4,$rem
|
||||
ldwx $nlo($Hhl),$Thl
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $nlo($Hhh),$Thh
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
ldbx $cnt($Xi),$nlo
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nhi($Hll),$Tll
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nhi($Hlh),$Tlh
|
||||
extru $Zhh,27,28,$Zhh
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
ldwx $nhi($Hhl),$Thl
|
||||
xor $rem,$Zhh,$Zhh
|
||||
zdep $Zll,28,4,$rem
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $nhi($Hhh),$Thh
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
extru $Zhh,27,28,$Zhh
|
||||
zdep $nlo,27,4,$nlo
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nlo($Hll),$Tll
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nlo($Hlh),$Tlh
|
||||
xor $rem,$Zhh,$Zhh
|
||||
addib,uv -1,$cnt,L\$oop_gmult_pa1
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
|
||||
zdep $Zll,28,4,$rem
|
||||
ldwx $nlo($Hhl),$Thl
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $nlo($Hhh),$Thh
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nhi($Hll),$Tll
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nhi($Hlh),$Tlh
|
||||
extru $Zhh,27,28,$Zhh
|
||||
xor $rem,$Zhh,$Zhh
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
ldwx $nhi($Hhl),$Thl
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $nhi($Hhh),$Thh
|
||||
zdep $Zll,28,4,$rem
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
extru $Zhh,27,28,$Zhh
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
xor $rem,$Zhh,$Zhh
|
||||
stw $Zll,12($Xi)
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
stw $Zlh,8($Xi)
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
stw $Zhl,4($Xi)
|
||||
stw $Zhh,0($Xi)
|
||||
___
|
||||
$code.=<<___;
|
||||
L\$done_gmult
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
||||
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
||||
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
||||
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
||||
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
|
||||
___
|
||||
$code.=<<___;
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
|
||||
.EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
|
||||
.ALIGN 64
|
||||
gcm_ghash_4bit
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
blr %r0,$rem_4bit
|
||||
ldi 3,$rem
|
||||
L\$pic_ghash
|
||||
andcm $rem_4bit,$rem,$rem_4bit
|
||||
addl $inp,$len,$len
|
||||
ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
|
||||
ldi 0xf0,$mask0xf0
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
ldi 31,$rem
|
||||
mtctl $rem,%cr11
|
||||
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
|
||||
b L\$parisc1_ghash
|
||||
nop
|
||||
___
|
||||
|
||||
$code.=<<___;
|
||||
ldb 15($Xi),$nlo
|
||||
ldo 8($Htbl),$Hll
|
||||
|
||||
L\$outer_ghash_pa2
|
||||
ldb 15($inp),$nhi
|
||||
xor $nhi,$nlo,$nlo
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
|
||||
ldd $nlo($Hll),$Zll
|
||||
ldd $nlo($Hhh),$Zhh
|
||||
|
||||
depd,z $Zll,60,4,$rem
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldb 14($Xi),$nlo
|
||||
ldb 14($inp),$byte
|
||||
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
xor $byte,$nlo,$nlo
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
b L\$oop_ghash_pa2
|
||||
ldi 13,$cnt
|
||||
|
||||
.ALIGN 8
|
||||
L\$oop_ghash_pa2
|
||||
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
|
||||
depd,z $Zll,60,4,$rem2
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nlo($Hll),$Tll
|
||||
ldd $nlo($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldbx $cnt($Xi),$nlo
|
||||
ldbx $cnt($inp),$byte
|
||||
|
||||
depd,z $Zll,60,4,$rem
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
ldd $rem2($rem_4bit),$rem2
|
||||
|
||||
xor $rem2,$Zhh,$Zhh
|
||||
xor $byte,$nlo,$nlo
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
xor $Tll,$Zll,$Zll
|
||||
|
||||
ldd $rem($rem_4bit),$rem
|
||||
addib,uv -1,$cnt,L\$oop_ghash_pa2
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
depd,z $Zll,60,4,$rem2
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nlo($Hll),$Tll
|
||||
ldd $nlo($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
|
||||
depd,z $Zll,60,4,$rem
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
ldd $rem2($rem_4bit),$rem2
|
||||
|
||||
xor $rem2,$Zhh,$Zhh
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
std $Zll,8($Xi)
|
||||
ldo 16($inp),$inp
|
||||
std $Zhh,0($Xi)
|
||||
cmpb,*<> $inp,$len,L\$outer_ghash_pa2
|
||||
copy $Zll,$nlo
|
||||
___
|
||||
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
b L\$done_ghash
|
||||
nop
|
||||
|
||||
L\$parisc1_ghash
|
||||
ldb 15($Xi),$nlo
|
||||
ldo 12($Htbl),$Hll
|
||||
ldo 8($Htbl),$Hlh
|
||||
ldo 4($Htbl),$Hhl
|
||||
|
||||
L\$outer_ghash_pa1
|
||||
ldb 15($inp),$byte
|
||||
xor $byte,$nlo,$nlo
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
zdep $nlo,27,4,$nlo
|
||||
|
||||
ldwx $nlo($Hll),$Zll
|
||||
ldwx $nlo($Hlh),$Zlh
|
||||
ldwx $nlo($Hhl),$Zhl
|
||||
ldwx $nlo($Hhh),$Zhh
|
||||
zdep $Zll,28,4,$rem
|
||||
ldb 14($Xi),$nlo
|
||||
ldb 14($inp),$byte
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $nhi($Hll),$Tll
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
ldwx $nhi($Hlh),$Tlh
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
ldwx $nhi($Hhl),$Thl
|
||||
extru $Zhh,27,28,$Zhh
|
||||
ldwx $nhi($Hhh),$Thh
|
||||
xor $byte,$nlo,$nlo
|
||||
xor $rem,$Zhh,$Zhh
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
zdep $nlo,27,4,$nlo
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nlo($Hll),$Tll
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nlo($Hlh),$Tlh
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
b L\$oop_ghash_pa1
|
||||
ldi 13,$cnt
|
||||
|
||||
.ALIGN 8
|
||||
L\$oop_ghash_pa1
|
||||
zdep $Zll,28,4,$rem
|
||||
ldwx $nlo($Hhl),$Thl
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $nlo($Hhh),$Thh
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
ldbx $cnt($Xi),$nlo
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nhi($Hll),$Tll
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
ldbx $cnt($inp),$byte
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nhi($Hlh),$Tlh
|
||||
extru $Zhh,27,28,$Zhh
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
ldwx $nhi($Hhl),$Thl
|
||||
xor $rem,$Zhh,$Zhh
|
||||
zdep $Zll,28,4,$rem
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $nhi($Hhh),$Thh
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
xor $byte,$nlo,$nlo
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
extru $Zhh,27,28,$Zhh
|
||||
zdep $nlo,27,4,$nlo
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nlo($Hll),$Tll
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nlo($Hlh),$Tlh
|
||||
xor $rem,$Zhh,$Zhh
|
||||
addib,uv -1,$cnt,L\$oop_ghash_pa1
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
|
||||
zdep $Zll,28,4,$rem
|
||||
ldwx $nlo($Hhl),$Thl
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
ldwx $nlo($Hhh),$Thh
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
xor $Tll,$Zll,$Zll
|
||||
ldwx $nhi($Hll),$Tll
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
ldwx $nhi($Hlh),$Tlh
|
||||
extru $Zhh,27,28,$Zhh
|
||||
xor $rem,$Zhh,$Zhh
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
ldwx $nhi($Hhl),$Thl
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldwx $nhi($Hhh),$Thh
|
||||
zdep $Zll,28,4,$rem
|
||||
ldwx $rem($rem_4bit),$rem
|
||||
shrpw $Zlh,$Zll,4,$Zll
|
||||
shrpw $Zhl,$Zlh,4,$Zlh
|
||||
shrpw $Zhh,$Zhl,4,$Zhl
|
||||
extru $Zhh,27,28,$Zhh
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Tlh,$Zlh,$Zlh
|
||||
xor $rem,$Zhh,$Zhh
|
||||
stw $Zll,12($Xi)
|
||||
xor $Thl,$Zhl,$Zhl
|
||||
stw $Zlh,8($Xi)
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
stw $Zhl,4($Xi)
|
||||
ldo 16($inp),$inp
|
||||
stw $Zhh,0($Xi)
|
||||
comb,<> $inp,$len,L\$outer_ghash_pa1
|
||||
copy $Zll,$nlo
|
||||
___
|
||||
$code.=<<___;
|
||||
L\$done_ghash
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
||||
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
||||
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
||||
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
||||
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
|
||||
___
|
||||
$code.=<<___;
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
|
||||
.ALIGN 64
|
||||
L\$rem_4bit
|
||||
.WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
|
||||
.WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
|
||||
.WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
|
||||
.WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
|
||||
.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.ALIGN 64
|
||||
___
|
||||
|
||||
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
|
||||
# that it can be compiled with .LEVEL 1.0. It should be noted that I
|
||||
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
|
||||
# directive...
|
||||
|
||||
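# Worked example (added for illustration, not part of the original module):
# in the 32-bit build, with the register assignments above, the PA-RISC 2.0
# instruction
#	ldd	%r3(%r22),%r20
# is matched by the "format 4" branch of $ldd below and emitted as
#	.WORD	0x0ec300d4	; ldd	%r3(%r22),%r20
# i.e. (0x03<<26)|(22<<21)|(3<<16)|(3<<6)|20, which a .LEVEL 1.0 assembler
# accepts as raw data even though it cannot assemble the mnemonic itself.
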
my $ldd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "ldd$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
|
||||
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
|
||||
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
|
||||
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
|
||||
$opcode|=(1<<5) if ($mod =~ /^,m/);
|
||||
$opcode|=(1<<13) if ($mod =~ /^,mb/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $std = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "std$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
|
||||
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $extrd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "extrd$mod\t$args";
|
||||
|
||||
# I only have ",u" completer, it's implicitly encoded...
|
||||
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
|
||||
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
|
||||
my $len=32-$3;
|
||||
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
|
||||
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
|
||||
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
|
||||
my $len=32-$2;
|
||||
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
|
||||
$opcode |= (1<<13) if ($mod =~ /,\**=/);
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $shrpd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "shrpd$mod\t$args";
|
||||
|
||||
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
|
||||
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
|
||||
my $cpos=63-$3;
|
||||
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
|
||||
{ sprintf "\t.WORD\t0x%08x\t; %s",
|
||||
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
my $depd = sub {
|
||||
my ($mod,$args) = @_;
|
||||
my $orig = "depd$mod\t$args";
|
||||
|
||||
# I only have ",z" completer, it's implicitly encoded...
|
||||
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
|
||||
{ my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
|
||||
my $cpos=63-$2;
|
||||
my $len=32-$3;
|
||||
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
|
||||
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
|
||||
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
|
||||
}
|
||||
else { "\t".$orig; }
|
||||
};
|
||||
|
||||
sub assemble {
|
||||
my ($mnemonic,$mod,$args)=@_;
|
||||
my $opcode = eval("\$$mnemonic");
|
||||
|
||||
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
|
||||
}
|
||||
|
||||
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
|
||||
=~ /GNU assembler/) {
|
||||
$gnuas = 1;
|
||||
}
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
if ($SIZE_T==4) {
|
||||
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
|
||||
s/cmpb,\*/comb,/;
|
||||
s/,\*/,/;
|
||||
}
|
||||
|
||||
s/(\.LEVEL\s+2\.0)W/$1w/ if ($gnuas && $SIZE_T==8);
|
||||
s/\.SPACE\s+\$TEXT\$/.text/ if ($gnuas && $SIZE_T==8);
|
||||
s/\.SUBSPA.*// if ($gnuas && $SIZE_T==8);
|
||||
s/\bbv\b/bve/ if ($SIZE_T==8);
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT;
|
262
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-s390x.pl
vendored
Normal file
@ -0,0 +1,262 @@
#! /usr/bin/env perl
|
||||
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# September 2010.
|
||||
#
|
||||
# The module implements "4-bit" GCM GHASH function and underlying
|
||||
# single multiplication operation in GF(2^128). "4-bit" means that it
|
||||
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
|
||||
# was measured to be ~18 cycles per processed byte on z10, which is
|
||||
# almost 40% better than gcc-generated code. It should be noted that
|
||||
# 18 cycles is worse result than expected: loop is scheduled for 12
|
||||
# and the result should be close to 12. In the lack of instruction-
|
||||
# level profiling data it's impossible to tell why...
|
||||
|
||||
# November 2010.
|
||||
#
|
||||
# Adapt for -m31 build. If kernel supports what's called "highgprs"
|
||||
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
|
||||
# instructions and achieve "64-bit" performance even in 31-bit legacy
|
||||
# application context. The feature is not specific to any particular
|
||||
# processor, as long as it's "z-CPU". Latter implies that the code
|
||||
# remains z/Architecture specific. On z990 it was measured to perform
|
||||
# 2.8x better than 32-bit code generated by gcc 4.3.
|
||||
|
||||
# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce correct
# results and is therefore engaged. On z196 it was measured to process
# an 8KB buffer ~7x faster than the software implementation. It's not as
# impressive for smaller buffer sizes, and for the smallest 16-byte buffer
# it's actually almost 2 times slower, which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.

$flavour = shift;
|
||||
|
||||
if ($flavour =~ /3[12]/) {
|
||||
$SIZE_T=4;
|
||||
$g="";
|
||||
} else {
|
||||
$SIZE_T=8;
|
||||
$g="g";
|
||||
}
|
||||
|
||||
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
$softonly=0;
|
||||
|
||||
$Zhi="%r0";
|
||||
$Zlo="%r1";
|
||||
|
||||
$Xi="%r2"; # argument block
|
||||
$Htbl="%r3";
|
||||
$inp="%r4";
|
||||
$len="%r5";
|
||||
|
||||
$rem0="%r6"; # variables
|
||||
$rem1="%r7";
|
||||
$nlo="%r8";
|
||||
$nhi="%r9";
|
||||
$xi="%r10";
|
||||
$cnt="%r11";
|
||||
$tmp="%r12";
|
||||
$x78="%r13";
|
||||
$rem_4bit="%r14";
|
||||
|
||||
$sp="%r15";
|
||||
|
||||
$code.=<<___;
|
||||
#include "s390x_arch.h"
|
||||
|
||||
.text
|
||||
|
||||
.globl gcm_gmult_4bit
|
||||
.align 32
|
||||
gcm_gmult_4bit:
|
||||
___
|
||||
$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
|
||||
larl %r1,OPENSSL_s390xcap_P
|
||||
lghi %r0,0
|
||||
lg %r1,S390X_KIMD+8(%r1) # load second word of kimd capabilities
|
||||
# vector
|
||||
tmhh %r1,0x4000 # check for function 65
|
||||
jz .Lsoft_gmult
|
||||
stg %r0,16($sp) # arrange 16 bytes of zero input
|
||||
stg %r0,24($sp)
|
||||
lghi %r0,S390X_GHASH # function 65
|
||||
la %r1,0($Xi) # H lies right after Xi in gcm128_context
|
||||
la $inp,16($sp)
|
||||
lghi $len,16
|
||||
.long 0xb93e0004 # kimd %r0,$inp
|
||||
brc 1,.-4 # pay attention to "partial completion"
|
||||
br %r14
|
||||
.align 32
|
||||
.Lsoft_gmult:
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} %r6,%r14,6*$SIZE_T($sp)
|
||||
|
||||
aghi $Xi,-1
|
||||
lghi $len,1
|
||||
lghi $x78,`0xf<<3`
|
||||
larl $rem_4bit,rem_4bit
|
||||
|
||||
lg $Zlo,8+1($Xi) # Xi
|
||||
j .Lgmult_shortcut
|
||||
.type gcm_gmult_4bit,\@function
|
||||
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
|
||||
|
||||
.globl gcm_ghash_4bit
|
||||
.align 32
|
||||
gcm_ghash_4bit:
|
||||
___
|
||||
$code.=<<___ if(!$softonly);
|
||||
larl %r1,OPENSSL_s390xcap_P
|
||||
lg %r0,S390X_KIMD+8(%r1) # load second word of kimd capabilities
|
||||
# vector
|
||||
tmhh %r0,0x4000 # check for function 65
|
||||
jz .Lsoft_ghash
|
||||
lghi %r0,S390X_GHASH # function 65
|
||||
la %r1,0($Xi) # H lies right after Xi in gcm128_context
|
||||
.long 0xb93e0004 # kimd %r0,$inp
|
||||
brc 1,.-4 # pay attention to "partial completion"
|
||||
br %r14
|
||||
.align 32
|
||||
.Lsoft_ghash:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /3[12]/);
|
||||
llgfr $len,$len
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} %r6,%r14,6*$SIZE_T($sp)
|
||||
|
||||
aghi $Xi,-1
|
||||
srlg $len,$len,4
|
||||
lghi $x78,`0xf<<3`
|
||||
larl $rem_4bit,rem_4bit
|
||||
|
||||
lg $Zlo,8+1($Xi) # Xi
|
||||
lg $Zhi,0+1($Xi)
|
||||
lghi $tmp,0
|
||||
.Louter:
|
||||
xg $Zhi,0($inp) # Xi ^= inp
|
||||
xg $Zlo,8($inp)
|
||||
xgr $Zhi,$tmp
|
||||
stg $Zlo,8+1($Xi)
|
||||
stg $Zhi,0+1($Xi)
|
||||
|
||||
.Lgmult_shortcut:
|
||||
lghi $tmp,0xf0
|
||||
sllg $nlo,$Zlo,4
|
||||
srlg $xi,$Zlo,8 # extract second byte
|
||||
ngr $nlo,$tmp
|
||||
lgr $nhi,$Zlo
|
||||
lghi $cnt,14
|
||||
ngr $nhi,$tmp
|
||||
|
||||
lg $Zlo,8($nlo,$Htbl)
|
||||
lg $Zhi,0($nlo,$Htbl)
|
||||
|
||||
sllg $nlo,$xi,4
|
||||
sllg $rem0,$Zlo,3
|
||||
ngr $nlo,$tmp
|
||||
ngr $rem0,$x78
|
||||
ngr $xi,$tmp
|
||||
|
||||
sllg $tmp,$Zhi,60
|
||||
srlg $Zlo,$Zlo,4
|
||||
srlg $Zhi,$Zhi,4
|
||||
xg $Zlo,8($nhi,$Htbl)
|
||||
xg $Zhi,0($nhi,$Htbl)
|
||||
lgr $nhi,$xi
|
||||
sllg $rem1,$Zlo,3
|
||||
xgr $Zlo,$tmp
|
||||
ngr $rem1,$x78
|
||||
sllg $tmp,$Zhi,60
|
||||
j .Lghash_inner
|
||||
.align 16
|
||||
.Lghash_inner:
|
||||
srlg $Zlo,$Zlo,4
|
||||
srlg $Zhi,$Zhi,4
|
||||
xg $Zlo,8($nlo,$Htbl)
|
||||
llgc $xi,0($cnt,$Xi)
|
||||
xg $Zhi,0($nlo,$Htbl)
|
||||
sllg $nlo,$xi,4
|
||||
xg $Zhi,0($rem0,$rem_4bit)
|
||||
nill $nlo,0xf0
|
||||
sllg $rem0,$Zlo,3
|
||||
xgr $Zlo,$tmp
|
||||
ngr $rem0,$x78
|
||||
nill $xi,0xf0
|
||||
|
||||
sllg $tmp,$Zhi,60
|
||||
srlg $Zlo,$Zlo,4
|
||||
srlg $Zhi,$Zhi,4
|
||||
xg $Zlo,8($nhi,$Htbl)
|
||||
xg $Zhi,0($nhi,$Htbl)
|
||||
lgr $nhi,$xi
|
||||
xg $Zhi,0($rem1,$rem_4bit)
|
||||
sllg $rem1,$Zlo,3
|
||||
xgr $Zlo,$tmp
|
||||
ngr $rem1,$x78
|
||||
sllg $tmp,$Zhi,60
|
||||
brct $cnt,.Lghash_inner
|
||||
|
||||
srlg $Zlo,$Zlo,4
|
||||
srlg $Zhi,$Zhi,4
|
||||
xg $Zlo,8($nlo,$Htbl)
|
||||
xg $Zhi,0($nlo,$Htbl)
|
||||
sllg $xi,$Zlo,3
|
||||
xg $Zhi,0($rem0,$rem_4bit)
|
||||
xgr $Zlo,$tmp
|
||||
ngr $xi,$x78
|
||||
|
||||
sllg $tmp,$Zhi,60
|
||||
srlg $Zlo,$Zlo,4
|
||||
srlg $Zhi,$Zhi,4
|
||||
xg $Zlo,8($nhi,$Htbl)
|
||||
xg $Zhi,0($nhi,$Htbl)
|
||||
xgr $Zlo,$tmp
|
||||
xg $Zhi,0($rem1,$rem_4bit)
|
||||
|
||||
lg $tmp,0($xi,$rem_4bit)
|
||||
la $inp,16($inp)
|
||||
sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
|
||||
brctg $len,.Louter
|
||||
|
||||
xgr $Zhi,$tmp
|
||||
stg $Zlo,8+1($Xi)
|
||||
stg $Zhi,0+1($Xi)
|
||||
lm${g} %r6,%r14,6*$SIZE_T($sp)
|
||||
br %r14
|
||||
.type gcm_ghash_4bit,\@function
|
||||
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
|
||||
|
||||
.align 64
|
||||
rem_4bit:
|
||||
.long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
|
||||
.long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
|
||||
.long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
|
||||
.long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
|
||||
.type rem_4bit,\@object
|
||||
.size rem_4bit,(.-rem_4bit)
|
||||
.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
print $code;
|
||||
close STDOUT;
|
581
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-sparcv9.pl
vendored
Normal file
@ -0,0 +1,581 @@
#! /usr/bin/env perl
|
||||
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# March 2010
|
||||
#
|
||||
# The module implements "4-bit" GCM GHASH function and underlying
|
||||
# single multiplication operation in GF(2^128). "4-bit" means that it
|
||||
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
|
||||
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
|
||||
# and are expressed in cycles per processed byte, less is better:
|
||||
#
|
||||
# gcc 3.3.x cc 5.2 this assembler
|
||||
#
|
||||
# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
|
||||
# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
|
||||
#
|
||||
# Here is data collected on UltraSPARC T1 system running Linux:
|
||||
#
|
||||
# gcc 4.4.1 this assembler
|
||||
#
|
||||
# 32-bit build 566 50 (+1000%)
|
||||
# 64-bit build 56 50 (+12%)
|
||||
#
|
||||
# I don't quite understand why difference between 32-bit and 64-bit
|
||||
# compiler-generated code is so big. Compilers *were* instructed to
|
||||
# generate code for UltraSPARC and should have used 64-bit registers
|
||||
# for Z vector (see C code) even in 32-bit build... Oh well, it only
|
||||
# means more impressive improvement coefficients for this assembler
|
||||
# module;-) Loops are aggressively modulo-scheduled in respect to
|
||||
# references to input data and Z.hi updates to achieve 12 cycles
|
||||
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
|
||||
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
|
||||
#
|
||||
# October 2012
|
||||
#
|
||||
# Add VIS3 lookup-table-free implementation using polynomial
|
||||
# multiplication xmulx[hi] and extended addition addxc[cc]
|
||||
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
|
||||
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
|
||||
# saturates at ~15.5x single-process result on 8-core processor,
|
||||
# or ~20.5GBps per 2.85GHz socket.
|
||||
|
||||
$output=pop;
|
||||
open STDOUT,">$output";
|
||||
|
||||
$frame="STACK_FRAME";
|
||||
$bias="STACK_BIAS";
|
||||
|
||||
$Zhi="%o0"; # 64-bit values
|
||||
$Zlo="%o1";
|
||||
$Thi="%o2";
|
||||
$Tlo="%o3";
|
||||
$rem="%o4";
|
||||
$tmp="%o5";
|
||||
|
||||
$nhi="%l0"; # small values and pointers
|
||||
$nlo="%l1";
|
||||
$xi0="%l2";
|
||||
$xi1="%l3";
|
||||
$rem_4bit="%l4";
|
||||
$remi="%l5";
|
||||
$Htblo="%l6";
|
||||
$cnt="%l7";
|
||||
|
||||
$Xi="%i0"; # input argument block
|
||||
$Htbl="%i1";
|
||||
$inp="%i2";
|
||||
$len="%i3";
|
||||
|
||||
$code.=<<___;
|
||||
#include "sparc_arch.h"
|
||||
|
||||
#ifdef __arch64__
|
||||
.register %g2,#scratch
|
||||
.register %g3,#scratch
|
||||
#endif
|
||||
|
||||
.section ".text",#alloc,#execinstr
|
||||
|
||||
.align 64
|
||||
rem_4bit:
|
||||
.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
|
||||
.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
|
||||
.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
|
||||
.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
|
||||
.type rem_4bit,#object
|
||||
.size rem_4bit,(.-rem_4bit)
|
||||
|
||||
.globl gcm_ghash_4bit
|
||||
.align 32
|
||||
gcm_ghash_4bit:
|
||||
save %sp,-$frame,%sp
|
||||
ldub [$inp+15],$nlo
|
||||
ldub [$Xi+15],$xi0
|
||||
ldub [$Xi+14],$xi1
|
||||
add $len,$inp,$len
|
||||
add $Htbl,8,$Htblo
|
||||
|
||||
1: call .+8
|
||||
add %o7,rem_4bit-1b,$rem_4bit
|
||||
|
||||
.Louter:
|
||||
xor $xi0,$nlo,$nlo
|
||||
and $nlo,0xf0,$nhi
|
||||
and $nlo,0x0f,$nlo
|
||||
sll $nlo,4,$nlo
|
||||
ldx [$Htblo+$nlo],$Zlo
|
||||
ldx [$Htbl+$nlo],$Zhi
|
||||
|
||||
ldub [$inp+14],$nlo
|
||||
|
||||
ldx [$Htblo+$nhi],$Tlo
|
||||
and $Zlo,0xf,$remi
|
||||
ldx [$Htbl+$nhi],$Thi
|
||||
sll $remi,3,$remi
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
srlx $Zlo,4,$Zlo
|
||||
mov 13,$cnt
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
|
||||
xor $xi1,$nlo,$nlo
|
||||
and $Zlo,0xf,$remi
|
||||
and $nlo,0xf0,$nhi
|
||||
and $nlo,0x0f,$nlo
|
||||
ba .Lghash_inner
|
||||
sll $nlo,4,$nlo
|
||||
.align 32
|
||||
.Lghash_inner:
|
||||
ldx [$Htblo+$nlo],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nlo],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
ldub [$inp+$cnt],$nlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
ldub [$Xi+$cnt],$xi1
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
and $Zlo,0xf,$remi
|
||||
|
||||
ldx [$Htblo+$nhi],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nhi],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $xi1,$nlo,$nlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
and $nlo,0xf0,$nhi
|
||||
addcc $cnt,-1,$cnt
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
and $nlo,0x0f,$nlo
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
sll $nlo,4,$nlo
|
||||
blu .Lghash_inner
|
||||
and $Zlo,0xf,$remi
|
||||
|
||||
ldx [$Htblo+$nlo],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nlo],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
|
||||
add $inp,16,$inp
|
||||
cmp $inp,$len
|
||||
be,pn SIZE_T_CC,.Ldone
|
||||
and $Zlo,0xf,$remi
|
||||
|
||||
ldx [$Htblo+$nhi],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nhi],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
ldub [$inp+15],$nlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
stx $Zlo,[$Xi+8]
|
||||
xor $rem,$Zhi,$Zhi
|
||||
stx $Zhi,[$Xi]
|
||||
srl $Zlo,8,$xi1
|
||||
and $Zlo,0xff,$xi0
|
||||
ba .Louter
|
||||
and $xi1,0xff,$xi1
|
||||
.align 32
|
||||
.Ldone:
|
||||
ldx [$Htblo+$nhi],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nhi],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
stx $Zlo,[$Xi+8]
|
||||
xor $rem,$Zhi,$Zhi
|
||||
stx $Zhi,[$Xi]
|
||||
|
||||
ret
|
||||
restore
|
||||
.type gcm_ghash_4bit,#function
|
||||
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
|
||||
___
|
||||
|
||||
undef $inp;
|
||||
undef $len;
|
||||
|
||||
$code.=<<___;
|
||||
.globl gcm_gmult_4bit
|
||||
.align 32
|
||||
gcm_gmult_4bit:
|
||||
save %sp,-$frame,%sp
|
||||
ldub [$Xi+15],$nlo
|
||||
add $Htbl,8,$Htblo
|
||||
|
||||
1: call .+8
|
||||
add %o7,rem_4bit-1b,$rem_4bit
|
||||
|
||||
and $nlo,0xf0,$nhi
|
||||
and $nlo,0x0f,$nlo
|
||||
sll $nlo,4,$nlo
|
||||
ldx [$Htblo+$nlo],$Zlo
|
||||
ldx [$Htbl+$nlo],$Zhi
|
||||
|
||||
ldub [$Xi+14],$nlo
|
||||
|
||||
ldx [$Htblo+$nhi],$Tlo
|
||||
and $Zlo,0xf,$remi
|
||||
ldx [$Htbl+$nhi],$Thi
|
||||
sll $remi,3,$remi
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
srlx $Zlo,4,$Zlo
|
||||
mov 13,$cnt
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
|
||||
and $Zlo,0xf,$remi
|
||||
and $nlo,0xf0,$nhi
|
||||
and $nlo,0x0f,$nlo
|
||||
ba .Lgmult_inner
|
||||
sll $nlo,4,$nlo
|
||||
.align 32
|
||||
.Lgmult_inner:
|
||||
ldx [$Htblo+$nlo],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nlo],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
ldub [$Xi+$cnt],$nlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
and $Zlo,0xf,$remi
|
||||
|
||||
ldx [$Htblo+$nhi],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nhi],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
srlx $Zhi,4,$Zhi
|
||||
and $nlo,0xf0,$nhi
|
||||
addcc $cnt,-1,$cnt
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
and $nlo,0x0f,$nlo
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
sll $nlo,4,$nlo
|
||||
blu .Lgmult_inner
|
||||
and $Zlo,0xf,$remi
|
||||
|
||||
ldx [$Htblo+$nlo],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nlo],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
and $Zlo,0xf,$remi
|
||||
|
||||
ldx [$Htblo+$nhi],$Tlo
|
||||
sll $remi,3,$remi
|
||||
xor $rem,$Zhi,$Zhi
|
||||
ldx [$Htbl+$nhi],$Thi
|
||||
srlx $Zlo,4,$Zlo
|
||||
ldx [$rem_4bit+$remi],$rem
|
||||
sllx $Zhi,60,$tmp
|
||||
xor $Tlo,$Zlo,$Zlo
|
||||
srlx $Zhi,4,$Zhi
|
||||
xor $Zlo,$tmp,$Zlo
|
||||
xor $Thi,$Zhi,$Zhi
|
||||
stx $Zlo,[$Xi+8]
|
||||
xor $rem,$Zhi,$Zhi
|
||||
stx $Zhi,[$Xi]
|
||||
|
||||
ret
|
||||
restore
|
||||
.type gcm_gmult_4bit,#function
|
||||
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
|
||||
___
|
||||
|
||||
{{{
|
||||
# Straightforward 128x128-bit multiplication using the Karatsuba algorithm,
# followed by a pair of 64-bit reductions [with a shortcut in the first one,
# which allowed breaking the dependency between the reductions and removing
# one multiplication from the critical path]. While it might be suboptimal
# with regard to the sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in a 32-bit application context.

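# The sketch below is purely illustrative and is never called by this module:
# it restates the Karatsuba decomposition described above in plain Perl, with
# _clmul64() standing in for the xmulx/xmulxhi carry-less multiply
# instructions (the sub names are made up for this sketch, a 64-bit perl is
# assumed, and the final reduction modulo the GHASH polynomial is left out).
sub _clmul64 {                          # carry-less 64x64 -> 128-bit multiply
    my ($a, $b) = @_;
    my ($hi, $lo) = (0, 0);
    my ($ahi, $alo) = (0, $a);          # 128-bit shifting copy of $a
    for my $i (0 .. 63) {
        if (($b >> $i) & 1) {
            $hi ^= $ahi;
            $lo ^= $alo;
        }
        $ahi = (($ahi << 1) | (($alo >> 63) & 1)) & 0xffffffffffffffff;
        $alo = ($alo << 1) & 0xffffffffffffffff;
    }
    return ($hi, $lo);
}

sub _karatsuba128 {                     # (Xhi:Xlo)x(Hhi:Hlo) -> 256-bit product
    my ($xhi, $xlo, $hhi, $hlo) = @_;
    my ($c1, $c0) = _clmul64($xlo, $hlo);               # low product
    my ($c3, $c2) = _clmul64($xhi, $hhi);               # high product
    my ($mh, $ml) = _clmul64($xlo ^ $xhi, $hlo ^ $hhi); # middle product
    $mh ^= $c1 ^ $c3;                   # Karatsuba post-processing:
    $ml ^= $c0 ^ $c2;                   # middle -= low + high (XOR in GF(2))
    return ($c3, $c2 ^ $mh, $c1 ^ $ml, $c0);    # four 64-bit words, MSW first
}
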
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
|
||||
|
||||
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
|
||||
(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
|
||||
|
||||
($shl,$shr)=map("%l$_",(0..7));
|
||||
|
||||
# For details regarding "twisted H" see ghash-x86.pl.
|
||||
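# Illustrative only (never called): a plain-Perl restatement of the
# "twisted H" set-up performed by gcm_init_vis3 below.  H is shifted left by
# one bit as a 128-bit quantity and, if a bit falls off the top, 1 and
# 0xC200000000000000 (i.e. 0xE1<<57 truncated to 64 bits) are folded back in.
# The sub name is made up for this sketch and a 64-bit perl is assumed.
sub _twisted_H {
    my ($hhi, $hlo) = @_;                        # H as two 64-bit halves
    my $carry = ($hhi >> 63) & 1;                # bit that falls out of H<<=1
    $hhi = (($hhi << 1) | (($hlo >> 63) & 1)) & 0xffffffffffffffff;
    $hlo = ($hlo << 1) & 0xffffffffffffffff;
    if ($carry) {
        $hlo ^= 1;
        $hhi ^= 0xC200000000000000;              # cf. "57 is not a typo" below
    }
    return ($hhi, $hlo);                         # what gcm_init_vis3 stores
}
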
$code.=<<___;
|
||||
.globl gcm_init_vis3
|
||||
.align 32
|
||||
gcm_init_vis3:
|
||||
save %sp,-$frame,%sp
|
||||
|
||||
ldx [%i1+0],$Hhi
|
||||
ldx [%i1+8],$Hlo
|
||||
mov 0xE1,$Xhi
|
||||
mov 1,$Xlo
|
||||
sllx $Xhi,57,$Xhi
|
||||
srax $Hhi,63,$C0 ! broadcast carry
|
||||
addcc $Hlo,$Hlo,$Hlo ! H<<=1
|
||||
addxc $Hhi,$Hhi,$Hhi
|
||||
and $C0,$Xlo,$Xlo
|
||||
and $C0,$Xhi,$Xhi
|
||||
xor $Xlo,$Hlo,$Hlo
|
||||
xor $Xhi,$Hhi,$Hhi
|
||||
stx $Hlo,[%i0+8] ! save twisted H
|
||||
stx $Hhi,[%i0+0]
|
||||
|
||||
sethi %hi(0xA0406080),$V
|
||||
sethi %hi(0x20C0E000),%l0
|
||||
or $V,%lo(0xA0406080),$V
|
||||
or %l0,%lo(0x20C0E000),%l0
|
||||
sllx $V,32,$V
|
||||
or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
|
||||
stx $V,[%i0+16]
|
||||
|
||||
ret
|
||||
restore
|
||||
.type gcm_init_vis3,#function
|
||||
.size gcm_init_vis3,.-gcm_init_vis3
|
||||
|
||||
.globl gcm_gmult_vis3
|
||||
.align 32
|
||||
gcm_gmult_vis3:
|
||||
save %sp,-$frame,%sp
|
||||
|
||||
ldx [$Xip+8],$Xlo ! load Xi
|
||||
ldx [$Xip+0],$Xhi
|
||||
ldx [$Htable+8],$Hlo ! load twisted H
|
||||
ldx [$Htable+0],$Hhi
|
||||
|
||||
mov 0xE1,%l7
|
||||
sllx %l7,57,$xE1 ! 57 is not a typo
|
||||
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
|
||||
|
||||
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
|
||||
xmulx $Xlo,$Hlo,$C0
|
||||
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
|
||||
xmulx $C2,$Hhl,$C1
|
||||
xmulxhi $Xlo,$Hlo,$Xlo
|
||||
xmulxhi $C2,$Hhl,$C2
|
||||
xmulxhi $Xhi,$Hhi,$C3
|
||||
xmulx $Xhi,$Hhi,$Xhi
|
||||
|
||||
sll $C0,3,$sqr
|
||||
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
|
||||
xor $C0,$sqr,$sqr
|
||||
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
|
||||
|
||||
xor $C0,$C1,$C1 ! Karatsuba post-processing
|
||||
xor $Xlo,$C2,$C2
|
||||
xor $sqr,$Xlo,$Xlo ! real destination is $C1
|
||||
xor $C3,$C2,$C2
|
||||
xor $Xlo,$C1,$C1
|
||||
xor $Xhi,$C2,$C2
|
||||
xor $Xhi,$C1,$C1
|
||||
|
||||
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
|
||||
xor $C0,$C2,$C2
|
||||
xmulx $C1,$xE1,$C0
|
||||
xor $C1,$C3,$C3
|
||||
xmulxhi $C1,$xE1,$C1
|
||||
|
||||
xor $Xlo,$C2,$C2
|
||||
xor $C0,$C2,$C2
|
||||
xor $C1,$C3,$C3
|
||||
|
||||
stx $C2,[$Xip+8] ! save Xi
|
||||
stx $C3,[$Xip+0]
|
||||
|
||||
ret
|
||||
restore
|
||||
.type gcm_gmult_vis3,#function
|
||||
.size gcm_gmult_vis3,.-gcm_gmult_vis3
|
||||
|
||||
.globl gcm_ghash_vis3
|
||||
.align 32
|
||||
gcm_ghash_vis3:
|
||||
save %sp,-$frame,%sp
|
||||
nop
|
||||
srln $len,0,$len ! needed on v8+, "nop" on v9
|
||||
|
||||
ldx [$Xip+8],$C2 ! load Xi
|
||||
ldx [$Xip+0],$C3
|
||||
ldx [$Htable+8],$Hlo ! load twisted H
|
||||
ldx [$Htable+0],$Hhi
|
||||
|
||||
mov 0xE1,%l7
|
||||
sllx %l7,57,$xE1 ! 57 is not a typo
|
||||
ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
|
||||
|
||||
and $inp,7,$shl
|
||||
andn $inp,7,$inp
|
||||
sll $shl,3,$shl
|
||||
prefetch [$inp+63], 20
|
||||
sub %g0,$shl,$shr
|
||||
|
||||
xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
|
||||
.Loop:
|
||||
ldx [$inp+8],$Xlo
|
||||
brz,pt $shl,1f
|
||||
ldx [$inp+0],$Xhi
|
||||
|
||||
ldx [$inp+16],$C1 ! align data
|
||||
srlx $Xlo,$shr,$C0
|
||||
sllx $Xlo,$shl,$Xlo
|
||||
sllx $Xhi,$shl,$Xhi
|
||||
srlx $C1,$shr,$C1
|
||||
or $C0,$Xhi,$Xhi
|
||||
or $C1,$Xlo,$Xlo
|
||||
1:
|
||||
add $inp,16,$inp
|
||||
sub $len,16,$len
|
||||
xor $C2,$Xlo,$Xlo
|
||||
xor $C3,$Xhi,$Xhi
|
||||
prefetch [$inp+63], 20
|
||||
|
||||
xmulx $Xlo,$Hlo,$C0
|
||||
xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
|
||||
xmulx $C2,$Hhl,$C1
|
||||
xmulxhi $Xlo,$Hlo,$Xlo
|
||||
xmulxhi $C2,$Hhl,$C2
|
||||
xmulxhi $Xhi,$Hhi,$C3
|
||||
xmulx $Xhi,$Hhi,$Xhi
|
||||
|
||||
sll $C0,3,$sqr
|
||||
srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
|
||||
xor $C0,$sqr,$sqr
|
||||
sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
|
||||
|
||||
xor $C0,$C1,$C1 ! Karatsuba post-processing
|
||||
xor $Xlo,$C2,$C2
|
||||
xor $sqr,$Xlo,$Xlo ! real destination is $C1
|
||||
xor $C3,$C2,$C2
|
||||
xor $Xlo,$C1,$C1
|
||||
xor $Xhi,$C2,$C2
|
||||
xor $Xhi,$C1,$C1
|
||||
|
||||
xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
|
||||
xor $C0,$C2,$C2
|
||||
xmulx $C1,$xE1,$C0
|
||||
xor $C1,$C3,$C3
|
||||
xmulxhi $C1,$xE1,$C1
|
||||
|
||||
xor $Xlo,$C2,$C2
|
||||
xor $C0,$C2,$C2
|
||||
brnz,pt $len,.Loop
|
||||
xor $C1,$C3,$C3
|
||||
|
||||
stx $C2,[$Xip+8] ! save Xi
|
||||
stx $C3,[$Xip+0]
|
||||
|
||||
ret
|
||||
restore
|
||||
.type gcm_ghash_vis3,#function
|
||||
.size gcm_ghash_vis3,.-gcm_ghash_vis3
|
||||
___
|
||||
}}}
|
||||
$code.=<<___;
|
||||
.asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 4
|
||||
___
|
||||
|
||||
|
||||
# Purpose of these subroutines is to explicitly encode VIS instructions,
|
||||
# so that one can compile the module without having to specify VIS
|
||||
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
|
||||
# Idea is to reserve for option to produce "universal" binary and let
|
||||
# programmer detect if current CPU is VIS capable at run-time.
|
||||
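# Worked example (added for illustration, not part of the original module):
# with the register mapping above, the VIS3 instruction
#	xmulx	%o3,%o1,%g1
# ($Xlo,$Hlo -> $C0) is rewritten by unvis3() below into
#	.word	0x83b2e2a9 !xmulx	%o3,%o1,%g1
# i.e. 0x81b00000|(1<<25)|(11<<14)|(0x115<<5)|9, so no VIS-specific
# assembler flags are needed at build time.
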
sub unvis3 {
|
||||
my ($mnemonic,$rs1,$rs2,$rd)=@_;
|
||||
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
|
||||
my ($ref,$opf);
|
||||
my %visopf = ( "addxc" => 0x011,
|
||||
"addxccc" => 0x013,
|
||||
"xmulx" => 0x115,
|
||||
"xmulxhi" => 0x116 );
|
||||
|
||||
$ref = "$mnemonic\t$rs1,$rs2,$rd";
|
||||
|
||||
if ($opf=$visopf{$mnemonic}) {
|
||||
foreach ($rs1,$rs2,$rd) {
|
||||
return $ref if (!/%([goli])([0-9])/);
|
||||
$_=$bias{$1}+$2;
|
||||
}
|
||||
|
||||
return sprintf ".word\t0x%08x !%s",
|
||||
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
|
||||
$ref;
|
||||
} else {
|
||||
return $ref;
|
||||
}
|
||||
}
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
|
||||
s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
|
||||
&unvis3($1,$2,$3,$4)
|
||||
/ge;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT;
|
1404
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-x86.pl
vendored
Normal file
File diff suppressed because it is too large
Load diff
1816
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghash-x86_64.pl
vendored
Normal file
File diff suppressed because it is too large
Load diff
671
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghashp8-ppc.pl
vendored
Executable file
@ -0,0 +1,671 @@
#! /usr/bin/env perl
|
||||
# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
# POWER9 delivers 0.51 cpb.

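# Illustrative only (never called): a plain-Perl schematic of the "aggregated
# reduction" mentioned above.  Four input blocks are combined against H^4..H^1
# and only one modular reduction is performed per group.  Math::BigInt stands
# in for the 128-bit vector registers, the helper names are made up for this
# sketch, and GHASH's bit-reflected conventions are glossed over.
use Math::BigInt;

sub _clmul128 {                         # carry-less multiply, no reduction
    my ($a, $b) = @_;
    my $r = Math::BigInt->bzero();
    for my $i (0 .. 127) {
        $r->bxor($a->copy->blsft($i))
            if $b->copy->brsft($i)->band(1)->is_one();
    }
    return $r;
}

sub _reduce128 {                        # reduce mod x^128+x^7+x^2+x+1
    my ($v) = @_;
    my $poly = Math::BigInt->new(1)->blsft(128)->bxor(Math::BigInt->new(0x87));
    for (my $i = 255; $i >= 128; $i--) {
        $v->bxor($poly->copy->blsft($i - 128))
            if $v->copy->brsft($i)->band(1)->is_one();
    }
    return $v;
}

sub _ghash_4x_schematic {
    my ($Xi, $H, @blocks) = @_;         # Math::BigInt values, @blocks%4 == 0
    my $H2 = _reduce128(_clmul128($H,  $H));
    my $H3 = _reduce128(_clmul128($H2, $H));
    my $H4 = _reduce128(_clmul128($H3, $H));
    while (@blocks) {
        my ($b0, $b1, $b2, $b3) = splice(@blocks, 0, 4);
        my $acc = _clmul128($Xi->copy->bxor($b0), $H4);
        $acc->bxor(_clmul128($b1, $H3));
        $acc->bxor(_clmul128($b2, $H2));
        $acc->bxor(_clmul128($b3, $H));
        $Xi = _reduce128($acc);         # a single reduction per four blocks
    }
    return $Xi;
}
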
$flavour=shift;
|
||||
$output =shift;
|
||||
|
||||
if ($flavour =~ /64/) {
|
||||
$SIZE_T=8;
|
||||
$LRSAVE=2*$SIZE_T;
|
||||
$STU="stdu";
|
||||
$POP="ld";
|
||||
$PUSH="std";
|
||||
$UCMP="cmpld";
|
||||
$SHRI="srdi";
|
||||
} elsif ($flavour =~ /32/) {
|
||||
$SIZE_T=4;
|
||||
$LRSAVE=$SIZE_T;
|
||||
$STU="stwu";
|
||||
$POP="lwz";
|
||||
$PUSH="stw";
|
||||
$UCMP="cmplw";
|
||||
$SHRI="srwi";
|
||||
} else { die "nonsense $flavour"; }
|
||||
|
||||
$sp="r1";
|
||||
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
||||
die "can't locate ppc-xlate.pl";
|
||||
|
||||
open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";
|
||||
|
||||
my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block
|
||||
|
||||
my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
|
||||
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
|
||||
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
|
||||
my $vrsave="r12";
|
||||
|
||||
$code=<<___;
|
||||
.machine "any"
|
||||
|
||||
.text
|
||||
|
||||
.globl .gcm_init_p8
|
||||
.align 5
|
||||
.gcm_init_p8:
|
||||
li r0,-4096
|
||||
li r8,0x10
|
||||
mfspr $vrsave,256
|
||||
li r9,0x20
|
||||
mtspr 256,r0
|
||||
li r10,0x30
|
||||
lvx_u $H,0,r4 # load H
|
||||
|
||||
vspltisb $xC2,-16 # 0xf0
|
||||
vspltisb $t0,1 # one
|
||||
vaddubm $xC2,$xC2,$xC2 # 0xe0
|
||||
vxor $zero,$zero,$zero
|
||||
vor $xC2,$xC2,$t0 # 0xe1
|
||||
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
|
||||
vsldoi $t1,$zero,$t0,1 # ...1
|
||||
vaddubm $xC2,$xC2,$xC2 # 0xc2...
|
||||
vspltisb $t2,7
|
||||
vor $xC2,$xC2,$t1 # 0xc2....01
|
||||
vspltb $t1,$H,0 # most significant byte
|
||||
vsl $H,$H,$t0 # H<<=1
|
||||
vsrab $t1,$t1,$t2 # broadcast carry bit
|
||||
vand $t1,$t1,$xC2
|
||||
vxor $IN,$H,$t1 # twisted H
|
||||
|
||||
vsldoi $H,$IN,$IN,8 # twist even more ...
|
||||
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
|
||||
vsldoi $Hl,$zero,$H,8 # ... and split
|
||||
vsldoi $Hh,$H,$zero,8
|
||||
|
||||
stvx_u $xC2,0,r3 # save pre-computed table
|
||||
stvx_u $Hl,r8,r3
|
||||
li r8,0x40
|
||||
stvx_u $H, r9,r3
|
||||
li r9,0x50
|
||||
stvx_u $Hh,r10,r3
|
||||
li r10,0x60
|
||||
|
||||
vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
|
||||
vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
|
||||
vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
|
||||
|
||||
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
|
||||
|
||||
vsldoi $t0,$Xm,$zero,8
|
||||
vsldoi $t1,$zero,$Xm,8
|
||||
vxor $Xl,$Xl,$t0
|
||||
vxor $Xh,$Xh,$t1
|
||||
|
||||
vsldoi $Xl,$Xl,$Xl,8
|
||||
vxor $Xl,$Xl,$t2
|
||||
|
||||
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
|
||||
vpmsumd $Xl,$Xl,$xC2
|
||||
vxor $t1,$t1,$Xh
|
||||
vxor $IN1,$Xl,$t1
|
||||
|
||||
vsldoi $H2,$IN1,$IN1,8
|
||||
vsldoi $H2l,$zero,$H2,8
|
||||
vsldoi $H2h,$H2,$zero,8
|
||||
|
||||
stvx_u $H2l,r8,r3 # save H^2
|
||||
li r8,0x70
|
||||
stvx_u $H2,r9,r3
|
||||
li r9,0x80
|
||||
stvx_u $H2h,r10,r3
|
||||
li r10,0x90
|
||||
___
|
||||
{
|
||||
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
|
||||
$code.=<<___;
|
||||
vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
|
||||
vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
|
||||
vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
|
||||
vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
|
||||
vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
|
||||
vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
|
||||
|
||||
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
|
||||
vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
|
||||
|
||||
vsldoi $t0,$Xm,$zero,8
|
||||
vsldoi $t1,$zero,$Xm,8
|
||||
vsldoi $t4,$Xm1,$zero,8
|
||||
vsldoi $t5,$zero,$Xm1,8
|
||||
vxor $Xl,$Xl,$t0
|
||||
vxor $Xh,$Xh,$t1
|
||||
vxor $Xl1,$Xl1,$t4
|
||||
vxor $Xh1,$Xh1,$t5
|
||||
|
||||
vsldoi $Xl,$Xl,$Xl,8
|
||||
vsldoi $Xl1,$Xl1,$Xl1,8
|
||||
vxor $Xl,$Xl,$t2
|
||||
vxor $Xl1,$Xl1,$t6
|
||||
|
||||
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
|
||||
vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
|
||||
vpmsumd $Xl,$Xl,$xC2
|
||||
vpmsumd $Xl1,$Xl1,$xC2
|
||||
vxor $t1,$t1,$Xh
|
||||
vxor $t5,$t5,$Xh1
|
||||
vxor $Xl,$Xl,$t1
|
||||
vxor $Xl1,$Xl1,$t5
|
||||
|
||||
vsldoi $H,$Xl,$Xl,8
|
||||
vsldoi $H2,$Xl1,$Xl1,8
|
||||
vsldoi $Hl,$zero,$H,8
|
||||
vsldoi $Hh,$H,$zero,8
|
||||
vsldoi $H2l,$zero,$H2,8
|
||||
vsldoi $H2h,$H2,$zero,8
|
||||
|
||||
stvx_u $Hl,r8,r3 # save H^3
|
||||
li r8,0xa0
|
||||
stvx_u $H,r9,r3
|
||||
li r9,0xb0
|
||||
stvx_u $Hh,r10,r3
|
||||
li r10,0xc0
|
||||
stvx_u $H2l,r8,r3 # save H^4
|
||||
stvx_u $H2,r9,r3
|
||||
stvx_u $H2h,r10,r3
|
||||
|
||||
mtspr 256,$vrsave
|
||||
blr
|
||||
.long 0
|
||||
.byte 0,12,0x14,0,0,0,2,0
|
||||
.long 0
|
||||
.size .gcm_init_p8,.-.gcm_init_p8
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.globl .gcm_gmult_p8
|
||||
.align 5
|
||||
.gcm_gmult_p8:
|
||||
lis r0,0xfff8
|
||||
li r8,0x10
|
||||
mfspr $vrsave,256
|
||||
li r9,0x20
|
||||
mtspr 256,r0
|
||||
li r10,0x30
|
||||
lvx_u $IN,0,$Xip # load Xi
|
||||
|
||||
lvx_u $Hl,r8,$Htbl # load pre-computed table
|
||||
le?lvsl $lemask,r0,r0
|
||||
lvx_u $H, r9,$Htbl
|
||||
le?vspltisb $t0,0x07
|
||||
lvx_u $Hh,r10,$Htbl
|
||||
le?vxor $lemask,$lemask,$t0
|
||||
lvx_u $xC2,0,$Htbl
|
||||
le?vperm $IN,$IN,$IN,$lemask
|
||||
vxor $zero,$zero,$zero
|
||||
|
||||
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
|
||||
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
|
||||
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
|
||||
|
||||
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
|
||||
|
||||
vsldoi $t0,$Xm,$zero,8
|
||||
vsldoi $t1,$zero,$Xm,8
|
||||
vxor $Xl,$Xl,$t0
|
||||
vxor $Xh,$Xh,$t1
|
||||
|
||||
vsldoi $Xl,$Xl,$Xl,8
|
||||
vxor $Xl,$Xl,$t2
|
||||
|
||||
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
|
||||
vpmsumd $Xl,$Xl,$xC2
|
||||
vxor $t1,$t1,$Xh
|
||||
vxor $Xl,$Xl,$t1
|
||||
|
||||
le?vperm $Xl,$Xl,$Xl,$lemask
|
||||
stvx_u $Xl,0,$Xip # write out Xi
|
||||
|
||||
mtspr 256,$vrsave
|
||||
blr
|
||||
.long 0
|
||||
.byte 0,12,0x14,0,0,0,2,0
|
||||
.long 0
|
||||
.size .gcm_gmult_p8,.-.gcm_gmult_p8
|
||||
|
||||
.globl .gcm_ghash_p8
|
||||
.align 5
|
||||
.gcm_ghash_p8:
|
||||
li r0,-4096
|
||||
li r8,0x10
|
||||
mfspr $vrsave,256
|
||||
li r9,0x20
|
||||
mtspr 256,r0
|
||||
li r10,0x30
|
||||
lvx_u $Xl,0,$Xip # load Xi
|
||||
|
||||
lvx_u $Hl,r8,$Htbl # load pre-computed table
|
||||
li r8,0x40
|
||||
le?lvsl $lemask,r0,r0
|
||||
lvx_u $H, r9,$Htbl
|
||||
li r9,0x50
|
||||
le?vspltisb $t0,0x07
|
||||
lvx_u $Hh,r10,$Htbl
|
||||
li r10,0x60
|
||||
le?vxor $lemask,$lemask,$t0
|
||||
lvx_u $xC2,0,$Htbl
|
||||
le?vperm $Xl,$Xl,$Xl,$lemask
|
||||
vxor $zero,$zero,$zero
|
||||
|
||||
${UCMP}i $len,64
|
||||
bge Lgcm_ghash_p8_4x
|
||||
|
||||
lvx_u $IN,0,$inp
|
||||
addi $inp,$inp,16
|
||||
subic. $len,$len,16
|
||||
le?vperm $IN,$IN,$IN,$lemask
|
||||
vxor $IN,$IN,$Xl
|
||||
beq Lshort
|
||||
|
||||
lvx_u $H2l,r8,$Htbl # load H^2
|
||||
li r8,16
|
||||
lvx_u $H2, r9,$Htbl
|
||||
add r9,$inp,$len # end of input
|
||||
lvx_u $H2h,r10,$Htbl
|
||||
be?b Loop_2x
|
||||
|
||||
.align 5
|
||||
Loop_2x:
|
||||
lvx_u $IN1,0,$inp
|
||||
le?vperm $IN1,$IN1,$IN1,$lemask
|
||||
|
||||
subic $len,$len,32
|
||||
vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
|
||||
vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
|
||||
subfe r0,r0,r0 # borrow?-1:0
|
||||
vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
|
||||
vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
|
||||
and r0,r0,$len
|
||||
vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
|
||||
vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
|
||||
add $inp,$inp,r0
|
||||
|
||||
vxor $Xl,$Xl,$Xl1
|
||||
vxor $Xm,$Xm,$Xm1
|
||||
|
||||
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
|
||||
|
||||
vsldoi $t0,$Xm,$zero,8
|
||||
vsldoi $t1,$zero,$Xm,8
|
||||
vxor $Xh,$Xh,$Xh1
|
||||
vxor $Xl,$Xl,$t0
|
||||
vxor $Xh,$Xh,$t1
|
||||
|
||||
vsldoi $Xl,$Xl,$Xl,8
|
||||
vxor $Xl,$Xl,$t2
|
||||
lvx_u $IN,r8,$inp
|
||||
addi $inp,$inp,32
|
||||
|
||||
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
|
||||
vpmsumd $Xl,$Xl,$xC2
|
||||
le?vperm $IN,$IN,$IN,$lemask
|
||||
vxor $t1,$t1,$Xh
|
||||
vxor $IN,$IN,$t1
|
||||
vxor $IN,$IN,$Xl
|
||||
$UCMP r9,$inp
|
||||
bgt Loop_2x # done yet?
|
||||
|
||||
cmplwi $len,0
|
||||
bne Leven
|
||||
|
||||
Lshort:
|
||||
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
|
||||
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
|
||||
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
|
||||
|
||||
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
|
||||
|
||||
vsldoi $t0,$Xm,$zero,8
|
||||
vsldoi $t1,$zero,$Xm,8
|
||||
vxor $Xl,$Xl,$t0
|
||||
vxor $Xh,$Xh,$t1
|
||||
|
||||
vsldoi $Xl,$Xl,$Xl,8
|
||||
vxor $Xl,$Xl,$t2
|
||||
|
||||
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
|
||||
vpmsumd $Xl,$Xl,$xC2
|
||||
vxor $t1,$t1,$Xh
|
||||
|
||||
Leven:
|
||||
vxor $Xl,$Xl,$t1
|
||||
le?vperm $Xl,$Xl,$Xl,$lemask
|
||||
stvx_u $Xl,0,$Xip # write out Xi
|
||||
|
||||
mtspr 256,$vrsave
|
||||
blr
|
||||
.long 0
|
||||
.byte 0,12,0x14,0,0,0,4,0
|
||||
.long 0
|
||||
___
|
||||
{
|
||||
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
|
||||
$Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
|
||||
my $IN0=$IN;
|
||||
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
|
||||
|
||||
$code.=<<___;
|
||||
.align 5
|
||||
.gcm_ghash_p8_4x:
|
||||
Lgcm_ghash_p8_4x:
|
||||
$STU $sp,-$FRAME($sp)
|
||||
li r10,`15+6*$SIZE_T`
|
||||
li r11,`31+6*$SIZE_T`
|
||||
stvx v20,r10,$sp
|
||||
addi r10,r10,32
|
||||
stvx v21,r11,$sp
|
||||
addi r11,r11,32
|
||||
stvx v22,r10,$sp
|
||||
addi r10,r10,32
|
||||
stvx v23,r11,$sp
|
||||
addi r11,r11,32
|
||||
stvx v24,r10,$sp
|
||||
addi r10,r10,32
|
||||
stvx v25,r11,$sp
|
||||
addi r11,r11,32
|
||||
stvx v26,r10,$sp
|
||||
addi r10,r10,32
|
||||
stvx v27,r11,$sp
|
||||
addi r11,r11,32
|
||||
stvx v28,r10,$sp
|
||||
addi r10,r10,32
|
||||
stvx v29,r11,$sp
|
||||
addi r11,r11,32
|
||||
stvx v30,r10,$sp
|
||||
li r10,0x60
|
||||
stvx v31,r11,$sp
|
||||
li r0,-1
|
||||
stw $vrsave,`$FRAME-4`($sp) # save vrsave
|
||||
mtspr 256,r0 # preserve all AltiVec registers
|
||||
|
||||
lvsl $t0,0,r8 # 0x0001..0e0f
|
||||
#lvx_u $H2l,r8,$Htbl # load H^2
|
||||
li r8,0x70
|
||||
lvx_u $H2, r9,$Htbl
|
||||
li r9,0x80
|
||||
vspltisb $t1,8 # 0x0808..0808
|
||||
#lvx_u $H2h,r10,$Htbl
|
||||
li r10,0x90
|
||||
lvx_u $H3l,r8,$Htbl # load H^3
|
||||
li r8,0xa0
|
||||
lvx_u $H3, r9,$Htbl
|
||||
li r9,0xb0
|
||||
lvx_u $H3h,r10,$Htbl
|
||||
li r10,0xc0
|
||||
lvx_u $H4l,r8,$Htbl # load H^4
|
||||
li r8,0x10
|
||||
lvx_u $H4, r9,$Htbl
|
||||
li r9,0x20
|
||||
lvx_u $H4h,r10,$Htbl
|
||||
li r10,0x30
|
||||
|
||||
vsldoi $t2,$zero,$t1,8 # 0x0000..0808
|
||||
vaddubm $hiperm,$t0,$t2 # 0x0001..1617
|
||||
vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
|
||||
|
||||
$SHRI $len,$len,4 # this allows to use sign bit
|
||||
# as carry
|
||||
lvx_u $IN0,0,$inp # load input
|
||||
lvx_u $IN1,r8,$inp
|
||||
subic. $len,$len,8
|
||||
lvx_u $IN2,r9,$inp
|
||||
lvx_u $IN3,r10,$inp
|
||||
addi $inp,$inp,0x40
|
||||
le?vperm $IN0,$IN0,$IN0,$lemask
|
||||
le?vperm $IN1,$IN1,$IN1,$lemask
|
||||
le?vperm $IN2,$IN2,$IN2,$lemask
|
||||
le?vperm $IN3,$IN3,$IN3,$lemask
|
||||
|
||||
vxor $Xh,$IN0,$Xl
|
||||
|
||||
vpmsumd $Xl1,$IN1,$H3l
|
||||
vpmsumd $Xm1,$IN1,$H3
|
||||
vpmsumd $Xh1,$IN1,$H3h
|
||||
|
||||
vperm $H21l,$H2,$H,$hiperm
|
||||
vperm $t0,$IN2,$IN3,$loperm
|
||||
vperm $H21h,$H2,$H,$loperm
|
||||
vperm $t1,$IN2,$IN3,$hiperm
|
||||
vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
|
||||
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
|
||||
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
|
||||
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
|
||||
|
||||
vxor $Xm2,$Xm2,$Xm1
|
||||
vxor $Xl3,$Xl3,$Xl1
|
||||
vxor $Xm3,$Xm3,$Xm2
|
||||
vxor $Xh3,$Xh3,$Xh1
|
||||
|
||||
blt Ltail_4x
|
||||
|
||||
Loop_4x:
|
||||
lvx_u $IN0,0,$inp
|
||||
lvx_u $IN1,r8,$inp
|
||||
subic. $len,$len,4
|
||||
lvx_u $IN2,r9,$inp
|
||||
lvx_u $IN3,r10,$inp
|
||||
addi $inp,$inp,0x40
|
||||
le?vperm $IN1,$IN1,$IN1,$lemask
|
||||
le?vperm $IN2,$IN2,$IN2,$lemask
|
||||
le?vperm $IN3,$IN3,$IN3,$lemask
|
||||
le?vperm $IN0,$IN0,$IN0,$lemask
|
||||
|
||||
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
|
||||
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
|
||||
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
|
||||
vpmsumd $Xl1,$IN1,$H3l
|
||||
vpmsumd $Xm1,$IN1,$H3
|
||||
vpmsumd $Xh1,$IN1,$H3h
|
||||
|
||||
vxor $Xl,$Xl,$Xl3
|
||||
vxor $Xm,$Xm,$Xm3
|
||||
vxor $Xh,$Xh,$Xh3
|
||||
vperm $t0,$IN2,$IN3,$loperm
|
||||
vperm $t1,$IN2,$IN3,$hiperm
|
||||
|
||||
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
|
||||
vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
|
||||
vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
|
||||
|
||||
vsldoi $t0,$Xm,$zero,8
|
||||
vsldoi $t1,$zero,$Xm,8
|
||||
vxor $Xl,$Xl,$t0
|
||||
vxor $Xh,$Xh,$t1
|
||||
|
||||
vsldoi $Xl,$Xl,$Xl,8
|
||||
vxor $Xl,$Xl,$t2
|
||||
|
||||
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
|
||||
vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
|
||||
vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
|
||||
vpmsumd $Xl,$Xl,$xC2
|
||||
|
||||
vxor $Xl3,$Xl3,$Xl1
|
||||
vxor $Xh3,$Xh3,$Xh1
|
||||
vxor $Xh,$Xh,$IN0
|
||||
vxor $Xm2,$Xm2,$Xm1
|
||||
vxor $Xh,$Xh,$t1
|
||||
vxor $Xm3,$Xm3,$Xm2
|
||||
vxor $Xh,$Xh,$Xl
|
||||
bge Loop_4x
|
||||
|
||||
Ltail_4x:
|
||||
vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
|
||||
vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
|
||||
vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
|
||||
|
||||
vxor $Xl,$Xl,$Xl3
|
||||
vxor $Xm,$Xm,$Xm3
|
||||
|
||||
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
|
||||
|
||||
vsldoi $t0,$Xm,$zero,8
|
||||
vsldoi $t1,$zero,$Xm,8
|
||||
vxor $Xh,$Xh,$Xh3
|
||||
vxor $Xl,$Xl,$t0
|
||||
vxor $Xh,$Xh,$t1
|
||||
|
||||
vsldoi $Xl,$Xl,$Xl,8
|
||||
vxor $Xl,$Xl,$t2
|
||||
|
||||
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
|
||||
vpmsumd $Xl,$Xl,$xC2
|
||||
vxor $t1,$t1,$Xh
|
||||
vxor $Xl,$Xl,$t1
|
||||
|
||||
addic. $len,$len,4
|
||||
beq Ldone_4x
|
||||
|
||||
lvx_u $IN0,0,$inp
|
||||
${UCMP}i $len,2
|
||||
li $len,-4
|
||||
blt Lone
|
||||
lvx_u $IN1,r8,$inp
|
||||
beq Ltwo
|
||||
|
||||
Lthree:
|
||||
lvx_u $IN2,r9,$inp
|
||||
le?vperm $IN0,$IN0,$IN0,$lemask
|
||||
le?vperm $IN1,$IN1,$IN1,$lemask
|
||||
le?vperm $IN2,$IN2,$IN2,$lemask
|
||||
|
||||
vxor $Xh,$IN0,$Xl
|
||||
vmr $H4l,$H3l
|
||||
vmr $H4, $H3
|
||||
vmr $H4h,$H3h
|
||||
|
||||
vperm $t0,$IN1,$IN2,$loperm
|
||||
vperm $t1,$IN1,$IN2,$hiperm
|
||||
vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
|
||||
vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
|
||||
vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
|
||||
vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
|
||||
|
||||
vxor $Xm3,$Xm3,$Xm2
|
||||
b Ltail_4x
|
||||
|
||||
.align 4
|
||||
Ltwo:
|
||||
le?vperm $IN0,$IN0,$IN0,$lemask
|
||||
le?vperm $IN1,$IN1,$IN1,$lemask
|
||||
|
||||
vxor $Xh,$IN0,$Xl
|
||||
vperm $t0,$zero,$IN1,$loperm
|
||||
vperm $t1,$zero,$IN1,$hiperm
|
||||
|
||||
vsldoi $H4l,$zero,$H2,8
|
||||
vmr $H4, $H2
|
||||
vsldoi $H4h,$H2,$zero,8
|
||||
|
||||
vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
|
||||
vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
|
||||
vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi
|
||||
|
||||
b Ltail_4x
|
||||
|
||||
.align 4
|
||||
Lone:
|
||||
le?vperm $IN0,$IN0,$IN0,$lemask
|
||||
|
||||
vsldoi $H4l,$zero,$H,8
|
||||
vmr $H4, $H
|
||||
vsldoi $H4h,$H,$zero,8
|
||||
|
||||
vxor $Xh,$IN0,$Xl
|
||||
vxor $Xl3,$Xl3,$Xl3
|
||||
vxor $Xm3,$Xm3,$Xm3
|
||||
vxor $Xh3,$Xh3,$Xh3
|
||||
|
||||
b Ltail_4x
|
||||
|
||||
Ldone_4x:
|
||||
le?vperm $Xl,$Xl,$Xl,$lemask
|
||||
stvx_u $Xl,0,$Xip # write out Xi
|
||||
|
||||
li r10,`15+6*$SIZE_T`
|
||||
li r11,`31+6*$SIZE_T`
|
||||
mtspr 256,$vrsave
|
||||
lvx v20,r10,$sp
|
||||
addi r10,r10,32
|
||||
lvx v21,r11,$sp
|
||||
addi r11,r11,32
|
||||
lvx v22,r10,$sp
|
||||
addi r10,r10,32
|
||||
lvx v23,r11,$sp
|
||||
addi r11,r11,32
|
||||
lvx v24,r10,$sp
|
||||
addi r10,r10,32
|
||||
lvx v25,r11,$sp
|
||||
addi r11,r11,32
|
||||
lvx v26,r10,$sp
|
||||
addi r10,r10,32
|
||||
lvx v27,r11,$sp
|
||||
addi r11,r11,32
|
||||
lvx v28,r10,$sp
|
||||
addi r10,r10,32
|
||||
lvx v29,r11,$sp
|
||||
addi r11,r11,32
|
||||
lvx v30,r10,$sp
|
||||
lvx v31,r11,$sp
|
||||
addi $sp,$sp,$FRAME
|
||||
blr
|
||||
.long 0
|
||||
.byte 0,12,0x04,0,0x80,0,4,0
|
||||
.long 0
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.size .gcm_ghash_p8,.-.gcm_ghash_p8
|
||||
|
||||
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
if ($flavour =~ /le$/o) { # little-endian
|
||||
s/le\?//o or
|
||||
s/be\?/#be#/o;
|
||||
} else {
|
||||
s/le\?/#le#/o or
|
||||
s/be\?//o;
|
||||
}
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
close STDOUT; # enforce flush
|
781
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghashv8-armx.pl
vendored
Normal file
781
trunk/3rdparty/openssl-1.1-fit/crypto/modes/asm/ghashv8-armx.pl
vendored
Normal file
|
@@ -0,0 +1,781 @@
|
|||
#! /usr/bin/env perl
|
||||
# Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the OpenSSL license (the "License"). You may not use
|
||||
# this file except in compliance with the License. You can obtain a copy
|
||||
# in the file LICENSE in the source distribution or at
|
||||
# https://www.openssl.org/source/license.html
|
||||
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
|
||||
#
|
||||
# June 2014
|
||||
#
|
||||
# Initial version was developed in tight cooperation with Ard
|
||||
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
|
||||
# Just like aesv8-armx.pl this module supports both AArch32 and
|
||||
# AArch64 execution modes.
|
||||
#
|
||||
# July 2014
|
||||
#
|
||||
# Implement 2x aggregated reduction [see ghash-x86.pl for background
|
||||
# information].
|
||||
#
|
||||
# November 2017
|
||||
#
|
||||
# AArch64 register bank to "accommodate" 4x aggregated reduction and
|
||||
# improve performance by 20-70% depending on processor.
|
||||
#
|
||||
# Current performance in cycles per processed byte:
|
||||
#
|
||||
# 64-bit PMULL 32-bit PMULL 32-bit NEON(*)
|
||||
# Apple A7 0.58 0.92 5.62
|
||||
# Cortex-A53 0.85 1.01 8.39
|
||||
# Cortex-A57 0.73 1.17 7.61
|
||||
# Denver 0.51 0.65 6.02
|
||||
# Mongoose 0.65 1.10 8.06
|
||||
# Kryo 0.76 1.16 8.00
|
||||
#
|
||||
# (*) presented for reference/comparison purposes;
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||
die "can't locate arm-xlate.pl";
|
||||
|
||||
open OUT,"| \"$^X\" $xlate $flavour $output";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
$Xi="x0"; # argument block
|
||||
$Htbl="x1";
|
||||
$inp="x2";
|
||||
$len="x3";
|
||||
|
||||
$inc="x12";
|
||||
|
||||
{
|
||||
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
|
||||
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
#if __ARM_MAX_ARCH__>=7
|
||||
.text
|
||||
___
|
||||
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
.fpu neon
|
||||
.code 32
|
||||
#undef __thumb2__
|
||||
___
|
||||
|
||||
################################################################################
|
||||
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
|
||||
#
|
||||
# input: 128-bit H - secret parameter E(K,0^128)
|
||||
# output: precomputed table filled with degrees of twisted H;
|
||||
# H is twisted to handle reverse bitness of GHASH;
|
||||
# only few of 16 slots of Htable[16] are used;
|
||||
# data is opaque to outside world (which allows to
|
||||
# optimize the code independently);
|
||||
#
|
||||
$code.=<<___;
|
||||
.global gcm_init_v8
|
||||
.type gcm_init_v8,%function
|
||||
.align 4
|
||||
gcm_init_v8:
|
||||
vld1.64 {$t1},[x1] @ load input H
|
||||
vmov.i8 $xC2,#0xe1
|
||||
vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
vshr.u64 $t2,$xC2,#63
|
||||
vdup.32 $t1,${t1}[1]
|
||||
vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01
|
||||
vshr.u64 $t2,$IN,#63
|
||||
vshr.s32 $t1,$t1,#31 @ broadcast carry bit
|
||||
vand $t2,$t2,$t0
|
||||
vshl.i64 $IN,$IN,#1
|
||||
vext.8 $t2,$t2,$t2,#8
|
||||
vand $t0,$t0,$t1
|
||||
vorr $IN,$IN,$t2 @ H<<<=1
|
||||
veor $H,$IN,$t0 @ twisted H
|
||||
vst1.64 {$H},[x0],#16 @ store Htable[0]
|
||||
|
||||
@ calculate H^2
|
||||
vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing
|
||||
vpmull.p64 $Xl,$H,$H
|
||||
veor $t0,$t0,$H
|
||||
vpmull2.p64 $Xh,$H,$H
|
||||
vpmull.p64 $Xm,$t0,$t0
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $H2,$Xl,$t2
|
||||
|
||||
vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing
|
||||
veor $t1,$t1,$H2
|
||||
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
|
||||
vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2]
|
||||
___
|
||||
if ($flavour =~ /64/) {
|
||||
my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
|
||||
|
||||
$code.=<<___;
|
||||
@ calculate H^3 and H^4
|
||||
vpmull.p64 $Xl,$H, $H2
|
||||
vpmull.p64 $Yl,$H2,$H2
|
||||
vpmull2.p64 $Xh,$H, $H2
|
||||
vpmull2.p64 $Yh,$H2,$H2
|
||||
vpmull.p64 $Xm,$t0,$t1
|
||||
vpmull.p64 $Ym,$t1,$t1
|
||||
|
||||
vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
vext.8 $t1,$Yl,$Yh,#8
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t0
|
||||
veor $t3,$Yl,$Yh
|
||||
veor $Ym,$Ym,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase
|
||||
veor $Ym,$Ym,$t3
|
||||
vpmull.p64 $t3,$Yl,$xC2
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Yh#lo,$Ym#hi
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
vmov $Ym#hi,$Yl#lo
|
||||
veor $Xl,$Xm,$t2
|
||||
veor $Yl,$Ym,$t3
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase
|
||||
vext.8 $t3,$Yl,$Yl,#8
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
vpmull.p64 $Yl,$Yl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $t3,$t3,$Yh
|
||||
veor $H, $Xl,$t2 @ H^3
|
||||
veor $H2,$Yl,$t3 @ H^4
|
||||
|
||||
vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing
|
||||
vext.8 $t1,$H2,$H2,#8
|
||||
veor $t0,$t0,$H
|
||||
veor $t1,$t1,$H2
|
||||
vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed
|
||||
vst1.64 {$H-$H2},[x0] @ store Htable[3..5]
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size gcm_init_v8,.-gcm_init_v8
|
||||
___
|
||||
################################################################################
|
||||
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
|
||||
#
|
||||
# input: Xi - current hash value;
|
||||
# Htable - table precomputed in gcm_init_v8;
|
||||
# output: Xi - next hash value Xi;
|
||||
#
|
||||
$code.=<<___;
|
||||
.global gcm_gmult_v8
|
||||
.type gcm_gmult_v8,%function
|
||||
.align 4
|
||||
gcm_gmult_v8:
|
||||
vld1.64 {$t1},[$Xi] @ load Xi
|
||||
vmov.i8 $xC2,#0xe1
|
||||
vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
|
||||
vshl.u64 $xC2,$xC2,#57
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
vext.8 $IN,$t1,$t1,#8
|
||||
|
||||
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
|
||||
veor $t1,$t1,$IN @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
|
||||
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
vst1.64 {$Xl},[$Xi] @ write out Xi
|
||||
|
||||
ret
|
||||
.size gcm_gmult_v8,.-gcm_gmult_v8
|
||||
___
|
||||
################################################################################
|
||||
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||||
#
|
||||
# input: table precomputed in gcm_init_v8;
|
||||
# current hash value Xi;
|
||||
# pointer to input data;
|
||||
# length of input data in bytes, but divisible by block size;
|
||||
# output: next hash value Xi;
|
||||
#
|
||||
$code.=<<___;
|
||||
.global gcm_ghash_v8
|
||||
.type gcm_ghash_v8,%function
|
||||
.align 4
|
||||
gcm_ghash_v8:
|
||||
___
|
||||
$code.=<<___ if ($flavour =~ /64/);
|
||||
cmp $len,#64
|
||||
b.hs .Lgcm_ghash_v8_4x
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
vstmdb sp!,{d8-d15} @ 32-bit ABI says so
|
||||
___
|
||||
$code.=<<___;
|
||||
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
|
||||
@ "[rotated]" means that
|
||||
@ loaded value would have
|
||||
@ to be rotated in order to
|
||||
@ make it appear as in
|
||||
@ algorithm specification
|
||||
subs $len,$len,#32 @ see if $len is 32 or larger
|
||||
mov $inc,#16 @ $inc is used as post-
|
||||
@ increment for input pointer;
|
||||
@ as loop is modulo-scheduled
|
||||
@ $inc is zeroed just in time
|
||||
@ to preclude overstepping
|
||||
@ inp[len], which means that
|
||||
@ last block[s] are actually
|
||||
@ loaded twice, but last
|
||||
@ copy is not processed
|
||||
vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2
|
||||
vmov.i8 $xC2,#0xe1
|
||||
vld1.64 {$H2},[$Htbl]
|
||||
cclr $inc,eq @ is it time to zero $inc?
|
||||
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi
|
||||
vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0]
|
||||
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t0,$t0
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vext.8 $IN,$t0,$t0,#8 @ rotate I[0]
|
||||
b.lo .Lodd_tail_v8 @ $len was less than 32
|
||||
___
|
||||
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
|
||||
#######
|
||||
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
||||
# [(H*Ii+1) + (H*Xi+1)] mod P =
|
||||
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
||||
#
|
||||
$code.=<<___;
|
||||
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1]
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
vext.8 $In,$t1,$t1,#8
|
||||
veor $IN,$IN,$Xl @ I[i]^=Xi
|
||||
vpmull.p64 $Xln,$H,$In @ H·Ii+1
|
||||
veor $t1,$t1,$In @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xhn,$H,$In
|
||||
b .Loop_mod2x_v8
|
||||
|
||||
.align 4
|
||||
.Loop_mod2x_v8:
|
||||
vext.8 $t2,$IN,$IN,#8
|
||||
subs $len,$len,#32 @ is there more data?
|
||||
vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo
|
||||
cclr $inc,lo @ is it time to zero $inc?
|
||||
|
||||
vpmull.p64 $Xmn,$Hhl,$t1
|
||||
veor $t2,$t2,$IN @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi
|
||||
veor $Xl,$Xl,$Xln @ accumulate
|
||||
vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
|
||||
vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2]
|
||||
|
||||
veor $Xh,$Xh,$Xhn
|
||||
cclr $inc,eq @ is it time to zero $inc?
|
||||
veor $Xm,$Xm,$Xmn
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3]
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t0,$t0
|
||||
#endif
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $t1,$t1
|
||||
#endif
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
vext.8 $In,$t1,$t1,#8
|
||||
vext.8 $IN,$t0,$t0,#8
|
||||
veor $Xl,$Xm,$t2
|
||||
vpmull.p64 $Xln,$H,$In @ H·Ii+1
|
||||
veor $IN,$IN,$Xh @ accumulate $IN early
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $IN,$IN,$t2
|
||||
veor $t1,$t1,$In @ Karatsuba pre-processing
|
||||
veor $IN,$IN,$Xl
|
||||
vpmull2.p64 $Xhn,$H,$In
|
||||
b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes
|
||||
|
||||
veor $Xh,$Xh,$t2
|
||||
vext.8 $IN,$t0,$t0,#8 @ re-construct $IN
|
||||
adds $len,$len,#32 @ re-construct $len
|
||||
veor $Xl,$Xl,$Xh @ re-construct $Xl
|
||||
b.eq .Ldone_v8 @ is $len zero?
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.Lodd_tail_v8:
|
||||
vext.8 $t2,$Xl,$Xl,#8
|
||||
veor $IN,$IN,$Xl @ inp^=Xi
|
||||
veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi
|
||||
|
||||
vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo
|
||||
veor $t1,$t1,$IN @ Karatsuba pre-processing
|
||||
vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi
|
||||
vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
|
||||
.Ldone_v8:
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
vst1.64 {$Xl},[$Xi] @ write out Xi
|
||||
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /64/);
|
||||
vldmia sp!,{d8-d15} @ 32-bit ABI says so
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.size gcm_ghash_v8,.-gcm_ghash_v8
|
||||
___
|
||||
|
||||
if ($flavour =~ /64/) { # 4x subroutine
|
||||
my ($I0,$j1,$j2,$j3,
|
||||
$I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
|
||||
|
||||
$code.=<<___;
|
||||
.type gcm_ghash_v8_4x,%function
|
||||
.align 4
|
||||
gcm_ghash_v8_4x:
|
||||
.Lgcm_ghash_v8_4x:
|
||||
vld1.64 {$Xl},[$Xi] @ load [rotated] Xi
|
||||
vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2
|
||||
vmov.i8 $xC2,#0xe1
|
||||
vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4
|
||||
vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant
|
||||
|
||||
vld1.64 {$I0-$j3},[$inp],#64
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
vrev64.8 $j1,$j1
|
||||
vrev64.8 $j2,$j2
|
||||
vrev64.8 $j3,$j3
|
||||
vrev64.8 $I0,$I0
|
||||
#endif
|
||||
vext.8 $I3,$j3,$j3,#8
|
||||
vext.8 $I2,$j2,$j2,#8
|
||||
vext.8 $I1,$j1,$j1,#8
|
||||
|
||||
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
|
||||
veor $j3,$j3,$I3
|
||||
vpmull2.p64 $Yh,$H,$I3
|
||||
vpmull.p64 $Ym,$Hhl,$j3
|
||||
|
||||
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
|
||||
veor $j2,$j2,$I2
|
||||
vpmull2.p64 $I2,$H2,$I2
|
||||
vpmull2.p64 $j2,$Hhl,$j2
|
||||
|
||||
veor $Yl,$Yl,$t0
|
||||
veor $Yh,$Yh,$I2
|
||||
veor $Ym,$Ym,$j2
|
||||
|
||||
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
|
||||
veor $j1,$j1,$I1
|
||||
vpmull2.p64 $I1,$H3,$I1
|
||||
vpmull.p64 $j1,$H34,$j1
|
||||
|
||||
veor $Yl,$Yl,$j3
|
||||
veor $Yh,$Yh,$I1
|
||||
veor $Ym,$Ym,$j1
|
||||
|
||||
subs $len,$len,#128
|
||||
b.lo .Ltail4x
|
||||
|
||||
b .Loop4x
|
||||
|
||||
.align 4
|
||||
.Loop4x:
|
||||
veor $t0,$I0,$Xl
|
||||
vld1.64 {$I0-$j3},[$inp],#64
|
||||
vext.8 $IN,$t0,$t0,#8
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $j1,$j1
|
||||
vrev64.8 $j2,$j2
|
||||
vrev64.8 $j3,$j3
|
||||
vrev64.8 $I0,$I0
|
||||
#endif
|
||||
|
||||
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
|
||||
veor $t0,$t0,$IN
|
||||
vpmull2.p64 $Xh,$H4,$IN
|
||||
vext.8 $I3,$j3,$j3,#8
|
||||
vpmull2.p64 $Xm,$H34,$t0
|
||||
|
||||
veor $Xl,$Xl,$Yl
|
||||
veor $Xh,$Xh,$Yh
|
||||
vext.8 $I2,$j2,$j2,#8
|
||||
veor $Xm,$Xm,$Ym
|
||||
vext.8 $I1,$j1,$j1,#8
|
||||
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
vpmull.p64 $Yl,$H,$I3 @ H·Ii+3
|
||||
veor $j3,$j3,$I3
|
||||
veor $Xm,$Xm,$t1
|
||||
vpmull2.p64 $Yh,$H,$I3
|
||||
veor $Xm,$Xm,$t2
|
||||
vpmull.p64 $Ym,$Hhl,$j3
|
||||
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2
|
||||
veor $j2,$j2,$I2
|
||||
vpmull2.p64 $I2,$H2,$I2
|
||||
veor $Xl,$Xm,$t2
|
||||
vpmull2.p64 $j2,$Hhl,$j2
|
||||
|
||||
veor $Yl,$Yl,$t0
|
||||
veor $Yh,$Yh,$I2
|
||||
veor $Ym,$Ym,$j2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1
|
||||
veor $j1,$j1,$I1
|
||||
veor $t2,$t2,$Xh
|
||||
vpmull2.p64 $I1,$H3,$I1
|
||||
vpmull.p64 $j1,$H34,$j1
|
||||
|
||||
veor $Xl,$Xl,$t2
|
||||
veor $Yl,$Yl,$j3
|
||||
veor $Yh,$Yh,$I1
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
veor $Ym,$Ym,$j1
|
||||
|
||||
subs $len,$len,#64
|
||||
b.hs .Loop4x
|
||||
|
||||
.Ltail4x:
|
||||
veor $t0,$I0,$Xl
|
||||
vext.8 $IN,$t0,$t0,#8
|
||||
|
||||
vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii)
|
||||
veor $t0,$t0,$IN
|
||||
vpmull2.p64 $Xh,$H4,$IN
|
||||
vpmull2.p64 $Xm,$H34,$t0
|
||||
|
||||
veor $Xl,$Xl,$Yl
|
||||
veor $Xh,$Xh,$Yh
|
||||
veor $Xm,$Xm,$Ym
|
||||
|
||||
adds $len,$len,#64
|
||||
b.eq .Ldone4x
|
||||
|
||||
cmp $len,#32
|
||||
b.lo .Lone
|
||||
b.eq .Ltwo
|
||||
.Lthree:
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
vld1.64 {$I0-$j2},[$inp]
|
||||
veor $Xm,$Xm,$t2
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $j1,$j1
|
||||
vrev64.8 $j2,$j2
|
||||
vrev64.8 $I0,$I0
|
||||
#endif
|
||||
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
vext.8 $I2,$j2,$j2,#8
|
||||
vext.8 $I1,$j1,$j1,#8
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vpmull.p64 $Yl,$H,$I2 @ H·Ii+2
|
||||
veor $j2,$j2,$I2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
vpmull2.p64 $Yh,$H,$I2
|
||||
vpmull.p64 $Ym,$Hhl,$j2
|
||||
veor $Xl,$Xl,$t2
|
||||
vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1
|
||||
veor $j1,$j1,$I1
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
|
||||
vpmull2.p64 $I1,$H2,$I1
|
||||
veor $t0,$I0,$Xl
|
||||
vpmull2.p64 $j1,$Hhl,$j1
|
||||
vext.8 $IN,$t0,$t0,#8
|
||||
|
||||
veor $Yl,$Yl,$j3
|
||||
veor $Yh,$Yh,$I1
|
||||
veor $Ym,$Ym,$j1
|
||||
|
||||
vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii)
|
||||
veor $t0,$t0,$IN
|
||||
vpmull2.p64 $Xh,$H3,$IN
|
||||
vpmull.p64 $Xm,$H34,$t0
|
||||
|
||||
veor $Xl,$Xl,$Yl
|
||||
veor $Xh,$Xh,$Yh
|
||||
veor $Xm,$Xm,$Ym
|
||||
b .Ldone4x
|
||||
|
||||
.align 4
|
||||
.Ltwo:
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
vld1.64 {$I0-$j1},[$inp]
|
||||
veor $Xm,$Xm,$t2
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $j1,$j1
|
||||
vrev64.8 $I0,$I0
|
||||
#endif
|
||||
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
vext.8 $I1,$j1,$j1,#8
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
|
||||
vpmull.p64 $Yl,$H,$I1 @ H·Ii+1
|
||||
veor $j1,$j1,$I1
|
||||
|
||||
veor $t0,$I0,$Xl
|
||||
vext.8 $IN,$t0,$t0,#8
|
||||
|
||||
vpmull2.p64 $Yh,$H,$I1
|
||||
vpmull.p64 $Ym,$Hhl,$j1
|
||||
|
||||
vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii)
|
||||
veor $t0,$t0,$IN
|
||||
vpmull2.p64 $Xh,$H2,$IN
|
||||
vpmull2.p64 $Xm,$Hhl,$t0
|
||||
|
||||
veor $Xl,$Xl,$Yl
|
||||
veor $Xh,$Xh,$Yh
|
||||
veor $Xm,$Xm,$Ym
|
||||
b .Ldone4x
|
||||
|
||||
.align 4
|
||||
.Lone:
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
vld1.64 {$I0},[$inp]
|
||||
veor $Xm,$Xm,$t2
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $I0,$I0
|
||||
#endif
|
||||
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
|
||||
veor $t0,$I0,$Xl
|
||||
vext.8 $IN,$t0,$t0,#8
|
||||
|
||||
vpmull.p64 $Xl,$H,$IN
|
||||
veor $t0,$t0,$IN
|
||||
vpmull2.p64 $Xh,$H,$IN
|
||||
vpmull.p64 $Xm,$Hhl,$t0
|
||||
|
||||
.Ldone4x:
|
||||
vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing
|
||||
veor $t2,$Xl,$Xh
|
||||
veor $Xm,$Xm,$t1
|
||||
veor $Xm,$Xm,$t2
|
||||
|
||||
vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction
|
||||
vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result
|
||||
vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl
|
||||
veor $Xl,$Xm,$t2
|
||||
|
||||
vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction
|
||||
vpmull.p64 $Xl,$Xl,$xC2
|
||||
veor $t2,$t2,$Xh
|
||||
veor $Xl,$Xl,$t2
|
||||
vext.8 $Xl,$Xl,$Xl,#8
|
||||
|
||||
#ifndef __ARMEB__
|
||||
vrev64.8 $Xl,$Xl
|
||||
#endif
|
||||
vst1.64 {$Xl},[$Xi] @ write out Xi
|
||||
|
||||
ret
|
||||
.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
|
||||
___
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
#endif
|
||||
___
|
||||
|
||||
if ($flavour =~ /64/) { ######## 64-bit code
|
||||
sub unvmov {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
|
||||
sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
|
||||
$3<8?$3:$3+8,($4 eq "lo")?0:1;
|
||||
}
|
||||
foreach(split("\n",$code)) {
|
||||
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
|
||||
s/vmov\.i8/movi/o or # fix up legacy mnemonics
|
||||
s/vmov\s+(.*)/unvmov($1)/geo or
|
||||
s/vext\.8/ext/o or
|
||||
s/vshr\.s/sshr\.s/o or
|
||||
s/vshr/ushr/o or
|
||||
s/^(\s+)v/$1/o or # strip off v prefix
|
||||
s/\bbx\s+lr\b/ret/o;
|
||||
|
||||
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
|
||||
s/@\s/\/\//o; # old->new style commentary
|
||||
|
||||
# fix up remaining legacy suffixes
|
||||
s/\.[ui]?8(\s)/$1/o;
|
||||
s/\.[uis]?32//o and s/\.16b/\.4s/go;
|
||||
m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument
|
||||
m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments
|
||||
s/\.[uisp]?64//o and s/\.16b/\.2d/go;
|
||||
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
} else { ######## 32-bit code
|
||||
sub unvdup32 {
|
||||
my $arg=shift;
|
||||
|
||||
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
|
||||
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
|
||||
}
|
||||
sub unvpmullp64 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
|
||||
my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2);
|
||||
$word |= 0x00010001 if ($mnemonic =~ "2");
|
||||
# since ARMv7 instructions are always encoded little-endian.
|
||||
# correct solution is to use .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
|
||||
foreach(split("\n",$code)) {
|
||||
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
|
||||
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
|
||||
s/\/\/\s?/@ /o; # new->old style commentary
|
||||
|
||||
# fix up remaining new-style suffixes
|
||||
s/\],#[0-9]+/]!/o;
|
||||
|
||||
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
|
||||
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
|
||||
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/^(\s+)b\./$1b/o or
|
||||
s/^(\s+)ret/$1bx\tlr/o;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
}
|
||||
|
||||
close STDOUT; # enforce flush
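
The module above exposes the three entry points documented in its comments: gcm_init_v8 precomputes the table of twisted powers of H, gcm_gmult_v8 multiplies the current Xi by H once, and gcm_ghash_v8 folds whole 16-byte blocks into Xi. They are normally driven by OpenSSL's generic gcm128.c rather than called directly. The wrapper below is only a hedged sketch of how the three calls compose: the ghash_update name is invented, plain <stdint.h> types stand in for the internal u64/u8/u128 typedefs from modes_lcl.h, and the prototypes are taken from the comments above rather than from a public header.

#include <stddef.h>
#include <stdint.h>

/* stand-in for OpenSSL's internal u128 (two 64-bit halves) */
typedef struct { uint64_t hi, lo; } u128;

/* prototypes as documented in the comments above (assumed, not public) */
void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
                  const uint8_t *inp, size_t len);

/*
 * Fold a buffer into the GHASH accumulator Xi: bulk data goes through
 * gcm_ghash_v8 in whole 16-byte blocks, a trailing partial block is XORed
 * into Xi (implicitly zero-padded) and finished with one gcm_gmult_v8,
 * the same pattern the generic gcm128.c uses.
 */
static void ghash_update(uint64_t Xi[2], const u128 Htable[16],
                         const uint8_t *inp, size_t len)
{
    uint8_t *xi = (uint8_t *)Xi;        /* Xi is kept byte-serialized */
    size_t blocks = len & ~(size_t)15;
    size_t i;

    if (blocks)
        gcm_ghash_v8(Xi, Htable, inp, blocks);
    if (len > blocks) {
        for (i = 0; i < len - blocks; i++)
            xi[i] ^= inp[blocks + i];
        gcm_gmult_v8(Xi, Htable);
    }
}

The table itself comes from gcm_init_v8(Htable, H) with H = E(K, 0^128), exactly as the gcm_init_v8 comment further up describes.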
|
30
trunk/3rdparty/openssl-1.1-fit/crypto/modes/build.info
vendored
Normal file
30
trunk/3rdparty/openssl-1.1-fit/crypto/modes/build.info
vendored
Normal file
|
@@ -0,0 +1,30 @@
|
|||
LIBS=../../libcrypto
|
||||
SOURCE[../../libcrypto]=\
|
||||
cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
|
||||
ccm128.c xts128.c wrap128.c ocb128.c \
|
||||
{- $target{modes_asm_src} -}
|
||||
|
||||
INCLUDE[gcm128.o]=..
|
||||
|
||||
GENERATE[ghash-ia64.s]=asm/ghash-ia64.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
|
||||
GENERATE[ghash-x86.s]=asm/ghash-x86.pl \
|
||||
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
|
||||
GENERATE[ghash-x86_64.s]=asm/ghash-x86_64.pl $(PERLASM_SCHEME)
|
||||
GENERATE[aesni-gcm-x86_64.s]=asm/aesni-gcm-x86_64.pl $(PERLASM_SCHEME)
|
||||
GENERATE[ghash-sparcv9.S]=asm/ghash-sparcv9.pl $(PERLASM_SCHEME)
|
||||
INCLUDE[ghash-sparcv9.o]=..
|
||||
GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl $(PERLASM_SCHEME)
|
||||
GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl $(PERLASM_SCHEME)
|
||||
GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl $(PERLASM_SCHEME)
|
||||
GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl $(PERLASM_SCHEME)
|
||||
INCLUDE[ghash-armv4.o]=..
|
||||
GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl $(PERLASM_SCHEME)
|
||||
INCLUDE[ghashv8-armx.o]=..
|
||||
GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl $(PERLASM_SCHEME)
|
||||
INCLUDE[ghash-s390x.o]=..
|
||||
|
||||
BEGINRAW[Makefile]
|
||||
# GNU make "catch all"
|
||||
{- $builddir -}/ghash-%.S: {- $sourcedir -}/asm/ghash-%.pl
|
||||
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
|
||||
ENDRAW[Makefile]
|
161
trunk/3rdparty/openssl-1.1-fit/crypto/modes/cbc128.c
vendored
Normal file
161
trunk/3rdparty/openssl-1.1-fit/crypto/modes/cbc128.c
vendored
Normal file
|
@@ -0,0 +1,161 @@
|
|||
/*
|
||||
* Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
#include <openssl/crypto.h>
|
||||
#include "modes_lcl.h"
|
||||
#include <string.h>
|
||||
|
||||
#if !defined(STRICT_ALIGNMENT) && !defined(PEDANTIC)
|
||||
# define STRICT_ALIGNMENT 0
|
||||
#endif
|
||||
|
||||
void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16], block128_f block)
|
||||
{
|
||||
size_t n;
|
||||
const unsigned char *iv = ivec;
|
||||
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
#if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
if (STRICT_ALIGNMENT &&
|
||||
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
|
||||
while (len >= 16) {
|
||||
for (n = 0; n < 16; ++n)
|
||||
out[n] = in[n] ^ iv[n];
|
||||
(*block) (out, out, key);
|
||||
iv = out;
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
} else {
|
||||
while (len >= 16) {
|
||||
for (n = 0; n < 16; n += sizeof(size_t))
|
||||
*(size_t *)(out + n) =
|
||||
*(size_t *)(in + n) ^ *(size_t *)(iv + n);
|
||||
(*block) (out, out, key);
|
||||
iv = out;
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
while (len) {
|
||||
for (n = 0; n < 16 && n < len; ++n)
|
||||
out[n] = in[n] ^ iv[n];
|
||||
for (; n < 16; ++n)
|
||||
out[n] = iv[n];
|
||||
(*block) (out, out, key);
|
||||
iv = out;
|
||||
if (len <= 16)
|
||||
break;
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
memcpy(ivec, iv, 16);
|
||||
}
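
CRYPTO_cbc128_encrypt implements only the chaining; the caller supplies the actual block cipher through the block128_f callback and owns the IV buffer, which is updated in place so consecutive calls continue the same CBC stream. The fragment below is a hedged sketch of a caller using the legacy <openssl/aes.h> API (essentially how AES_cbc_encrypt is layered on this helper); the cbc_demo name and the 32-byte length are arbitrary.

#include <string.h>
#include <openssl/aes.h>
#include <openssl/modes.h>

/* Encrypt 32 bytes in CBC mode with a 128-bit AES key (illustrative only;
 * real applications would normally go through EVP). */
static void cbc_demo(const unsigned char key16[16],
                     const unsigned char iv16[16],
                     const unsigned char in[32], unsigned char out[32])
{
    AES_KEY ks;
    unsigned char iv[16];

    AES_set_encrypt_key(key16, 128, &ks);
    memcpy(iv, iv16, 16);                   /* ivec is modified in place */
    CRYPTO_cbc128_encrypt(in, out, 32, &ks, iv, (block128_f)AES_encrypt);
}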
|
||||
|
||||
void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16], block128_f block)
|
||||
{
|
||||
size_t n;
|
||||
union {
|
||||
size_t t[16 / sizeof(size_t)];
|
||||
unsigned char c[16];
|
||||
} tmp;
|
||||
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
#if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
if (in != out) {
|
||||
const unsigned char *iv = ivec;
|
||||
|
||||
if (STRICT_ALIGNMENT &&
|
||||
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
|
||||
while (len >= 16) {
|
||||
(*block) (in, out, key);
|
||||
for (n = 0; n < 16; ++n)
|
||||
out[n] ^= iv[n];
|
||||
iv = in;
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
} else if (16 % sizeof(size_t) == 0) { /* always true */
|
||||
while (len >= 16) {
|
||||
size_t *out_t = (size_t *)out, *iv_t = (size_t *)iv;
|
||||
|
||||
(*block) (in, out, key);
|
||||
for (n = 0; n < 16 / sizeof(size_t); n++)
|
||||
out_t[n] ^= iv_t[n];
|
||||
iv = in;
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
}
|
||||
memcpy(ivec, iv, 16);
|
||||
} else {
|
||||
if (STRICT_ALIGNMENT &&
|
||||
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
|
||||
unsigned char c;
|
||||
while (len >= 16) {
|
||||
(*block) (in, tmp.c, key);
|
||||
for (n = 0; n < 16; ++n) {
|
||||
c = in[n];
|
||||
out[n] = tmp.c[n] ^ ivec[n];
|
||||
ivec[n] = c;
|
||||
}
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
} else if (16 % sizeof(size_t) == 0) { /* always true */
|
||||
while (len >= 16) {
|
||||
size_t c, *out_t = (size_t *)out, *ivec_t = (size_t *)ivec;
|
||||
const size_t *in_t = (const size_t *)in;
|
||||
|
||||
(*block) (in, tmp.c, key);
|
||||
for (n = 0; n < 16 / sizeof(size_t); n++) {
|
||||
c = in_t[n];
|
||||
out_t[n] = tmp.t[n] ^ ivec_t[n];
|
||||
ivec_t[n] = c;
|
||||
}
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
while (len) {
|
||||
unsigned char c;
|
||||
(*block) (in, tmp.c, key);
|
||||
for (n = 0; n < 16 && n < len; ++n) {
|
||||
c = in[n];
|
||||
out[n] = tmp.c[n] ^ ivec[n];
|
||||
ivec[n] = c;
|
||||
}
|
||||
if (len <= 16) {
|
||||
for (; n < 16; ++n)
|
||||
ivec[n] = in[n];
|
||||
break;
|
||||
}
|
||||
len -= 16;
|
||||
in += 16;
|
||||
out += 16;
|
||||
}
|
||||
}
|
432
trunk/3rdparty/openssl-1.1-fit/crypto/modes/ccm128.c
vendored
Normal file
432
trunk/3rdparty/openssl-1.1-fit/crypto/modes/ccm128.c
vendored
Normal file
|
@@ -0,0 +1,432 @@
|
|||
/*
|
||||
* Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
#include <openssl/crypto.h>
|
||||
#include "modes_lcl.h"
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* First you setup M and L parameters and pass the key schedule. This is
|
||||
* called once per session setup...
|
||||
*/
|
||||
void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
|
||||
unsigned int M, unsigned int L, void *key,
|
||||
block128_f block)
|
||||
{
|
||||
memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
|
||||
ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2) / 2) & 7) << 3;
|
||||
ctx->blocks = 0;
|
||||
ctx->block = block;
|
||||
ctx->key = key;
|
||||
}
|
||||
|
||||
/* !!! Following interfaces are to be called *once* per packet !!! */
|
||||
|
||||
/* Then you setup per-message nonce and pass the length of the message */
|
||||
int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
|
||||
const unsigned char *nonce, size_t nlen, size_t mlen)
|
||||
{
|
||||
unsigned int L = ctx->nonce.c[0] & 7; /* the L parameter */
|
||||
|
||||
if (nlen < (14 - L))
|
||||
return -1; /* nonce is too short */
|
||||
|
||||
if (sizeof(mlen) == 8 && L >= 3) {
|
||||
ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen) * 8)));
|
||||
ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen) * 8)));
|
||||
ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen) * 8)));
|
||||
ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen) * 8)));
|
||||
} else
|
||||
ctx->nonce.u[1] = 0;
|
||||
|
||||
ctx->nonce.c[12] = (u8)(mlen >> 24);
|
||||
ctx->nonce.c[13] = (u8)(mlen >> 16);
|
||||
ctx->nonce.c[14] = (u8)(mlen >> 8);
|
||||
ctx->nonce.c[15] = (u8)mlen;
|
||||
|
||||
ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
|
||||
memcpy(&ctx->nonce.c[1], nonce, 14 - L);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Then you pass additional authentication data, this is optional */
|
||||
void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
|
||||
const unsigned char *aad, size_t alen)
|
||||
{
|
||||
unsigned int i;
|
||||
block128_f block = ctx->block;
|
||||
|
||||
if (alen == 0)
|
||||
return;
|
||||
|
||||
ctx->nonce.c[0] |= 0x40; /* set Adata flag */
|
||||
(*block) (ctx->nonce.c, ctx->cmac.c, ctx->key), ctx->blocks++;
|
||||
|
||||
if (alen < (0x10000 - 0x100)) {
|
||||
ctx->cmac.c[0] ^= (u8)(alen >> 8);
|
||||
ctx->cmac.c[1] ^= (u8)alen;
|
||||
i = 2;
|
||||
} else if (sizeof(alen) == 8
|
||||
&& alen >= (size_t)1 << (32 % (sizeof(alen) * 8))) {
|
||||
ctx->cmac.c[0] ^= 0xFF;
|
||||
ctx->cmac.c[1] ^= 0xFF;
|
||||
ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen) * 8)));
|
||||
ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen) * 8)));
|
||||
ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen) * 8)));
|
||||
ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen) * 8)));
|
||||
ctx->cmac.c[6] ^= (u8)(alen >> 24);
|
||||
ctx->cmac.c[7] ^= (u8)(alen >> 16);
|
||||
ctx->cmac.c[8] ^= (u8)(alen >> 8);
|
||||
ctx->cmac.c[9] ^= (u8)alen;
|
||||
i = 10;
|
||||
} else {
|
||||
ctx->cmac.c[0] ^= 0xFF;
|
||||
ctx->cmac.c[1] ^= 0xFE;
|
||||
ctx->cmac.c[2] ^= (u8)(alen >> 24);
|
||||
ctx->cmac.c[3] ^= (u8)(alen >> 16);
|
||||
ctx->cmac.c[4] ^= (u8)(alen >> 8);
|
||||
ctx->cmac.c[5] ^= (u8)alen;
|
||||
i = 6;
|
||||
}
|
||||
|
||||
do {
|
||||
for (; i < 16 && alen; ++i, ++aad, --alen)
|
||||
ctx->cmac.c[i] ^= *aad;
|
||||
(*block) (ctx->cmac.c, ctx->cmac.c, ctx->key), ctx->blocks++;
|
||||
i = 0;
|
||||
} while (alen);
|
||||
}
|
||||
|
||||
/* Finally you encrypt or decrypt the message */
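
Taken together, the comments above describe the calling sequence: CRYPTO_ccm128_init once per key, then per packet CRYPTO_ccm128_setiv, optionally CRYPTO_ccm128_aad, CRYPTO_ccm128_encrypt (or _decrypt) and CRYPTO_ccm128_tag. A hedged sketch of sealing one packet follows; the ccm_seal name, the choice of M=12 tag bytes and L=2 length bytes (hence a 13-byte nonce and messages shorter than 2^16 bytes), and the legacy AES calls are illustrative assumptions, not part of this file.

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Encrypt and authenticate one packet; returns the tag length or 0 on error. */
static size_t ccm_seal(const unsigned char key16[16],
                       const unsigned char nonce13[13],
                       const unsigned char *aad, size_t alen,
                       const unsigned char *msg, size_t mlen,
                       unsigned char *out, unsigned char tag[12])
{
    AES_KEY ks;
    CCM128_CONTEXT ccm;

    AES_set_encrypt_key(key16, 128, &ks);

    /* M = tag bytes, L = length-field bytes, so the nonce is 15-L = 13 bytes
     * and mlen must fit in L*8 = 16 bits */
    CRYPTO_ccm128_init(&ccm, 12, 2, &ks, (block128_f)AES_encrypt);
    if (CRYPTO_ccm128_setiv(&ccm, nonce13, 13, mlen) != 0)
        return 0;
    if (alen)
        CRYPTO_ccm128_aad(&ccm, aad, alen);
    if (CRYPTO_ccm128_encrypt(&ccm, msg, out, mlen) != 0)
        return 0;
    return CRYPTO_ccm128_tag(&ccm, tag, 12);
}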
|
||||
|
||||
/*
|
||||
* counter part of nonce may not be larger than L*8 bits, L is not larger
|
||||
* than 8, therefore 64-bit counter...
|
||||
*/
|
||||
static void ctr64_inc(unsigned char *counter)
|
||||
{
|
||||
unsigned int n = 8;
|
||||
u8 c;
|
||||
|
||||
counter += 8;
|
||||
do {
|
||||
--n;
|
||||
c = counter[n];
|
||||
++c;
|
||||
counter[n] = c;
|
||||
if (c)
|
||||
return;
|
||||
} while (n);
|
||||
}
|
||||
|
||||
int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
|
||||
const unsigned char *inp, unsigned char *out,
|
||||
size_t len)
|
||||
{
|
||||
size_t n;
|
||||
unsigned int i, L;
|
||||
unsigned char flags0 = ctx->nonce.c[0];
|
||||
block128_f block = ctx->block;
|
||||
void *key = ctx->key;
|
||||
union {
|
||||
u64 u[2];
|
||||
u8 c[16];
|
||||
} scratch;
|
||||
|
||||
if (!(flags0 & 0x40))
|
||||
(*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
|
||||
|
||||
ctx->nonce.c[0] = L = flags0 & 7;
|
||||
for (n = 0, i = 15 - L; i < 15; ++i) {
|
||||
n |= ctx->nonce.c[i];
|
||||
ctx->nonce.c[i] = 0;
|
||||
n <<= 8;
|
||||
}
|
||||
n |= ctx->nonce.c[15]; /* reconstructed length */
|
||||
ctx->nonce.c[15] = 1;
|
||||
|
||||
if (n != len)
|
||||
return -1; /* length mismatch */
|
||||
|
||||
ctx->blocks += ((len + 15) >> 3) | 1;
|
||||
if (ctx->blocks > (U64(1) << 61))
|
||||
return -2; /* too much data */
|
||||
|
||||
while (len >= 16) {
|
||||
#if defined(STRICT_ALIGNMENT)
|
||||
union {
|
||||
u64 u[2];
|
||||
u8 c[16];
|
||||
} temp;
|
||||
|
||||
memcpy(temp.c, inp, 16);
|
||||
ctx->cmac.u[0] ^= temp.u[0];
|
||||
ctx->cmac.u[1] ^= temp.u[1];
|
||||
#else
|
||||
ctx->cmac.u[0] ^= ((u64 *)inp)[0];
|
||||
ctx->cmac.u[1] ^= ((u64 *)inp)[1];
|
||||
#endif
|
||||
(*block) (ctx->cmac.c, ctx->cmac.c, key);
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
ctr64_inc(ctx->nonce.c);
|
||||
#if defined(STRICT_ALIGNMENT)
|
||||
temp.u[0] ^= scratch.u[0];
|
||||
temp.u[1] ^= scratch.u[1];
|
||||
memcpy(out, temp.c, 16);
|
||||
#else
|
||||
((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0];
|
||||
((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1];
|
||||
#endif
|
||||
inp += 16;
|
||||
out += 16;
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
if (len) {
|
||||
for (i = 0; i < len; ++i)
|
||||
ctx->cmac.c[i] ^= inp[i];
|
||||
(*block) (ctx->cmac.c, ctx->cmac.c, key);
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
for (i = 0; i < len; ++i)
|
||||
out[i] = scratch.c[i] ^ inp[i];
|
||||
}
|
||||
|
||||
for (i = 15 - L; i < 16; ++i)
|
||||
ctx->nonce.c[i] = 0;
|
||||
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
ctx->cmac.u[0] ^= scratch.u[0];
|
||||
ctx->cmac.u[1] ^= scratch.u[1];
|
||||
|
||||
ctx->nonce.c[0] = flags0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
|
||||
const unsigned char *inp, unsigned char *out,
|
||||
size_t len)
|
||||
{
|
||||
size_t n;
|
||||
unsigned int i, L;
|
||||
unsigned char flags0 = ctx->nonce.c[0];
|
||||
block128_f block = ctx->block;
|
||||
void *key = ctx->key;
|
||||
union {
|
||||
u64 u[2];
|
||||
u8 c[16];
|
||||
} scratch;
|
||||
|
||||
if (!(flags0 & 0x40))
|
||||
(*block) (ctx->nonce.c, ctx->cmac.c, key);
|
||||
|
||||
ctx->nonce.c[0] = L = flags0 & 7;
|
||||
for (n = 0, i = 15 - L; i < 15; ++i) {
|
||||
n |= ctx->nonce.c[i];
|
||||
ctx->nonce.c[i] = 0;
|
||||
n <<= 8;
|
||||
}
|
||||
n |= ctx->nonce.c[15]; /* reconstructed length */
|
||||
ctx->nonce.c[15] = 1;
|
||||
|
||||
if (n != len)
|
||||
return -1;
|
||||
|
||||
while (len >= 16) {
|
||||
#if defined(STRICT_ALIGNMENT)
|
||||
union {
|
||||
u64 u[2];
|
||||
u8 c[16];
|
||||
} temp;
|
||||
#endif
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
ctr64_inc(ctx->nonce.c);
|
||||
#if defined(STRICT_ALIGNMENT)
|
||||
memcpy(temp.c, inp, 16);
|
||||
ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
|
||||
ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
|
||||
memcpy(out, scratch.c, 16);
|
||||
#else
|
||||
ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0]);
|
||||
ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1]);
|
||||
#endif
|
||||
(*block) (ctx->cmac.c, ctx->cmac.c, key);
|
||||
|
||||
inp += 16;
|
||||
out += 16;
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
if (len) {
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
for (i = 0; i < len; ++i)
|
||||
ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
|
||||
(*block) (ctx->cmac.c, ctx->cmac.c, key);
|
||||
}
|
||||
|
||||
for (i = 15 - L; i < 16; ++i)
|
||||
ctx->nonce.c[i] = 0;
|
||||
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
ctx->cmac.u[0] ^= scratch.u[0];
|
||||
ctx->cmac.u[1] ^= scratch.u[1];
|
||||
|
||||
ctx->nonce.c[0] = flags0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ctr64_add(unsigned char *counter, size_t inc)
|
||||
{
|
||||
size_t n = 8, val = 0;
|
||||
|
||||
counter += 8;
|
||||
do {
|
||||
--n;
|
||||
val += counter[n] + (inc & 0xff);
|
||||
counter[n] = (unsigned char)val;
|
||||
val >>= 8; /* carry bit */
|
||||
inc >>= 8;
|
||||
} while (n && (inc || val));
|
||||
}
|
||||
|
||||
int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
|
||||
const unsigned char *inp, unsigned char *out,
|
||||
size_t len, ccm128_f stream)
|
||||
{
|
||||
size_t n;
|
||||
unsigned int i, L;
|
||||
unsigned char flags0 = ctx->nonce.c[0];
|
||||
block128_f block = ctx->block;
|
||||
void *key = ctx->key;
|
||||
union {
|
||||
u64 u[2];
|
||||
u8 c[16];
|
||||
} scratch;
|
||||
|
||||
if (!(flags0 & 0x40))
|
||||
(*block) (ctx->nonce.c, ctx->cmac.c, key), ctx->blocks++;
|
||||
|
||||
ctx->nonce.c[0] = L = flags0 & 7;
|
||||
for (n = 0, i = 15 - L; i < 15; ++i) {
|
||||
n |= ctx->nonce.c[i];
|
||||
ctx->nonce.c[i] = 0;
|
||||
n <<= 8;
|
||||
}
|
||||
n |= ctx->nonce.c[15]; /* reconstructed length */
|
||||
ctx->nonce.c[15] = 1;
|
||||
|
||||
if (n != len)
|
||||
return -1; /* length mismatch */
|
||||
|
||||
ctx->blocks += ((len + 15) >> 3) | 1;
|
||||
if (ctx->blocks > (U64(1) << 61))
|
||||
return -2; /* too much data */
|
||||
|
||||
if ((n = len / 16)) {
|
||||
(*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
|
||||
n *= 16;
|
||||
inp += n;
|
||||
out += n;
|
||||
len -= n;
|
||||
if (len)
|
||||
ctr64_add(ctx->nonce.c, n / 16);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
for (i = 0; i < len; ++i)
|
||||
ctx->cmac.c[i] ^= inp[i];
|
||||
(*block) (ctx->cmac.c, ctx->cmac.c, key);
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
for (i = 0; i < len; ++i)
|
||||
out[i] = scratch.c[i] ^ inp[i];
|
||||
}
|
||||
|
||||
for (i = 15 - L; i < 16; ++i)
|
||||
ctx->nonce.c[i] = 0;
|
||||
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
ctx->cmac.u[0] ^= scratch.u[0];
|
||||
ctx->cmac.u[1] ^= scratch.u[1];
|
||||
|
||||
ctx->nonce.c[0] = flags0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
|
||||
const unsigned char *inp, unsigned char *out,
|
||||
size_t len, ccm128_f stream)
|
||||
{
|
||||
size_t n;
|
||||
unsigned int i, L;
|
||||
unsigned char flags0 = ctx->nonce.c[0];
|
||||
block128_f block = ctx->block;
|
||||
void *key = ctx->key;
|
||||
union {
|
||||
u64 u[2];
|
||||
u8 c[16];
|
||||
} scratch;
|
||||
|
||||
if (!(flags0 & 0x40))
|
||||
(*block) (ctx->nonce.c, ctx->cmac.c, key);
|
||||
|
||||
ctx->nonce.c[0] = L = flags0 & 7;
|
||||
for (n = 0, i = 15 - L; i < 15; ++i) {
|
||||
n |= ctx->nonce.c[i];
|
||||
ctx->nonce.c[i] = 0;
|
||||
n <<= 8;
|
||||
}
|
||||
n |= ctx->nonce.c[15]; /* reconstructed length */
|
||||
ctx->nonce.c[15] = 1;
|
||||
|
||||
if (n != len)
|
||||
return -1;
|
||||
|
||||
if ((n = len / 16)) {
|
||||
(*stream) (inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
|
||||
n *= 16;
|
||||
inp += n;
|
||||
out += n;
|
||||
len -= n;
|
||||
if (len)
|
||||
ctr64_add(ctx->nonce.c, n / 16);
|
||||
}
|
||||
|
||||
if (len) {
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
for (i = 0; i < len; ++i)
|
||||
ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
|
||||
(*block) (ctx->cmac.c, ctx->cmac.c, key);
|
||||
}
|
||||
|
||||
for (i = 15 - L; i < 16; ++i)
|
||||
ctx->nonce.c[i] = 0;
|
||||
|
||||
(*block) (ctx->nonce.c, scratch.c, key);
|
||||
ctx->cmac.u[0] ^= scratch.u[0];
|
||||
ctx->cmac.u[1] ^= scratch.u[1];
|
||||
|
||||
ctx->nonce.c[0] = flags0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
|
||||
{
|
||||
unsigned int M = (ctx->nonce.c[0] >> 3) & 7; /* the M parameter */
|
||||
|
||||
M *= 2;
|
||||
M += 2;
|
||||
if (len < M)
|
||||
return 0;
|
||||
memcpy(tag, ctx->cmac.c, M);
|
||||
return M;
|
||||
}
|
198
trunk/3rdparty/openssl-1.1-fit/crypto/modes/cfb128.c
vendored
Normal file
198
trunk/3rdparty/openssl-1.1-fit/crypto/modes/cfb128.c
vendored
Normal file
|
@@ -0,0 +1,198 @@
|
|||
/*
|
||||
* Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
#include <openssl/crypto.h>
|
||||
#include "modes_lcl.h"
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* The input and output encrypted as though 128bit cfb mode is being used.
|
||||
* The extra state information to record how much of the 128bit block we have
|
||||
* used is contained in *num;
|
||||
*/
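
Because *num records how far into the current 16-byte keystream block the previous call stopped, CFB calls need not be block aligned: successive calls with the same ivec and num continue one stream. A hedged two-call sketch with the legacy AES API follows (the cfb_demo name and the 5/20-byte split are arbitrary; note that CFB always runs the block cipher in the encrypt direction, even when enc is 0).

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Encrypt 5 bytes, then 20 more, as one continuous CFB128 stream. */
static void cfb_demo(const unsigned char key16[16], unsigned char iv[16],
                     const unsigned char in[25], unsigned char out[25])
{
    AES_KEY ks;
    int num = 0;                        /* offset inside the current block */

    AES_set_encrypt_key(key16, 128, &ks);
    CRYPTO_cfb128_encrypt(in, out, 5, &ks, iv, &num,
                          1 /* encrypt */, (block128_f)AES_encrypt);
    CRYPTO_cfb128_encrypt(in + 5, out + 5, 20, &ks, iv, &num,
                          1, (block128_f)AES_encrypt);
    /* iv and num now hold the state needed to continue the stream */
}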
|
||||
void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16], int *num,
|
||||
int enc, block128_f block)
|
||||
{
|
||||
unsigned int n;
|
||||
size_t l = 0;
|
||||
|
||||
n = *num;
|
||||
|
||||
if (enc) {
|
||||
#if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
if (16 % sizeof(size_t) == 0) { /* always true actually */
|
||||
do {
|
||||
while (n && len) {
|
||||
*(out++) = ivec[n] ^= *(in++);
|
||||
--len;
|
||||
n = (n + 1) % 16;
|
||||
}
|
||||
# if defined(STRICT_ALIGNMENT)
|
||||
if (((size_t)in | (size_t)out | (size_t)ivec) %
|
||||
sizeof(size_t) != 0)
|
||||
break;
|
||||
# endif
|
||||
while (len >= 16) {
|
||||
(*block) (ivec, ivec, key);
|
||||
for (; n < 16; n += sizeof(size_t)) {
|
||||
*(size_t *)(out + n) =
|
||||
*(size_t *)(ivec + n) ^= *(size_t *)(in + n);
|
||||
}
|
||||
len -= 16;
|
||||
out += 16;
|
||||
in += 16;
|
||||
n = 0;
|
||||
}
|
||||
if (len) {
|
||||
(*block) (ivec, ivec, key);
|
||||
while (len--) {
|
||||
out[n] = ivec[n] ^= in[n];
|
||||
++n;
|
||||
}
|
||||
}
|
||||
*num = n;
|
||||
return;
|
||||
} while (0);
|
||||
}
|
||||
/* the rest would be commonly eliminated by x86* compiler */
|
||||
#endif
|
||||
while (l < len) {
|
||||
if (n == 0) {
|
||||
(*block) (ivec, ivec, key);
|
||||
}
|
||||
out[l] = ivec[n] ^= in[l];
|
||||
++l;
|
||||
n = (n + 1) % 16;
|
||||
}
|
||||
*num = n;
|
||||
} else {
|
||||
#if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
if (16 % sizeof(size_t) == 0) { /* always true actually */
|
||||
do {
|
||||
while (n && len) {
|
||||
unsigned char c;
|
||||
*(out++) = ivec[n] ^ (c = *(in++));
|
||||
ivec[n] = c;
|
||||
--len;
|
||||
n = (n + 1) % 16;
|
||||
}
|
||||
# if defined(STRICT_ALIGNMENT)
|
||||
if (((size_t)in | (size_t)out | (size_t)ivec) %
|
||||
sizeof(size_t) != 0)
|
||||
break;
|
||||
# endif
|
||||
while (len >= 16) {
|
||||
(*block) (ivec, ivec, key);
|
||||
for (; n < 16; n += sizeof(size_t)) {
|
||||
size_t t = *(size_t *)(in + n);
|
||||
*(size_t *)(out + n) = *(size_t *)(ivec + n) ^ t;
|
||||
*(size_t *)(ivec + n) = t;
|
||||
}
|
||||
len -= 16;
|
||||
out += 16;
|
||||
in += 16;
|
||||
n = 0;
|
||||
}
|
||||
if (len) {
|
||||
(*block) (ivec, ivec, key);
|
||||
while (len--) {
|
||||
unsigned char c;
|
||||
out[n] = ivec[n] ^ (c = in[n]);
|
||||
ivec[n] = c;
|
||||
++n;
|
||||
}
|
||||
}
|
||||
*num = n;
|
||||
return;
|
||||
} while (0);
|
||||
}
|
||||
/* the rest would be commonly eliminated by x86* compiler */
|
||||
#endif
|
||||
while (l < len) {
|
||||
unsigned char c;
|
||||
if (n == 0) {
|
||||
(*block) (ivec, ivec, key);
|
||||
}
|
||||
out[l] = ivec[n] ^ (c = in[l]);
|
||||
ivec[n] = c;
|
||||
++l;
|
||||
n = (n + 1) % 16;
|
||||
}
|
||||
*num = n;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This expects a single block of size nbits for both in and out. Note that
|
||||
* it corrupts any extra bits in the last byte of out
|
||||
*/
|
||||
static void cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
|
||||
int nbits, const void *key,
|
||||
unsigned char ivec[16], int enc,
|
||||
block128_f block)
|
||||
{
|
||||
int n, rem, num;
|
||||
unsigned char ovec[16 * 2 + 1]; /* +1 because we dereference (but don't
|
||||
* use) one byte off the end */
|
||||
|
||||
if (nbits <= 0 || nbits > 128)
|
||||
return;
|
||||
|
||||
/* fill in the first half of the new IV with the current IV */
|
||||
memcpy(ovec, ivec, 16);
|
||||
/* construct the new IV */
|
||||
(*block) (ivec, ivec, key);
|
||||
num = (nbits + 7) / 8;
|
||||
if (enc) /* encrypt the input */
|
||||
for (n = 0; n < num; ++n)
|
||||
out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
|
||||
else /* decrypt the input */
|
||||
for (n = 0; n < num; ++n)
|
||||
out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
|
||||
/* shift ovec left... */
|
||||
rem = nbits % 8;
|
||||
num = nbits / 8;
|
||||
if (rem == 0)
|
||||
memcpy(ivec, ovec + num, 16);
|
||||
else
|
||||
for (n = 0; n < 16; ++n)
|
||||
ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem);
|
||||
|
||||
/* it is not necessary to cleanse ovec, since the IV is not secret */
|
||||
}
|
||||
|
||||
/* N.B. This expects the input to be packed, MS bit first */
|
||||
void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t bits, const void *key,
|
||||
unsigned char ivec[16], int *num,
|
||||
int enc, block128_f block)
|
||||
{
|
||||
size_t n;
|
||||
unsigned char c[1], d[1];
|
||||
|
||||
for (n = 0; n < bits; ++n) {
|
||||
c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 0x80 : 0;
|
||||
cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
|
||||
out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) |
|
||||
((d[0] & 0x80) >> (unsigned int)(n % 8));
|
||||
}
|
||||
}
|
||||
|
||||
void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t length, const void *key,
|
||||
unsigned char ivec[16], int *num,
|
||||
int enc, block128_f block)
|
||||
{
|
||||
size_t n;
|
||||
|
||||
for (n = 0; n < length; ++n)
|
||||
cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
|
||||
}
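A minimal usage sketch for CRYPTO_cfb128_encrypt above, illustrative only and not part of the vendored file: the cfb128_demo helper and its parameters are hypothetical, while AES_set_encrypt_key and AES_encrypt are OpenSSL's own AES primitives used as the block128_f. The IV buffer is updated in place and num must start at 0 for a fresh IV; CFB runs the block cipher in the encrypt direction for both encryption and decryption, only the enc flag differs.

#include <openssl/aes.h>
#include <openssl/modes.h>
#include <string.h>

/* Hypothetical demo helper, not part of the vendored sources. */
static void cfb128_demo(const unsigned char key16[16], const unsigned char iv16[16],
                        const unsigned char *pt, unsigned char *ct, size_t len)
{
    AES_KEY ks;
    unsigned char iv[16];
    int num = 0;                        /* offset into the current keystream block */

    AES_set_encrypt_key(key16, 128, &ks);
    memcpy(iv, iv16, 16);               /* ivec is modified, so work on a copy */
    CRYPTO_cfb128_encrypt(pt, ct, len, &ks, iv, &num, 1 /* enc */,
                          (block128_f)AES_encrypt);
}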
209
trunk/3rdparty/openssl-1.1-fit/crypto/modes/ctr128.c
vendored
Normal file
@@ -0,0 +1,209 @@
/*
|
||||
* Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
#include <openssl/crypto.h>
|
||||
#include "modes_lcl.h"
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* NOTE: the IV/counter CTR mode is big-endian. The code itself is
|
||||
* endian-neutral.
|
||||
*/
|
||||
|
||||
/* increment counter (128-bit int) by 1 */
|
||||
static void ctr128_inc(unsigned char *counter)
|
||||
{
|
||||
u32 n = 16, c = 1;
|
||||
|
||||
do {
|
||||
--n;
|
||||
c += counter[n];
|
||||
counter[n] = (u8)c;
|
||||
c >>= 8;
|
||||
} while (n);
|
||||
}
|
||||
|
||||
#if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
static void ctr128_inc_aligned(unsigned char *counter)
|
||||
{
|
||||
size_t *data, c, d, n;
|
||||
const union {
|
||||
long one;
|
||||
char little;
|
||||
} is_endian = {
|
||||
1
|
||||
};
|
||||
|
||||
if (is_endian.little || ((size_t)counter % sizeof(size_t)) != 0) {
|
||||
ctr128_inc(counter);
|
||||
return;
|
||||
}
|
||||
|
||||
data = (size_t *)counter;
|
||||
c = 1;
|
||||
n = 16 / sizeof(size_t);
|
||||
do {
|
||||
--n;
|
||||
d = data[n] += c;
|
||||
/* did addition carry? */
|
||||
c = ((d - c) & ~d) >> (sizeof(size_t) * 8 - 1);
|
||||
} while (n);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The input encrypted as though 128bit counter mode is being used. The
|
||||
* extra state information to record how much of the 128bit block we have
|
||||
* used is contained in *num, and the encrypted counter is kept in
|
||||
* ecount_buf. Both *num and ecount_buf must be initialised with zeros
|
||||
* before the first call to CRYPTO_ctr128_encrypt(). This algorithm assumes
|
||||
* that the counter is in the x lower bits of the IV (ivec), and that the
|
||||
* application has full control over overflow and the rest of the IV. This
|
||||
* implementation takes NO responsibility for checking that the counter
|
||||
* doesn't overflow into the rest of the IV when incremented.
|
||||
*/
|
||||
void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16],
|
||||
unsigned char ecount_buf[16], unsigned int *num,
|
||||
block128_f block)
|
||||
{
|
||||
unsigned int n;
|
||||
size_t l = 0;
|
||||
|
||||
n = *num;
|
||||
|
||||
#if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
if (16 % sizeof(size_t) == 0) { /* always true actually */
|
||||
do {
|
||||
while (n && len) {
|
||||
*(out++) = *(in++) ^ ecount_buf[n];
|
||||
--len;
|
||||
n = (n + 1) % 16;
|
||||
}
|
||||
|
||||
# if defined(STRICT_ALIGNMENT)
|
||||
if (((size_t)in | (size_t)out | (size_t)ecount_buf)
|
||||
% sizeof(size_t) != 0)
|
||||
break;
|
||||
# endif
|
||||
while (len >= 16) {
|
||||
(*block) (ivec, ecount_buf, key);
|
||||
ctr128_inc_aligned(ivec);
|
||||
for (n = 0; n < 16; n += sizeof(size_t))
|
||||
*(size_t *)(out + n) =
|
||||
*(size_t *)(in + n) ^ *(size_t *)(ecount_buf + n);
|
||||
len -= 16;
|
||||
out += 16;
|
||||
in += 16;
|
||||
n = 0;
|
||||
}
|
||||
if (len) {
|
||||
(*block) (ivec, ecount_buf, key);
|
||||
ctr128_inc_aligned(ivec);
|
||||
while (len--) {
|
||||
out[n] = in[n] ^ ecount_buf[n];
|
||||
++n;
|
||||
}
|
||||
}
|
||||
*num = n;
|
||||
return;
|
||||
} while (0);
|
||||
}
|
||||
/* the rest would be commonly eliminated by x86* compiler */
|
||||
#endif
|
||||
while (l < len) {
|
||||
if (n == 0) {
|
||||
(*block) (ivec, ecount_buf, key);
|
||||
ctr128_inc(ivec);
|
||||
}
|
||||
out[l] = in[l] ^ ecount_buf[n];
|
||||
++l;
|
||||
n = (n + 1) % 16;
|
||||
}
|
||||
|
||||
*num = n;
|
||||
}
|
||||
|
||||
/* increment upper 96 bits of 128-bit counter by 1 */
|
||||
static void ctr96_inc(unsigned char *counter)
|
||||
{
|
||||
u32 n = 12, c = 1;
|
||||
|
||||
do {
|
||||
--n;
|
||||
c += counter[n];
|
||||
counter[n] = (u8)c;
|
||||
c >>= 8;
|
||||
} while (n);
|
||||
}
|
||||
|
||||
void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16],
|
||||
unsigned char ecount_buf[16],
|
||||
unsigned int *num, ctr128_f func)
|
||||
{
|
||||
unsigned int n, ctr32;
|
||||
|
||||
n = *num;
|
||||
|
||||
while (n && len) {
|
||||
*(out++) = *(in++) ^ ecount_buf[n];
|
||||
--len;
|
||||
n = (n + 1) % 16;
|
||||
}
|
||||
|
||||
ctr32 = GETU32(ivec + 12);
|
||||
while (len >= 16) {
|
||||
size_t blocks = len / 16;
|
||||
/*
|
||||
* 1<<28 is just a not-so-small yet not-so-large number...
|
||||
* Below condition is practically never met, but it has to
|
||||
* be checked for code correctness.
|
||||
*/
|
||||
if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28))
|
||||
blocks = (1U << 28);
|
||||
/*
|
||||
* As (*func) operates on 32-bit counter, caller
|
||||
* has to handle overflow. 'if' below detects the
|
||||
* overflow, which is then handled by limiting the
|
||||
* amount of blocks to the exact overflow point...
|
||||
*/
|
||||
ctr32 += (u32)blocks;
|
||||
if (ctr32 < blocks) {
|
||||
blocks -= ctr32;
|
||||
ctr32 = 0;
|
||||
}
|
||||
(*func) (in, out, blocks, key, ivec);
|
||||
/* (*ctr) does not update ivec, caller does: */
|
||||
PUTU32(ivec + 12, ctr32);
|
||||
/* ... overflow was detected, propagate carry. */
|
||||
if (ctr32 == 0)
|
||||
ctr96_inc(ivec);
|
||||
blocks *= 16;
|
||||
len -= blocks;
|
||||
out += blocks;
|
||||
in += blocks;
|
||||
}
|
||||
if (len) {
|
||||
memset(ecount_buf, 0, 16);
|
||||
(*func) (ecount_buf, ecount_buf, 1, key, ivec);
|
||||
++ctr32;
|
||||
PUTU32(ivec + 12, ctr32);
|
||||
if (ctr32 == 0)
|
||||
ctr96_inc(ivec);
|
||||
while (len--) {
|
||||
out[n] = in[n] ^ ecount_buf[n];
|
||||
++n;
|
||||
}
|
||||
}
|
||||
|
||||
*num = n;
|
||||
}
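A matching sketch for CRYPTO_ctr128_encrypt, illustrative only and not part of the vendored file: ctr128_demo is a hypothetical helper. As the comment above requires, num and ecount_buf start zeroed, the IV/counter travels in ivec, and the same call both encrypts and decrypts.

#include <openssl/aes.h>
#include <openssl/modes.h>
#include <string.h>

/* Hypothetical demo helper, not part of the vendored sources. */
static void ctr128_demo(const unsigned char key16[16], const unsigned char iv16[16],
                        const unsigned char *in, unsigned char *out, size_t len)
{
    AES_KEY ks;
    unsigned char ivec[16], ecount[16] = { 0 };   /* keystream cache, must start zeroed */
    unsigned int num = 0;                         /* must start at 0 */

    AES_set_encrypt_key(key16, 128, &ks);
    memcpy(ivec, iv16, 16);                       /* IV/counter is carried in ivec */
    CRYPTO_ctr128_encrypt(in, out, len, &ks, ivec, ecount, &num,
                          (block128_f)AES_encrypt);
}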
330
trunk/3rdparty/openssl-1.1-fit/crypto/modes/cts128.c
vendored
Normal file
@@ -0,0 +1,330 @@
/*
|
||||
* Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
#include <openssl/crypto.h>
|
||||
#include "modes_lcl.h"
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Trouble with Ciphertext Stealing, CTS, mode is that there is no
|
||||
* common official specification, but couple of cipher/application
|
||||
* specific ones: RFC2040 and RFC3962. Then there is 'Proposal to
|
||||
* Extend CBC Mode By "Ciphertext Stealing"' at NIST site, which
|
||||
* deviates from mentioned RFCs. Most notably it allows input to be
|
||||
* of block length and it doesn't flip the order of the last two
|
||||
* blocks. CTS is being discussed even in ECB context, but it's not
|
||||
* adopted for any known application. This implementation provides
|
||||
* two interfaces: one compliant with above mentioned RFCs and one
|
||||
* compliant with the NIST proposal, both extending CBC mode.
|
||||
*/
|
||||
|
||||
size_t CRYPTO_cts128_encrypt_block(const unsigned char *in,
|
||||
unsigned char *out, size_t len,
|
||||
const void *key, unsigned char ivec[16],
|
||||
block128_f block)
|
||||
{
|
||||
size_t residue, n;
|
||||
|
||||
if (len <= 16)
|
||||
return 0;
|
||||
|
||||
if ((residue = len % 16) == 0)
|
||||
residue = 16;
|
||||
|
||||
len -= residue;
|
||||
|
||||
CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
|
||||
|
||||
in += len;
|
||||
out += len;
|
||||
|
||||
for (n = 0; n < residue; ++n)
|
||||
ivec[n] ^= in[n];
|
||||
(*block) (ivec, ivec, key);
|
||||
memcpy(out, out - 16, residue);
|
||||
memcpy(out - 16, ivec, 16);
|
||||
|
||||
return len + residue;
|
||||
}
|
||||
|
||||
size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in,
|
||||
unsigned char *out, size_t len,
|
||||
const void *key,
|
||||
unsigned char ivec[16],
|
||||
block128_f block)
|
||||
{
|
||||
size_t residue, n;
|
||||
|
||||
if (len < 16)
|
||||
return 0;
|
||||
|
||||
residue = len % 16;
|
||||
|
||||
len -= residue;
|
||||
|
||||
CRYPTO_cbc128_encrypt(in, out, len, key, ivec, block);
|
||||
|
||||
if (residue == 0)
|
||||
return len;
|
||||
|
||||
in += len;
|
||||
out += len;
|
||||
|
||||
for (n = 0; n < residue; ++n)
|
||||
ivec[n] ^= in[n];
|
||||
(*block) (ivec, ivec, key);
|
||||
memcpy(out - 16 + residue, ivec, 16);
|
||||
|
||||
return len + residue;
|
||||
}
|
||||
|
||||
size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16], cbc128_f cbc)
|
||||
{
|
||||
size_t residue;
|
||||
union {
|
||||
size_t align;
|
||||
unsigned char c[16];
|
||||
} tmp;
|
||||
|
||||
if (len <= 16)
|
||||
return 0;
|
||||
|
||||
if ((residue = len % 16) == 0)
|
||||
residue = 16;
|
||||
|
||||
len -= residue;
|
||||
|
||||
(*cbc) (in, out, len, key, ivec, 1);
|
||||
|
||||
in += len;
|
||||
out += len;
|
||||
|
||||
#if defined(CBC_HANDLES_TRUNCATED_IO)
|
||||
memcpy(tmp.c, out - 16, 16);
|
||||
(*cbc) (in, out - 16, residue, key, ivec, 1);
|
||||
memcpy(out, tmp.c, residue);
|
||||
#else
|
||||
memset(tmp.c, 0, sizeof(tmp));
|
||||
memcpy(tmp.c, in, residue);
|
||||
memcpy(out, out - 16, residue);
|
||||
(*cbc) (tmp.c, out - 16, 16, key, ivec, 1);
|
||||
#endif
|
||||
return len + residue;
|
||||
}
|
||||
|
||||
size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16], cbc128_f cbc)
|
||||
{
|
||||
size_t residue;
|
||||
union {
|
||||
size_t align;
|
||||
unsigned char c[16];
|
||||
} tmp;
|
||||
|
||||
if (len < 16)
|
||||
return 0;
|
||||
|
||||
residue = len % 16;
|
||||
|
||||
len -= residue;
|
||||
|
||||
(*cbc) (in, out, len, key, ivec, 1);
|
||||
|
||||
if (residue == 0)
|
||||
return len;
|
||||
|
||||
in += len;
|
||||
out += len;
|
||||
|
||||
#if defined(CBC_HANDLES_TRUNCATED_IO)
|
||||
(*cbc) (in, out - 16 + residue, residue, key, ivec, 1);
|
||||
#else
|
||||
memset(tmp.c, 0, sizeof(tmp));
|
||||
memcpy(tmp.c, in, residue);
|
||||
(*cbc) (tmp.c, out - 16 + residue, 16, key, ivec, 1);
|
||||
#endif
|
||||
return len + residue;
|
||||
}
|
||||
|
||||
size_t CRYPTO_cts128_decrypt_block(const unsigned char *in,
|
||||
unsigned char *out, size_t len,
|
||||
const void *key, unsigned char ivec[16],
|
||||
block128_f block)
|
||||
{
|
||||
size_t residue, n;
|
||||
union {
|
||||
size_t align;
|
||||
unsigned char c[32];
|
||||
} tmp;
|
||||
|
||||
if (len <= 16)
|
||||
return 0;
|
||||
|
||||
if ((residue = len % 16) == 0)
|
||||
residue = 16;
|
||||
|
||||
len -= 16 + residue;
|
||||
|
||||
if (len) {
|
||||
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
|
||||
in += len;
|
||||
out += len;
|
||||
}
|
||||
|
||||
(*block) (in, tmp.c + 16, key);
|
||||
|
||||
memcpy(tmp.c, tmp.c + 16, 16);
|
||||
memcpy(tmp.c, in + 16, residue);
|
||||
(*block) (tmp.c, tmp.c, key);
|
||||
|
||||
for (n = 0; n < 16; ++n) {
|
||||
unsigned char c = in[n];
|
||||
out[n] = tmp.c[n] ^ ivec[n];
|
||||
ivec[n] = c;
|
||||
}
|
||||
for (residue += 16; n < residue; ++n)
|
||||
out[n] = tmp.c[n] ^ in[n];
|
||||
|
||||
return 16 + len + residue;
|
||||
}
|
||||
|
||||
size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in,
|
||||
unsigned char *out, size_t len,
|
||||
const void *key,
|
||||
unsigned char ivec[16],
|
||||
block128_f block)
|
||||
{
|
||||
size_t residue, n;
|
||||
union {
|
||||
size_t align;
|
||||
unsigned char c[32];
|
||||
} tmp;
|
||||
|
||||
if (len < 16)
|
||||
return 0;
|
||||
|
||||
residue = len % 16;
|
||||
|
||||
if (residue == 0) {
|
||||
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
|
||||
return len;
|
||||
}
|
||||
|
||||
len -= 16 + residue;
|
||||
|
||||
if (len) {
|
||||
CRYPTO_cbc128_decrypt(in, out, len, key, ivec, block);
|
||||
in += len;
|
||||
out += len;
|
||||
}
|
||||
|
||||
(*block) (in + residue, tmp.c + 16, key);
|
||||
|
||||
memcpy(tmp.c, tmp.c + 16, 16);
|
||||
memcpy(tmp.c, in, residue);
|
||||
(*block) (tmp.c, tmp.c, key);
|
||||
|
||||
for (n = 0; n < 16; ++n) {
|
||||
unsigned char c = in[n];
|
||||
out[n] = tmp.c[n] ^ ivec[n];
|
||||
ivec[n] = in[n + residue];
|
||||
tmp.c[n] = c;
|
||||
}
|
||||
for (residue += 16; n < residue; ++n)
|
||||
out[n] = tmp.c[n] ^ tmp.c[n - 16];
|
||||
|
||||
return 16 + len + residue;
|
||||
}
|
||||
|
||||
size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16], cbc128_f cbc)
|
||||
{
|
||||
size_t residue;
|
||||
union {
|
||||
size_t align;
|
||||
unsigned char c[32];
|
||||
} tmp;
|
||||
|
||||
if (len <= 16)
|
||||
return 0;
|
||||
|
||||
if ((residue = len % 16) == 0)
|
||||
residue = 16;
|
||||
|
||||
len -= 16 + residue;
|
||||
|
||||
if (len) {
|
||||
(*cbc) (in, out, len, key, ivec, 0);
|
||||
in += len;
|
||||
out += len;
|
||||
}
|
||||
|
||||
memset(tmp.c, 0, sizeof(tmp));
|
||||
/*
|
||||
* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
|
||||
*/
|
||||
(*cbc) (in, tmp.c, 16, key, tmp.c + 16, 0);
|
||||
|
||||
memcpy(tmp.c, in + 16, residue);
|
||||
#if defined(CBC_HANDLES_TRUNCATED_IO)
|
||||
(*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
|
||||
#else
|
||||
(*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
|
||||
memcpy(out, tmp.c, 16 + residue);
|
||||
#endif
|
||||
return 16 + len + residue;
|
||||
}
|
||||
|
||||
size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
|
||||
size_t len, const void *key,
|
||||
unsigned char ivec[16], cbc128_f cbc)
|
||||
{
|
||||
size_t residue;
|
||||
union {
|
||||
size_t align;
|
||||
unsigned char c[32];
|
||||
} tmp;
|
||||
|
||||
if (len < 16)
|
||||
return 0;
|
||||
|
||||
residue = len % 16;
|
||||
|
||||
if (residue == 0) {
|
||||
(*cbc) (in, out, len, key, ivec, 0);
|
||||
return len;
|
||||
}
|
||||
|
||||
len -= 16 + residue;
|
||||
|
||||
if (len) {
|
||||
(*cbc) (in, out, len, key, ivec, 0);
|
||||
in += len;
|
||||
out += len;
|
||||
}
|
||||
|
||||
memset(tmp.c, 0, sizeof(tmp));
|
||||
/*
|
||||
* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0]
|
||||
*/
|
||||
(*cbc) (in + residue, tmp.c, 16, key, tmp.c + 16, 0);
|
||||
|
||||
memcpy(tmp.c, in, residue);
|
||||
#if defined(CBC_HANDLES_TRUNCATED_IO)
|
||||
(*cbc) (tmp.c, out, 16 + residue, key, ivec, 0);
|
||||
#else
|
||||
(*cbc) (tmp.c, tmp.c, 32, key, ivec, 0);
|
||||
memcpy(out, tmp.c, 16 + residue);
|
||||
#endif
|
||||
return 16 + len + residue;
|
||||
}
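A usage sketch for the RFC-style interface above, illustrative only and not part of the vendored file: cts128_demo is a hypothetical helper that layers CRYPTO_cts128_encrypt on OpenSSL's AES_cbc_encrypt as the cbc128_f. Input must be longer than one block and the ciphertext has exactly the input length; per the comment at the top of this file, the NIST variants accept exact block multiples and do not flip the order of the last two blocks.

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Hypothetical demo helper, not part of the vendored sources. */
static size_t cts128_demo(const unsigned char key16[16], unsigned char iv[16],
                          const unsigned char *pt, unsigned char *ct, size_t len)
{
    AES_KEY ks;

    AES_set_encrypt_key(key16, 128, &ks);
    /* len must be > 16; returns 0 on error, otherwise the bytes written (== len) */
    return CRYPTO_cts128_encrypt(pt, ct, len, &ks, iv,
                                 (cbc128_f)AES_cbc_encrypt);
}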
1888
trunk/3rdparty/openssl-1.1-fit/crypto/modes/gcm128.c
vendored
Normal file
File diff suppressed because it is too large
190
trunk/3rdparty/openssl-1.1-fit/crypto/modes/modes_lcl.h
vendored
Normal file
@@ -0,0 +1,190 @@
/*
|
||||
* Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
#include <openssl/modes.h>
|
||||
|
||||
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
|
||||
typedef __int64 i64;
|
||||
typedef unsigned __int64 u64;
|
||||
# define U64(C) C##UI64
|
||||
#elif defined(__arch64__)
|
||||
typedef long i64;
|
||||
typedef unsigned long u64;
|
||||
# define U64(C) C##UL
|
||||
#else
|
||||
typedef long long i64;
|
||||
typedef unsigned long long u64;
|
||||
# define U64(C) C##ULL
|
||||
#endif
|
||||
|
||||
typedef unsigned int u32;
|
||||
typedef unsigned char u8;
|
||||
|
||||
#define STRICT_ALIGNMENT 1
|
||||
#ifndef PEDANTIC
|
||||
# if defined(__i386) || defined(__i386__) || \
|
||||
defined(__x86_64) || defined(__x86_64__) || \
|
||||
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
|
||||
defined(__aarch64__) || \
|
||||
defined(__s390__) || defined(__s390x__)
|
||||
# undef STRICT_ALIGNMENT
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
|
||||
# if defined(__GNUC__) && __GNUC__>=2
|
||||
# if defined(__x86_64) || defined(__x86_64__)
|
||||
# define BSWAP8(x) ({ u64 ret_=(x); \
|
||||
asm ("bswapq %0" \
|
||||
: "+r"(ret_)); ret_; })
|
||||
# define BSWAP4(x) ({ u32 ret_=(x); \
|
||||
asm ("bswapl %0" \
|
||||
: "+r"(ret_)); ret_; })
|
||||
# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
|
||||
# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \
|
||||
asm ("bswapl %0; bswapl %1" \
|
||||
: "+r"(hi_),"+r"(lo_)); \
|
||||
(u64)hi_<<32|lo_; })
|
||||
# define BSWAP4(x) ({ u32 ret_=(x); \
|
||||
asm ("bswapl %0" \
|
||||
: "+r"(ret_)); ret_; })
|
||||
# elif defined(__aarch64__)
|
||||
# define BSWAP8(x) ({ u64 ret_; \
|
||||
asm ("rev %0,%1" \
|
||||
: "=r"(ret_) : "r"(x)); ret_; })
|
||||
# define BSWAP4(x) ({ u32 ret_; \
|
||||
asm ("rev %w0,%w1" \
|
||||
: "=r"(ret_) : "r"(x)); ret_; })
|
||||
# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
|
||||
# define BSWAP8(x) ({ u32 lo_=(u64)(x)>>32,hi_=(x); \
|
||||
asm ("rev %0,%0; rev %1,%1" \
|
||||
: "+r"(hi_),"+r"(lo_)); \
|
||||
(u64)hi_<<32|lo_; })
|
||||
# define BSWAP4(x) ({ u32 ret_; \
|
||||
asm ("rev %0,%1" \
|
||||
: "=r"(ret_) : "r"((u32)(x))); \
|
||||
ret_; })
|
||||
# endif
|
||||
# elif defined(_MSC_VER)
|
||||
# if _MSC_VER>=1300
|
||||
# include <stdlib.h>
|
||||
# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
|
||||
# define BSWAP8(x) _byteswap_uint64((u64)(x))
|
||||
# define BSWAP4(x) _byteswap_ulong((u32)(x))
|
||||
# elif defined(_M_IX86)
|
||||
__inline u32 _bswap4(u32 val)
|
||||
{
|
||||
_asm mov eax, val _asm bswap eax}
|
||||
# define BSWAP4(x) _bswap4(x)
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
|
||||
# define GETU32(p) BSWAP4(*(const u32 *)(p))
|
||||
# define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
|
||||
#else
|
||||
# define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
|
||||
# define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
|
||||
#endif
|
||||
/*- GCM definitions */ typedef struct {
|
||||
u64 hi, lo;
|
||||
} u128;
|
||||
|
||||
#ifdef TABLE_BITS
|
||||
# undef TABLE_BITS
|
||||
#endif
|
||||
/*
|
||||
* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
|
||||
* never be set to 8 [or 1]. For further information see gcm128.c.
|
||||
*/
|
||||
#define TABLE_BITS 4
|
||||
|
||||
struct gcm128_context {
|
||||
/* Following 6 names follow names in GCM specification */
|
||||
union {
|
||||
u64 u[2];
|
||||
u32 d[4];
|
||||
u8 c[16];
|
||||
size_t t[16 / sizeof(size_t)];
|
||||
} Yi, EKi, EK0, len, Xi, H;
|
||||
/*
|
||||
* Relative position of Xi, H and pre-computed Htable is used in some
|
||||
* assembler modules, i.e. don't change the order!
|
||||
*/
|
||||
#if TABLE_BITS==8
|
||||
u128 Htable[256];
|
||||
#else
|
||||
u128 Htable[16];
|
||||
void (*gmult) (u64 Xi[2], const u128 Htable[16]);
|
||||
void (*ghash) (u64 Xi[2], const u128 Htable[16], const u8 *inp,
|
||||
size_t len);
|
||||
#endif
|
||||
unsigned int mres, ares;
|
||||
block128_f block;
|
||||
void *key;
|
||||
#if !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
unsigned char Xn[48];
|
||||
#endif
|
||||
};
|
||||
|
||||
struct xts128_context {
|
||||
void *key1, *key2;
|
||||
block128_f block1, block2;
|
||||
};
|
||||
|
||||
struct ccm128_context {
|
||||
union {
|
||||
u64 u[2];
|
||||
u8 c[16];
|
||||
} nonce, cmac;
|
||||
u64 blocks;
|
||||
block128_f block;
|
||||
void *key;
|
||||
};
|
||||
|
||||
#ifndef OPENSSL_NO_OCB
|
||||
|
||||
typedef union {
|
||||
u64 a[2];
|
||||
unsigned char c[16];
|
||||
} OCB_BLOCK;
|
||||
# define ocb_block16_xor(in1,in2,out) \
|
||||
( (out)->a[0]=(in1)->a[0]^(in2)->a[0], \
|
||||
(out)->a[1]=(in1)->a[1]^(in2)->a[1] )
|
||||
# if STRICT_ALIGNMENT
|
||||
# define ocb_block16_xor_misaligned(in1,in2,out) \
|
||||
ocb_block_xor((in1)->c,(in2)->c,16,(out)->c)
|
||||
# else
|
||||
# define ocb_block16_xor_misaligned ocb_block16_xor
|
||||
# endif
|
||||
|
||||
struct ocb128_context {
|
||||
/* Need both encrypt and decrypt key schedules for decryption */
|
||||
block128_f encrypt;
|
||||
block128_f decrypt;
|
||||
void *keyenc;
|
||||
void *keydec;
|
||||
ocb128_f stream; /* direction dependent */
|
||||
/* Key dependent variables. Can be reused if key remains the same */
|
||||
size_t l_index;
|
||||
size_t max_l_index;
|
||||
OCB_BLOCK l_star;
|
||||
OCB_BLOCK l_dollar;
|
||||
OCB_BLOCK *l;
|
||||
/* Must be reset for each session */
|
||||
struct {
|
||||
u64 blocks_hashed;
|
||||
u64 blocks_processed;
|
||||
OCB_BLOCK offset_aad;
|
||||
OCB_BLOCK sum;
|
||||
OCB_BLOCK offset;
|
||||
OCB_BLOCK checksum;
|
||||
} sess;
|
||||
};
|
||||
#endif /* OPENSSL_NO_OCB */
562
trunk/3rdparty/openssl-1.1-fit/crypto/modes/ocb128.c
vendored
Normal file
@@ -0,0 +1,562 @@
/*
|
||||
* Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <openssl/crypto.h>
|
||||
#include <openssl/err.h>
|
||||
#include "modes_lcl.h"
|
||||
|
||||
#ifndef OPENSSL_NO_OCB
|
||||
|
||||
/*
|
||||
* Calculate the number of binary trailing zero's in any given number
|
||||
*/
|
||||
static u32 ocb_ntz(u64 n)
|
||||
{
|
||||
u32 cnt = 0;
|
||||
|
||||
/*
|
||||
* We do a right-to-left simple sequential search. This is surprisingly
|
||||
* efficient as the distribution of trailing zeros is not uniform,
|
||||
* e.g. the number of possible inputs with no trailing zeros is equal to
|
||||
* the number with 1 or more; the number with exactly 1 is equal to the
|
||||
* number with 2 or more, etc. Checking the last two bits covers 75% of
|
||||
* all numbers. Checking the last three covers 87.5%
|
||||
*/
|
||||
while (!(n & 1)) {
|
||||
n >>= 1;
|
||||
cnt++;
|
||||
}
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Shift a block of 16 bytes left by shift bits
|
||||
*/
|
||||
static void ocb_block_lshift(const unsigned char *in, size_t shift,
|
||||
unsigned char *out)
|
||||
{
|
||||
int i;
|
||||
unsigned char carry = 0, carry_next;
|
||||
|
||||
for (i = 15; i >= 0; i--) {
|
||||
carry_next = in[i] >> (8 - shift);
|
||||
out[i] = (in[i] << shift) | carry;
|
||||
carry = carry_next;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform a "double" operation as per OCB spec
|
||||
*/
|
||||
static void ocb_double(OCB_BLOCK *in, OCB_BLOCK *out)
|
||||
{
|
||||
unsigned char mask;
|
||||
|
||||
/*
|
||||
* Calculate the mask based on the most significant bit. There are more
|
||||
* efficient ways to do this - but this way is constant time
|
||||
*/
|
||||
mask = in->c[0] & 0x80;
|
||||
mask >>= 7;
|
||||
mask = (0 - mask) & 0x87;
|
||||
|
||||
ocb_block_lshift(in->c, 1, out->c);
|
||||
|
||||
out->c[15] ^= mask;
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform an xor on in1 and in2 - each of len bytes. Store result in out
|
||||
*/
|
||||
static void ocb_block_xor(const unsigned char *in1,
|
||||
const unsigned char *in2, size_t len,
|
||||
unsigned char *out)
|
||||
{
|
||||
size_t i;
|
||||
for (i = 0; i < len; i++) {
|
||||
out[i] = in1[i] ^ in2[i];
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup L_index in our lookup table. If we haven't already got it we need to
|
||||
* calculate it
|
||||
*/
|
||||
static OCB_BLOCK *ocb_lookup_l(OCB128_CONTEXT *ctx, size_t idx)
|
||||
{
|
||||
size_t l_index = ctx->l_index;
|
||||
|
||||
if (idx <= l_index) {
|
||||
return ctx->l + idx;
|
||||
}
|
||||
|
||||
/* We don't have it - so calculate it */
|
||||
if (idx >= ctx->max_l_index) {
|
||||
void *tmp_ptr;
|
||||
/*
|
||||
* Each additional entry allows to process almost double as
|
||||
* much data, so that in linear world the table will need to
|
||||
* be expanded with smaller and smaller increments. Originally
|
||||
* it was doubling in size, which was a waste. Growing it
|
||||
* linearly is not formally optimal, but is simpler to implement.
|
||||
* We grow table by minimally required 4*n that would accommodate
|
||||
* the index.
|
||||
*/
|
||||
ctx->max_l_index += (idx - ctx->max_l_index + 4) & ~3;
|
||||
tmp_ptr = OPENSSL_realloc(ctx->l, ctx->max_l_index * sizeof(OCB_BLOCK));
|
||||
if (tmp_ptr == NULL) /* prevent ctx->l from being clobbered */
|
||||
return NULL;
|
||||
ctx->l = tmp_ptr;
|
||||
}
|
||||
while (l_index < idx) {
|
||||
ocb_double(ctx->l + l_index, ctx->l + l_index + 1);
|
||||
l_index++;
|
||||
}
|
||||
ctx->l_index = l_index;
|
||||
|
||||
return ctx->l + idx;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a new OCB128_CONTEXT
|
||||
*/
|
||||
OCB128_CONTEXT *CRYPTO_ocb128_new(void *keyenc, void *keydec,
|
||||
block128_f encrypt, block128_f decrypt,
|
||||
ocb128_f stream)
|
||||
{
|
||||
OCB128_CONTEXT *octx;
|
||||
int ret;
|
||||
|
||||
if ((octx = OPENSSL_malloc(sizeof(*octx))) != NULL) {
|
||||
ret = CRYPTO_ocb128_init(octx, keyenc, keydec, encrypt, decrypt,
|
||||
stream);
|
||||
if (ret)
|
||||
return octx;
|
||||
OPENSSL_free(octx);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialise an existing OCB128_CONTEXT
|
||||
*/
|
||||
int CRYPTO_ocb128_init(OCB128_CONTEXT *ctx, void *keyenc, void *keydec,
|
||||
block128_f encrypt, block128_f decrypt,
|
||||
ocb128_f stream)
|
||||
{
|
||||
memset(ctx, 0, sizeof(*ctx));
|
||||
ctx->l_index = 0;
|
||||
ctx->max_l_index = 5;
|
||||
if ((ctx->l = OPENSSL_malloc(ctx->max_l_index * 16)) == NULL) {
|
||||
CRYPTOerr(CRYPTO_F_CRYPTO_OCB128_INIT, ERR_R_MALLOC_FAILURE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* We set both the encryption and decryption key schedules - decryption
|
||||
* needs both. Don't really need decryption schedule if only doing
|
||||
* encryption - but it simplifies things to take it anyway
|
||||
*/
|
||||
ctx->encrypt = encrypt;
|
||||
ctx->decrypt = decrypt;
|
||||
ctx->stream = stream;
|
||||
ctx->keyenc = keyenc;
|
||||
ctx->keydec = keydec;
|
||||
|
||||
/* L_* = ENCIPHER(K, zeros(128)) */
|
||||
ctx->encrypt(ctx->l_star.c, ctx->l_star.c, ctx->keyenc);
|
||||
|
||||
/* L_$ = double(L_*) */
|
||||
ocb_double(&ctx->l_star, &ctx->l_dollar);
|
||||
|
||||
/* L_0 = double(L_$) */
|
||||
ocb_double(&ctx->l_dollar, ctx->l);
|
||||
|
||||
/* L_{i} = double(L_{i-1}) */
|
||||
ocb_double(ctx->l, ctx->l+1);
|
||||
ocb_double(ctx->l+1, ctx->l+2);
|
||||
ocb_double(ctx->l+2, ctx->l+3);
|
||||
ocb_double(ctx->l+3, ctx->l+4);
|
||||
ctx->l_index = 4; /* enough to process up to 496 bytes */
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy an OCB128_CONTEXT object
|
||||
*/
|
||||
int CRYPTO_ocb128_copy_ctx(OCB128_CONTEXT *dest, OCB128_CONTEXT *src,
|
||||
void *keyenc, void *keydec)
|
||||
{
|
||||
memcpy(dest, src, sizeof(OCB128_CONTEXT));
|
||||
if (keyenc)
|
||||
dest->keyenc = keyenc;
|
||||
if (keydec)
|
||||
dest->keydec = keydec;
|
||||
if (src->l) {
|
||||
if ((dest->l = OPENSSL_malloc(src->max_l_index * 16)) == NULL) {
|
||||
CRYPTOerr(CRYPTO_F_CRYPTO_OCB128_COPY_CTX, ERR_R_MALLOC_FAILURE);
|
||||
return 0;
|
||||
}
|
||||
memcpy(dest->l, src->l, (src->l_index + 1) * 16);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set the IV to be used for this operation. Must be 1 - 15 bytes.
|
||||
*/
|
||||
int CRYPTO_ocb128_setiv(OCB128_CONTEXT *ctx, const unsigned char *iv,
|
||||
size_t len, size_t taglen)
|
||||
{
|
||||
unsigned char ktop[16], tmp[16], mask;
|
||||
unsigned char stretch[24], nonce[16];
|
||||
size_t bottom, shift;
|
||||
|
||||
/*
|
||||
* Spec says IV is 120 bits or fewer - it allows non byte aligned lengths.
|
||||
* We don't support this at this stage
|
||||
*/
|
||||
if ((len > 15) || (len < 1) || (taglen > 16) || (taglen < 1)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Reset nonce-dependent variables */
|
||||
memset(&ctx->sess, 0, sizeof(ctx->sess));
|
||||
|
||||
/* Nonce = num2str(TAGLEN mod 128,7) || zeros(120-bitlen(N)) || 1 || N */
|
||||
nonce[0] = ((taglen * 8) % 128) << 1;
|
||||
memset(nonce + 1, 0, 15);
|
||||
memcpy(nonce + 16 - len, iv, len);
|
||||
nonce[15 - len] |= 1;
|
||||
|
||||
/* Ktop = ENCIPHER(K, Nonce[1..122] || zeros(6)) */
|
||||
memcpy(tmp, nonce, 16);
|
||||
tmp[15] &= 0xc0;
|
||||
ctx->encrypt(tmp, ktop, ctx->keyenc);
|
||||
|
||||
/* Stretch = Ktop || (Ktop[1..64] xor Ktop[9..72]) */
|
||||
memcpy(stretch, ktop, 16);
|
||||
ocb_block_xor(ktop, ktop + 1, 8, stretch + 16);
|
||||
|
||||
/* bottom = str2num(Nonce[123..128]) */
|
||||
bottom = nonce[15] & 0x3f;
|
||||
|
||||
/* Offset_0 = Stretch[1+bottom..128+bottom] */
|
||||
shift = bottom % 8;
|
||||
ocb_block_lshift(stretch + (bottom / 8), shift, ctx->sess.offset.c);
|
||||
mask = 0xff;
|
||||
mask <<= 8 - shift;
|
||||
ctx->sess.offset.c[15] |=
|
||||
(*(stretch + (bottom / 8) + 16) & mask) >> (8 - shift);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Provide any AAD. This can be called multiple times. Only the final time can
|
||||
* have a partial block
|
||||
*/
|
||||
int CRYPTO_ocb128_aad(OCB128_CONTEXT *ctx, const unsigned char *aad,
|
||||
size_t len)
|
||||
{
|
||||
u64 i, all_num_blocks;
|
||||
size_t num_blocks, last_len;
|
||||
OCB_BLOCK tmp;
|
||||
|
||||
/* Calculate the number of blocks of AAD provided now, and so far */
|
||||
num_blocks = len / 16;
|
||||
all_num_blocks = num_blocks + ctx->sess.blocks_hashed;
|
||||
|
||||
/* Loop through all full blocks of AAD */
|
||||
for (i = ctx->sess.blocks_hashed + 1; i <= all_num_blocks; i++) {
|
||||
OCB_BLOCK *lookup;
|
||||
|
||||
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
|
||||
lookup = ocb_lookup_l(ctx, ocb_ntz(i));
|
||||
if (lookup == NULL)
|
||||
return 0;
|
||||
ocb_block16_xor(&ctx->sess.offset_aad, lookup, &ctx->sess.offset_aad);
|
||||
|
||||
memcpy(tmp.c, aad, 16);
|
||||
aad += 16;
|
||||
|
||||
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
|
||||
ocb_block16_xor(&ctx->sess.offset_aad, &tmp, &tmp);
|
||||
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
|
||||
ocb_block16_xor(&tmp, &ctx->sess.sum, &ctx->sess.sum);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we have any partial blocks left over. This is only valid in the
|
||||
* last call to this function
|
||||
*/
|
||||
last_len = len % 16;
|
||||
|
||||
if (last_len > 0) {
|
||||
/* Offset_* = Offset_m xor L_* */
|
||||
ocb_block16_xor(&ctx->sess.offset_aad, &ctx->l_star,
|
||||
&ctx->sess.offset_aad);
|
||||
|
||||
/* CipherInput = (A_* || 1 || zeros(127-bitlen(A_*))) xor Offset_* */
|
||||
memset(tmp.c, 0, 16);
|
||||
memcpy(tmp.c, aad, last_len);
|
||||
tmp.c[last_len] = 0x80;
|
||||
ocb_block16_xor(&ctx->sess.offset_aad, &tmp, &tmp);
|
||||
|
||||
/* Sum = Sum_m xor ENCIPHER(K, CipherInput) */
|
||||
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
|
||||
ocb_block16_xor(&tmp, &ctx->sess.sum, &ctx->sess.sum);
|
||||
}
|
||||
|
||||
ctx->sess.blocks_hashed = all_num_blocks;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Provide any data to be encrypted. This can be called multiple times. Only
|
||||
* the final time can have a partial block
|
||||
*/
|
||||
int CRYPTO_ocb128_encrypt(OCB128_CONTEXT *ctx,
|
||||
const unsigned char *in, unsigned char *out,
|
||||
size_t len)
|
||||
{
|
||||
u64 i, all_num_blocks;
|
||||
size_t num_blocks, last_len;
|
||||
|
||||
/*
|
||||
* Calculate the number of blocks of data to be encrypted provided now, and
|
||||
* so far
|
||||
*/
|
||||
num_blocks = len / 16;
|
||||
all_num_blocks = num_blocks + ctx->sess.blocks_processed;
|
||||
|
||||
if (num_blocks && all_num_blocks == (size_t)all_num_blocks
|
||||
&& ctx->stream != NULL) {
|
||||
size_t max_idx = 0, top = (size_t)all_num_blocks;
|
||||
|
||||
/*
|
||||
* See how many L_{i} entries we need to process data at hand
|
||||
* and pre-compute missing entries in the table [if any]...
|
||||
*/
|
||||
while (top >>= 1)
|
||||
max_idx++;
|
||||
if (ocb_lookup_l(ctx, max_idx) == NULL)
|
||||
return 0;
|
||||
|
||||
ctx->stream(in, out, num_blocks, ctx->keyenc,
|
||||
(size_t)ctx->sess.blocks_processed + 1, ctx->sess.offset.c,
|
||||
(const unsigned char (*)[16])ctx->l, ctx->sess.checksum.c);
|
||||
} else {
|
||||
/* Loop through all full blocks to be encrypted */
|
||||
for (i = ctx->sess.blocks_processed + 1; i <= all_num_blocks; i++) {
|
||||
OCB_BLOCK *lookup;
|
||||
OCB_BLOCK tmp;
|
||||
|
||||
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
|
||||
lookup = ocb_lookup_l(ctx, ocb_ntz(i));
|
||||
if (lookup == NULL)
|
||||
return 0;
|
||||
ocb_block16_xor(&ctx->sess.offset, lookup, &ctx->sess.offset);
|
||||
|
||||
memcpy(tmp.c, in, 16);
|
||||
in += 16;
|
||||
|
||||
/* Checksum_i = Checksum_{i-1} xor P_i */
|
||||
ocb_block16_xor(&tmp, &ctx->sess.checksum, &ctx->sess.checksum);
|
||||
|
||||
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
|
||||
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
|
||||
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
|
||||
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
|
||||
|
||||
memcpy(out, tmp.c, 16);
|
||||
out += 16;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we have any partial blocks left over. This is only valid in the
|
||||
* last call to this function
|
||||
*/
|
||||
last_len = len % 16;
|
||||
|
||||
if (last_len > 0) {
|
||||
OCB_BLOCK pad;
|
||||
|
||||
/* Offset_* = Offset_m xor L_* */
|
||||
ocb_block16_xor(&ctx->sess.offset, &ctx->l_star, &ctx->sess.offset);
|
||||
|
||||
/* Pad = ENCIPHER(K, Offset_*) */
|
||||
ctx->encrypt(ctx->sess.offset.c, pad.c, ctx->keyenc);
|
||||
|
||||
/* C_* = P_* xor Pad[1..bitlen(P_*)] */
|
||||
ocb_block_xor(in, pad.c, last_len, out);
|
||||
|
||||
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
|
||||
memset(pad.c, 0, 16); /* borrow pad */
|
||||
memcpy(pad.c, in, last_len);
|
||||
pad.c[last_len] = 0x80;
|
||||
ocb_block16_xor(&pad, &ctx->sess.checksum, &ctx->sess.checksum);
|
||||
}
|
||||
|
||||
ctx->sess.blocks_processed = all_num_blocks;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Provide any data to be decrypted. This can be called multiple times. Only
|
||||
* the final time can have a partial block
|
||||
*/
|
||||
int CRYPTO_ocb128_decrypt(OCB128_CONTEXT *ctx,
|
||||
const unsigned char *in, unsigned char *out,
|
||||
size_t len)
|
||||
{
|
||||
u64 i, all_num_blocks;
|
||||
size_t num_blocks, last_len;
|
||||
|
||||
/*
|
||||
* Calculate the number of blocks of data to be decrypted provided now, and
|
||||
* so far
|
||||
*/
|
||||
num_blocks = len / 16;
|
||||
all_num_blocks = num_blocks + ctx->sess.blocks_processed;
|
||||
|
||||
if (num_blocks && all_num_blocks == (size_t)all_num_blocks
|
||||
&& ctx->stream != NULL) {
|
||||
size_t max_idx = 0, top = (size_t)all_num_blocks;
|
||||
|
||||
/*
|
||||
* See how many L_{i} entries we need to process data at hand
|
||||
* and pre-compute missing entries in the table [if any]...
|
||||
*/
|
||||
while (top >>= 1)
|
||||
max_idx++;
|
||||
if (ocb_lookup_l(ctx, max_idx) == NULL)
|
||||
return 0;
|
||||
|
||||
ctx->stream(in, out, num_blocks, ctx->keydec,
|
||||
(size_t)ctx->sess.blocks_processed + 1, ctx->sess.offset.c,
|
||||
(const unsigned char (*)[16])ctx->l, ctx->sess.checksum.c);
|
||||
} else {
|
||||
OCB_BLOCK tmp;
|
||||
|
||||
/* Loop through all full blocks to be decrypted */
|
||||
for (i = ctx->sess.blocks_processed + 1; i <= all_num_blocks; i++) {
|
||||
|
||||
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
|
||||
OCB_BLOCK *lookup = ocb_lookup_l(ctx, ocb_ntz(i));
|
||||
if (lookup == NULL)
|
||||
return 0;
|
||||
ocb_block16_xor(&ctx->sess.offset, lookup, &ctx->sess.offset);
|
||||
|
||||
memcpy(tmp.c, in, 16);
|
||||
in += 16;
|
||||
|
||||
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
|
||||
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
|
||||
ctx->decrypt(tmp.c, tmp.c, ctx->keydec);
|
||||
ocb_block16_xor(&ctx->sess.offset, &tmp, &tmp);
|
||||
|
||||
/* Checksum_i = Checksum_{i-1} xor P_i */
|
||||
ocb_block16_xor(&tmp, &ctx->sess.checksum, &ctx->sess.checksum);
|
||||
|
||||
memcpy(out, tmp.c, 16);
|
||||
out += 16;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we have any partial blocks left over. This is only valid in the
|
||||
* last call to this function
|
||||
*/
|
||||
last_len = len % 16;
|
||||
|
||||
if (last_len > 0) {
|
||||
OCB_BLOCK pad;
|
||||
|
||||
/* Offset_* = Offset_m xor L_* */
|
||||
ocb_block16_xor(&ctx->sess.offset, &ctx->l_star, &ctx->sess.offset);
|
||||
|
||||
/* Pad = ENCIPHER(K, Offset_*) */
|
||||
ctx->encrypt(ctx->sess.offset.c, pad.c, ctx->keyenc);
|
||||
|
||||
/* P_* = C_* xor Pad[1..bitlen(C_*)] */
|
||||
ocb_block_xor(in, pad.c, last_len, out);
|
||||
|
||||
/* Checksum_* = Checksum_m xor (P_* || 1 || zeros(127-bitlen(P_*))) */
|
||||
memset(pad.c, 0, 16); /* borrow pad */
|
||||
memcpy(pad.c, out, last_len);
|
||||
pad.c[last_len] = 0x80;
|
||||
ocb_block16_xor(&pad, &ctx->sess.checksum, &ctx->sess.checksum);
|
||||
}
|
||||
|
||||
ctx->sess.blocks_processed = all_num_blocks;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int ocb_finish(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len,
|
||||
int write)
|
||||
{
|
||||
OCB_BLOCK tmp;
|
||||
|
||||
if (len > 16 || len < 1) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Tag = ENCIPHER(K, Checksum_* xor Offset_* xor L_$) xor HASH(K,A)
|
||||
*/
|
||||
ocb_block16_xor(&ctx->sess.checksum, &ctx->sess.offset, &tmp);
|
||||
ocb_block16_xor(&ctx->l_dollar, &tmp, &tmp);
|
||||
ctx->encrypt(tmp.c, tmp.c, ctx->keyenc);
|
||||
ocb_block16_xor(&tmp, &ctx->sess.sum, &tmp);
|
||||
|
||||
if (write) {
|
||||
memcpy(tag, &tmp, len);
|
||||
return 1;
|
||||
} else {
|
||||
return CRYPTO_memcmp(&tmp, tag, len);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the tag and verify it against the supplied tag
|
||||
*/
|
||||
int CRYPTO_ocb128_finish(OCB128_CONTEXT *ctx, const unsigned char *tag,
|
||||
size_t len)
|
||||
{
|
||||
return ocb_finish(ctx, (unsigned char*)tag, len, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Retrieve the calculated tag
|
||||
*/
|
||||
int CRYPTO_ocb128_tag(OCB128_CONTEXT *ctx, unsigned char *tag, size_t len)
|
||||
{
|
||||
return ocb_finish(ctx, tag, len, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Release all resources
|
||||
*/
|
||||
void CRYPTO_ocb128_cleanup(OCB128_CONTEXT *ctx)
|
||||
{
|
||||
if (ctx) {
|
||||
OPENSSL_clear_free(ctx->l, ctx->max_l_index * 16);
|
||||
OPENSSL_cleanse(ctx, sizeof(*ctx));
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* OPENSSL_NO_OCB */
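A one-shot sketch of the OCB API above, illustrative only and not part of the vendored file: ocb128_demo is a hypothetical helper. Both AES key schedules are supplied because decryption needs the inverse cipher, the nonce must be 1 to 15 bytes, taglen is 16 here, and passing NULL for the ocb128_f stream selects the generic block path.

#include <openssl/aes.h>
#include <openssl/crypto.h>
#include <openssl/modes.h>

/* Hypothetical demo helper, not part of the vendored sources. */
static int ocb128_demo(const unsigned char key16[16],
                       const unsigned char *nonce, size_t noncelen, /* 1..15 */
                       const unsigned char *aad, size_t aadlen,
                       const unsigned char *pt, unsigned char *ct, size_t len,
                       unsigned char tag[16])
{
    AES_KEY enc, dec;
    OCB128_CONTEXT *octx;
    int ok = 0;

    AES_set_encrypt_key(key16, 128, &enc);
    AES_set_decrypt_key(key16, 128, &dec);
    octx = CRYPTO_ocb128_new(&enc, &dec, (block128_f)AES_encrypt,
                             (block128_f)AES_decrypt, NULL /* no stream fn */);
    if (octx != NULL
            && CRYPTO_ocb128_setiv(octx, nonce, noncelen, 16) == 1
            && CRYPTO_ocb128_aad(octx, aad, aadlen)
            && CRYPTO_ocb128_encrypt(octx, pt, ct, len)
            && CRYPTO_ocb128_tag(octx, tag, 16) == 1)
        ok = 1;
    if (octx != NULL) {
        CRYPTO_ocb128_cleanup(octx);    /* frees the L table and cleanses the struct */
        OPENSSL_free(octx);             /* the context itself came from _new() */
    }
    return ok;
}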
74
trunk/3rdparty/openssl-1.1-fit/crypto/modes/ofb128.c
vendored
Normal file
@@ -0,0 +1,74 @@
/*
 * Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

/*
 * The input and output encrypted as though 128bit ofb mode is being used.
 * The extra state information to record how much of the 128bit block we have
 * used is contained in *num;
 */
void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
                           size_t len, const void *key,
                           unsigned char ivec[16], int *num, block128_f block)
{
    unsigned int n;
    size_t l = 0;

    n = *num;

#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            while (n && len) {
                *(out++) = *(in++) ^ ivec[n];
                --len;
                n = (n + 1) % 16;
            }
# if defined(STRICT_ALIGNMENT)
            if (((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) !=
                0)
                break;
# endif
            while (len >= 16) {
                (*block) (ivec, ivec, key);
                for (; n < 16; n += sizeof(size_t))
                    *(size_t *)(out + n) =
                        *(size_t *)(in + n) ^ *(size_t *)(ivec + n);
                len -= 16;
                out += 16;
                in += 16;
                n = 0;
            }
            if (len) {
                (*block) (ivec, ivec, key);
                while (len--) {
                    out[n] = in[n] ^ ivec[n];
                    ++n;
                }
            }
            *num = n;
            return;
        } while (0);
    }
    /* the rest would be commonly eliminated by x86* compiler */
#endif
    while (l < len) {
        if (n == 0) {
            (*block) (ivec, ivec, key);
        }
        out[l] = in[l] ^ ivec[n];
        ++l;
        n = (n + 1) % 16;
    }

    *num = n;
}
331
trunk/3rdparty/openssl-1.1-fit/crypto/modes/wrap128.c
vendored
Normal file
@@ -0,0 +1,331 @@
/*
|
||||
* Copyright 2013-2018 The OpenSSL Project Authors. All Rights Reserved.
|
||||
*
|
||||
* Licensed under the OpenSSL license (the "License"). You may not use
|
||||
* this file except in compliance with the License. You can obtain a copy
|
||||
* in the file LICENSE in the source distribution or at
|
||||
* https://www.openssl.org/source/license.html
|
||||
*/
|
||||
|
||||
/** Beware!
|
||||
*
|
||||
* Following wrapping modes were designed for AES but this implementation
|
||||
* allows you to use them for any 128 bit block cipher.
|
||||
*/
|
||||
|
||||
#include "internal/cryptlib.h"
|
||||
#include <openssl/modes.h>
|
||||
|
||||
/** RFC 3394 section 2.2.3.1 Default Initial Value */
|
||||
static const unsigned char default_iv[] = {
|
||||
0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6, 0xA6,
|
||||
};
|
||||
|
||||
/** RFC 5649 section 3 Alternative Initial Value 32-bit constant */
|
||||
static const unsigned char default_aiv[] = {
|
||||
0xA6, 0x59, 0x59, 0xA6
|
||||
};
|
||||
|
||||
/** Input size limit: lower than maximum of standards but far larger than
|
||||
* anything that will be used in practice.
|
||||
*/
|
||||
#define CRYPTO128_WRAP_MAX (1UL << 31)
|
||||
|
||||
/** Wrapping according to RFC 3394 section 2.2.1.
|
||||
*
|
||||
* @param[in] key Key value.
|
||||
* @param[in] iv IV value. Length = 8 bytes. NULL = use default_iv.
|
||||
* @param[in] in Plaintext as n 64-bit blocks, n >= 2.
|
||||
* @param[in] inlen Length of in.
|
||||
* @param[out] out Ciphertext. Minimal buffer length = (inlen + 8) bytes.
|
||||
* Input and output buffers can overlap if block function
|
||||
* supports that.
|
||||
* @param[in] block Block processing function.
|
||||
* @return 0 if inlen does not consist of n 64-bit blocks, n >= 2.
|
||||
* or if inlen > CRYPTO128_WRAP_MAX.
|
||||
* Output length if wrapping succeeded.
|
||||
*/
|
||||
size_t CRYPTO_128_wrap(void *key, const unsigned char *iv,
|
||||
unsigned char *out,
|
||||
const unsigned char *in, size_t inlen,
|
||||
block128_f block)
|
||||
{
|
||||
unsigned char *A, B[16], *R;
|
||||
size_t i, j, t;
|
||||
if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
|
||||
return 0;
|
||||
A = B;
|
||||
t = 1;
|
||||
memmove(out + 8, in, inlen);
|
||||
if (!iv)
|
||||
iv = default_iv;
|
||||
|
||||
memcpy(A, iv, 8);
|
||||
|
||||
for (j = 0; j < 6; j++) {
|
||||
R = out + 8;
|
||||
for (i = 0; i < inlen; i += 8, t++, R += 8) {
|
||||
memcpy(B + 8, R, 8);
|
||||
block(B, B, key);
|
||||
A[7] ^= (unsigned char)(t & 0xff);
|
||||
if (t > 0xff) {
|
||||
A[6] ^= (unsigned char)((t >> 8) & 0xff);
|
||||
A[5] ^= (unsigned char)((t >> 16) & 0xff);
|
||||
A[4] ^= (unsigned char)((t >> 24) & 0xff);
|
||||
}
|
||||
memcpy(R, B + 8, 8);
|
||||
}
|
||||
}
|
||||
memcpy(out, A, 8);
|
||||
return inlen + 8;
|
||||
}
|
||||
|
||||
/** Unwrapping according to RFC 3394 section 2.2.2 steps 1-2.
|
||||
* The IV check (step 3) is responsibility of the caller.
|
||||
*
|
||||
* @param[in] key Key value.
|
||||
* @param[out] iv Unchecked IV value. Minimal buffer length = 8 bytes.
|
||||
* @param[out] out Plaintext without IV.
|
||||
* Minimal buffer length = (inlen - 8) bytes.
|
||||
* Input and output buffers can overlap if block function
|
||||
* supports that.
|
||||
* @param[in] in Ciphertext as n 64-bit blocks.
|
||||
* @param[in] inlen Length of in.
|
||||
* @param[in] block Block processing function.
|
||||
* @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
|
||||
* or if inlen is not a multiple of 8.
|
||||
* Output length otherwise.
|
||||
*/
|
||||
static size_t crypto_128_unwrap_raw(void *key, unsigned char *iv,
|
||||
unsigned char *out,
|
||||
const unsigned char *in, size_t inlen,
|
||||
block128_f block)
|
||||
{
|
||||
unsigned char *A, B[16], *R;
|
||||
size_t i, j, t;
|
||||
inlen -= 8;
|
||||
if ((inlen & 0x7) || (inlen < 16) || (inlen > CRYPTO128_WRAP_MAX))
|
||||
return 0;
|
||||
A = B;
|
||||
t = 6 * (inlen >> 3);
|
||||
memcpy(A, in, 8);
|
||||
memmove(out, in + 8, inlen);
|
||||
for (j = 0; j < 6; j++) {
|
||||
R = out + inlen - 8;
|
||||
for (i = 0; i < inlen; i += 8, t--, R -= 8) {
|
||||
A[7] ^= (unsigned char)(t & 0xff);
|
||||
if (t > 0xff) {
|
||||
A[6] ^= (unsigned char)((t >> 8) & 0xff);
|
||||
A[5] ^= (unsigned char)((t >> 16) & 0xff);
|
||||
A[4] ^= (unsigned char)((t >> 24) & 0xff);
|
||||
}
|
||||
memcpy(B + 8, R, 8);
|
||||
block(B, B, key);
|
||||
memcpy(R, B + 8, 8);
|
||||
}
|
||||
}
|
||||
memcpy(iv, A, 8);
|
||||
return inlen;
|
||||
}
|
||||
|
||||
/** Unwrapping according to RFC 3394 section 2.2.2, including the IV check.
|
||||
* The first block of plaintext has to match the supplied IV, otherwise an
|
||||
* error is returned.
|
||||
*
|
||||
* @param[in] key Key value.
|
||||
* @param[out] iv IV value to match against. Length = 8 bytes.
|
||||
* NULL = use default_iv.
|
||||
* @param[out] out Plaintext without IV.
|
||||
* Minimal buffer length = (inlen - 8) bytes.
|
||||
* Input and output buffers can overlap if block function
|
||||
* supports that.
|
||||
* @param[in] in Ciphertext as n 64-bit blocks.
|
||||
* @param[in] inlen Length of in.
|
||||
* @param[in] block Block processing function.
|
||||
* @return 0 if inlen is out of range [24, CRYPTO128_WRAP_MAX]
|
||||
* or if inlen is not a multiple of 8
|
||||
* or if IV doesn't match expected value.
|
||||
* Output length otherwise.
|
||||
*/
|
||||
size_t CRYPTO_128_unwrap(void *key, const unsigned char *iv,
|
||||
unsigned char *out, const unsigned char *in,
|
||||
size_t inlen, block128_f block)
|
||||
{
|
||||
size_t ret;
|
||||
unsigned char got_iv[8];
|
||||
|
||||
ret = crypto_128_unwrap_raw(key, got_iv, out, in, inlen, block);
|
||||
if (ret == 0)
|
||||
return 0;
|
||||
|
||||
if (!iv)
|
||||
iv = default_iv;
|
||||
if (CRYPTO_memcmp(got_iv, iv, 8)) {
|
||||
OPENSSL_cleanse(out, ret);
|
||||
return 0;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Wrapping according to RFC 5649 section 4.1.
|
||||
*
|
||||
* @param[in] key Key value.
|
||||
* @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
|
||||
* @param[out] out Ciphertext. Minimal buffer length = (inlen + 15) bytes.
|
||||
* Input and output buffers can overlap if block function
|
||||
* supports that.
|
||||
* @param[in] in Plaintext as n 64-bit blocks, n >= 2.
|
||||
* @param[in] inlen Length of in.
|
||||
* @param[in] block Block processing function.
|
||||
* @return 0 if inlen is out of range [1, CRYPTO128_WRAP_MAX].
|
||||
* Output length if wrapping succeeded.
|
||||
*/
|
||||
size_t CRYPTO_128_wrap_pad(void *key, const unsigned char *icv,
|
||||
unsigned char *out,
|
||||
const unsigned char *in, size_t inlen,
|
||||
block128_f block)
|
||||
{
|
||||
/* n: number of 64-bit blocks in the padded key data
|
||||
*
|
||||
* If length of plain text is not a multiple of 8, pad the plain text octet
|
||||
* string on the right with octets of zeros, where final length is the
|
||||
* smallest multiple of 8 that is greater than length of plain text.
|
||||
* If length of plain text is a multiple of 8, then there is no padding. */
|
||||
const size_t blocks_padded = (inlen + 7) / 8; /* CEILING(m/8) */
|
||||
const size_t padded_len = blocks_padded * 8;
|
||||
const size_t padding_len = padded_len - inlen;
|
||||
/* RFC 5649 section 3: Alternative Initial Value */
|
||||
unsigned char aiv[8];
|
||||
int ret;
|
||||
|
||||
/* Section 1: use 32-bit fixed field for plaintext octet length */
|
||||
if (inlen == 0 || inlen >= CRYPTO128_WRAP_MAX)
|
||||
return 0;
|
||||
|
||||
/* Section 3: Alternative Initial Value */
|
||||
if (!icv)
|
||||
memcpy(aiv, default_aiv, 4);
|
||||
else
|
||||
memcpy(aiv, icv, 4); /* Standard doesn't mention this. */
|
||||
|
||||
aiv[4] = (inlen >> 24) & 0xFF;
|
||||
aiv[5] = (inlen >> 16) & 0xFF;
|
||||
aiv[6] = (inlen >> 8) & 0xFF;
|
||||
aiv[7] = inlen & 0xFF;
|
||||
|
||||
if (padded_len == 8) {
|
||||
/*
|
||||
* Section 4.1 - special case in step 2: If the padded plaintext
|
||||
* contains exactly eight octets, then prepend the AIV and encrypt
|
||||
* the resulting 128-bit block using AES in ECB mode.
|
||||
*/
|
||||
memmove(out + 8, in, inlen);
|
||||
memcpy(out, aiv, 8);
|
||||
memset(out + 8 + inlen, 0, padding_len);
|
||||
block(out, out, key);
|
||||
ret = 16; /* AIV + padded input */
|
||||
} else {
|
||||
memmove(out, in, inlen);
|
||||
memset(out + inlen, 0, padding_len); /* Section 4.1 step 1 */
|
||||
ret = CRYPTO_128_wrap(key, aiv, out, out, padded_len, block);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Unwrapping according to RFC 5649 section 4.2.
|
||||
*
|
||||
* @param[in] key Key value.
|
||||
* @param[in] icv (Non-standard) IV, 4 bytes. NULL = use default_aiv.
|
||||
* @param[out] out Plaintext. Minimal buffer length = (inlen - 8) bytes.
|
||||
* Input and output buffers can overlap if block function
|
||||
* supports that.
|
||||
* @param[in] in Ciphertext as n 64-bit blocks.
|
||||
* @param[in] inlen Length of in.
|
||||
* @param[in] block Block processing function.
|
||||
* @return 0 if inlen is out of range [16, CRYPTO128_WRAP_MAX],
|
||||
* or if inlen is not a multiple of 8
|
||||
* or if IV and message length indicator doesn't match.
|
||||
* Output length if unwrapping succeeded and IV matches.
|
||||
*/
|
||||
size_t CRYPTO_128_unwrap_pad(void *key, const unsigned char *icv,
|
||||
unsigned char *out,
|
||||
const unsigned char *in, size_t inlen,
|
||||
block128_f block)
|
||||
{
|
||||
/* n: number of 64-bit blocks in the padded key data */
|
||||
size_t n = inlen / 8 - 1;
|
||||
size_t padded_len;
|
||||
size_t padding_len;
|
||||
size_t ptext_len;
|
||||
/* RFC 5649 section 3: Alternative Initial Value */
|
||||
unsigned char aiv[8];
|
||||
static unsigned char zeros[8] = { 0x0 };
|
||||
size_t ret;
|
||||
|
||||
/* Section 4.2: Ciphertext length has to be (n+1) 64-bit blocks. */
|
||||
if ((inlen & 0x7) != 0 || inlen < 16 || inlen >= CRYPTO128_WRAP_MAX)
|
||||
return 0;
|
||||
|
||||
if (inlen == 16) {
|
||||
/*
|
||||
* Section 4.2 - special case in step 1: When n=1, the ciphertext
|
||||
* contains exactly two 64-bit blocks and they are decrypted as a
|
||||
* single AES block using AES in ECB mode: AIV | P[1] = DEC(K, C[0] |
|
||||
* C[1])
|
||||
*/
|
||||
unsigned char buff[16];
|
||||
|
||||
block(in, buff, key);
|
||||
memcpy(aiv, buff, 8);
|
||||
/* Remove AIV */
|
||||
memcpy(out, buff + 8, 8);
|
||||
padded_len = 8;
|
||||
OPENSSL_cleanse(buff, inlen);
|
||||
} else {
|
||||
padded_len = inlen - 8;
|
||||
ret = crypto_128_unwrap_raw(key, aiv, out, in, inlen, block);
|
||||
if (padded_len != ret) {
|
||||
OPENSSL_cleanse(out, inlen);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Section 3: AIV checks: Check that MSB(32,A) = A65959A6. Optionally a
|
||||
* user-supplied value can be used (even if standard doesn't mention
|
||||
* this).
|
||||
*/
|
||||
if ((!icv && CRYPTO_memcmp(aiv, default_aiv, 4))
|
||||
|| (icv && CRYPTO_memcmp(aiv, icv, 4))) {
|
||||
OPENSSL_cleanse(out, inlen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that 8*(n-1) < LSB(32,AIV) <= 8*n. If so, let ptext_len =
|
||||
* LSB(32,AIV).
|
||||
*/
|
||||
|
||||
ptext_len = ((unsigned int)aiv[4] << 24)
|
||||
| ((unsigned int)aiv[5] << 16)
|
||||
| ((unsigned int)aiv[6] << 8)
|
||||
| (unsigned int)aiv[7];
|
||||
if (8 * (n - 1) >= ptext_len || ptext_len > 8 * n) {
|
||||
OPENSSL_cleanse(out, inlen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that the rightmost padding_len octets of the output data are
|
||||
* zero.
|
||||
*/
|
||||
padding_len = padded_len - ptext_len;
|
||||
if (CRYPTO_memcmp(out + ptext_len, zeros, padding_len) != 0) {
|
||||
OPENSSL_cleanse(out, inlen);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Section 4.2 step 3: Remove padding */
|
||||
return ptext_len;
|
||||
}
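A usage sketch for the RFC 3394 wrap above, illustrative only and not part of the vendored file: wrap128_demo is a hypothetical helper. The key material to wrap must be a multiple of 8 bytes and at least 16 bytes long; the result is inlen + 8 bytes, passing NULL selects default_iv, and CRYPTO_128_unwrap reverses the operation and checks that IV.

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Hypothetical demo helper, not part of the vendored sources. */
static size_t wrap128_demo(const unsigned char kek16[16],
                           const unsigned char *key_in, size_t inlen,
                           unsigned char *wrapped /* room for inlen + 8 bytes */)
{
    AES_KEY ks;

    AES_set_encrypt_key(kek16, 128, &ks);
    /* returns 0 on bad length, otherwise the output size (inlen + 8) */
    return CRYPTO_128_wrap(&ks, NULL /* default IV */, wrapped, key_in, inlen,
                           (block128_f)AES_encrypt);
}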
157
trunk/3rdparty/openssl-1.1-fit/crypto/modes/xts128.c
vendored
Normal file
@@ -0,0 +1,157 @@
/*
 * Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License"). You may not use
 * this file except in compliance with the License. You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx,
                          const unsigned char iv[16],
                          const unsigned char *inp, unsigned char *out,
                          size_t len, int enc)
{
    const union {
        long one;
        char little;
    } is_endian = {
        1
    };
    union {
        u64 u[2];
        u32 d[4];
        u8 c[16];
    } tweak, scratch;
    unsigned int i;

    if (len < 16)
        return -1;

    memcpy(tweak.c, iv, 16);

    (*ctx->block2) (tweak.c, tweak.c, ctx->key2);

    if (!enc && (len % 16))
        len -= 16;

    while (len >= 16) {
#if defined(STRICT_ALIGNMENT)
        memcpy(scratch.c, inp, 16);
        scratch.u[0] ^= tweak.u[0];
        scratch.u[1] ^= tweak.u[1];
#else
        scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0];
        scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1];
#endif
        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
        scratch.u[0] ^= tweak.u[0];
        scratch.u[1] ^= tweak.u[1];
        memcpy(out, scratch.c, 16);
#else
        ((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0];
        ((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1];
#endif
        inp += 16;
        out += 16;
        len -= 16;

        if (len == 0)
            return 0;

        if (is_endian.little) {
            unsigned int carry, res;

            res = 0x87 & (((int)tweak.d[3]) >> 31);
            carry = (unsigned int)(tweak.u[0] >> 63);
            tweak.u[0] = (tweak.u[0] << 1) ^ res;
            tweak.u[1] = (tweak.u[1] << 1) | carry;
        } else {
            size_t c;

            for (c = 0, i = 0; i < 16; ++i) {
                /*
                 * + substitutes for |, because c is 1 bit
                 */
                c += ((size_t)tweak.c[i]) << 1;
                tweak.c[i] = (u8)c;
                c = c >> 8;
            }
            tweak.c[0] ^= (u8)(0x87 & (0 - c));
        }
    }
    if (enc) {
        for (i = 0; i < len; ++i) {
            u8 c = inp[i];
            out[i] = scratch.c[i];
            scratch.c[i] = c;
        }
        scratch.u[0] ^= tweak.u[0];
        scratch.u[1] ^= tweak.u[1];
        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
        scratch.u[0] ^= tweak.u[0];
        scratch.u[1] ^= tweak.u[1];
        memcpy(out - 16, scratch.c, 16);
    } else {
        union {
            u64 u[2];
            u8 c[16];
        } tweak1;

        if (is_endian.little) {
            unsigned int carry, res;

            res = 0x87 & (((int)tweak.d[3]) >> 31);
            carry = (unsigned int)(tweak.u[0] >> 63);
            tweak1.u[0] = (tweak.u[0] << 1) ^ res;
            tweak1.u[1] = (tweak.u[1] << 1) | carry;
        } else {
            size_t c;

            for (c = 0, i = 0; i < 16; ++i) {
                /*
                 * + substitutes for |, because c is 1 bit
                 */
                c += ((size_t)tweak.c[i]) << 1;
                tweak1.c[i] = (u8)c;
                c = c >> 8;
            }
            tweak1.c[0] ^= (u8)(0x87 & (0 - c));
        }
#if defined(STRICT_ALIGNMENT)
        memcpy(scratch.c, inp, 16);
        scratch.u[0] ^= tweak1.u[0];
        scratch.u[1] ^= tweak1.u[1];
#else
        scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0];
        scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1];
#endif
        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
        scratch.u[0] ^= tweak1.u[0];
        scratch.u[1] ^= tweak1.u[1];

        for (i = 0; i < len; ++i) {
            u8 c = inp[16 + i];
            out[16 + i] = scratch.c[i];
            scratch.c[i] = c;
        }
        scratch.u[0] ^= tweak.u[0];
        scratch.u[1] ^= tweak.u[1];
        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
#if defined(STRICT_ALIGNMENT)
        scratch.u[0] ^= tweak.u[0];
        scratch.u[1] ^= tweak.u[1];
        memcpy(out, scratch.c, 16);
#else
        ((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0];
        ((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1];
#endif
    }

    return 0;
}
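
For orientation, a minimal usage sketch of CRYPTO_xts128_encrypt(), assuming the XTS128_CONTEXT layout declared in OpenSSL's modes.h (key1/key2 pointers plus block1/block2 callbacks) and AES as the block cipher; the key sizes and helper name are illustrative, not taken from the vendored file. Since block2 is only ever applied to the tweak, it stays an encrypt callback even when data is being decrypted:

#include <openssl/aes.h>
#include <openssl/modes.h>

/* Encrypt one XTS data unit; returns 0 on success, -1 on error. */
int xts_encrypt_example(const unsigned char key1[16],  /* data key  */
                        const unsigned char key2[16],  /* tweak key */
                        const unsigned char iv[16],    /* per-unit tweak */
                        const unsigned char *in, unsigned char *out,
                        size_t len)
{
    AES_KEY k1, k2;
    XTS128_CONTEXT ctx;

    if (AES_set_encrypt_key(key1, 128, &k1) != 0 ||
        AES_set_encrypt_key(key2, 128, &k2) != 0)
        return -1;

    ctx.key1 = &k1;
    ctx.key2 = &k2;
    ctx.block1 = (block128_f)AES_encrypt;  /* data path */
    ctx.block2 = (block128_f)AES_encrypt;  /* tweak is always encrypted */

    /* enc = 1 encrypts; for decryption pass an AES decrypt key and
     * (block128_f)AES_decrypt as block1, keeping block2 as encrypt. */
    return CRYPTO_xts128_encrypt(&ctx, iv, in, out, len, 1);
}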