1
0
Fork 0
mirror of https://github.com/ossrs/srs.git synced 2025-03-09 15:49:59 +00:00

Upgrade openssl from 1.1.0e to 1.1.1b, with source code. 4.0.78

This commit is contained in:
winlin 2021-03-01 20:47:57 +08:00
parent 8f1c992379
commit 96dbd7bced
1476 changed files with 616554 additions and 4 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,946 @@
#! /usr/bin/env perl
# Copyright 2016-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc-4.9 NEON
#
# Apple A7 1.86/+5% 0.72
# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.64/+50% 1.18(*)
# X-Gene 2.13/+68% 2.27
# Mongoose 1.77/+75% 1.12
# Kryo 2.70/+55% 1.13
#
# (*) estimate based on resource availability is less than 1.0,
# i.e. the measured result is worse than expected, presumably because
# the binary translator is not almighty;
# Command line: <flavour> <output>.  Both values are passed on, verbatim,
# to arm-xlate.pl on its command line.
$flavour=shift;
$output=shift;

# Locate arm-xlate.pl: first next to this script, then in ../../perlasm/
# relative to it.  $dir keeps its trailing path separator and is empty when
# the script was invoked without a directory component (taking $1 only on a
# successful match avoids picking up a stale capture).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on everything printed to STDOUT is piped into arm-xlate.pl.
# Fail loudly if the pipe cannot be created; otherwise the script would
# carry on and silently produce no usable output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
# Argument registers x0..x3, in the order the generated functions use them.
my ($ctx,$inp,$len,$padbit) = map { "x" . $_ } 0 .. 3;
# poly1305_emit reuses the 2nd and 3rd argument registers under other names.
my ($mac,$nonce) = ($inp,$len);
# Working registers x4..x14 for the integer-only code path.
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map { "x" . $_ } 4 .. 14;
# Integer-only code path: poly1305_init, poly1305_blocks and poly1305_emit.
#
# Context layout as used by the code below:
#   [ctx+0 .. ctx+23]  hash value h0,h1,h2 (base 2^64)
#   [ctx+24]           is_base2_26 flag, zeroed by poly1305_init
#   [ctx+32 .. ctx+47] clamped key r0,r1
#
# poly1305_init returns 0 when the key pointer is NULL.  Otherwise it clamps
# and stores the key, writes a pair of function pointers through its third
# argument (the NEON variants when ARMV7_NEON is set in OPENSSL_armcap_P,
# the scalar ones otherwise) and returns 1.
# poly1305_blocks processes the input in whole 16-byte blocks (length is
# rounded down to a multiple of 16).
# poly1305_emit performs the final comparison against the modulus, adds the
# nonce and writes the 16-byte result.
$code.=<<___;
#include "arm_arch.h"
.text
// forward "declarations" are required for Apple
.extern OPENSSL_armcap_P
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp $inp,xzr
stp xzr,xzr,[$ctx] // zero hash value
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifdef __ILP32__
ldrsw $t1,.LOPENSSL_armcap_P
#else
ldr $t1,.LOPENSSL_armcap_P
#endif
adr $t0,.LOPENSSL_armcap_P
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
ldr w17,[$t0,$t1]
#ifdef __ARMEB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
#endif
and $r0,$r0,$s1 // &=0ffffffc0fffffff
and $s1,$s1,#-4
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
stp $r0,$r1,[$ctx,#32] // save key value
tst w17,#ARMV7_NEON
adr $d0,poly1305_blocks
adr $r0,poly1305_blocks_neon
adr $d1,poly1305_emit
adr $r1,poly1305_emit_neon
csel $d0,$d0,$r0,eq
csel $d1,$d1,$r1,eq
#ifdef __ILP32__
stp w12,w13,[$len]
#else
stp $d0,$d1,[$len]
#endif
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
ands $len,$len,#-16
b.eq .Lno_data
ldp $h0,$h1,[$ctx] // load hash value
ldp $r0,$r1,[$ctx,#32] // load key value
ldr $h2,[$ctx,#16]
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
b .Loop
.align 5
.Loop:
ldp $t0,$t1,[$inp],#16 // load input
sub $len,$len,#16
#ifdef __ARMEB__
rev $t0,$t0
rev $t1,$t1
#endif
adds $h0,$h0,$t0 // accumulate input
adcs $h1,$h1,$t1
mul $d0,$h0,$r0 // h0*r0
adc $h2,$h2,$padbit
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
cbnz $len,.Loop
stp $h0,$h1,[$ctx] // store hash value
str $h2,[$ctx,#16]
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldr $h2,[$ctx,#16]
ldp $t0,$t1,[$nonce] // load nonce
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr
adc $d2,$h2,xzr
tst $d2,#-4 // see if it's carried/borrowed
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
#ifdef __ARMEB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
#ifdef __ARMEB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
stp $h0,$h1,[$mac] // write result
ret
.size poly1305_emit,.-poly1305_emit
___
# NEON register allocation for the vectorised code path.
# v0-v8: key power table entries, 4 x 32-bit lanes each.
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map { "v" . $_ . ".4s" } 0 .. 8;
# v9-v13 and v14-v18: the two sets of input limbs, 2 x 32-bit lanes each.
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map { "v" . $_ . ".2s" } 9 .. 13;
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map { "v" . $_ . ".2s" } 14 .. 18;
# v19-v23: 64-bit accumulators.
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map { "v" . $_ . ".2d" } 19 .. 23;
# v24-v28: hash limbs.
my ($H0,$H1,$H2,$H3,$H4) = map { "v" . $_ . ".2s" } 24 .. 28;
# v29-v31: temporaries and mask; the arrangement suffix is appended at use.
my ($T0,$T1,$MASK) = map { "v" . $_ } 29 .. 31;

# Two extra general-purpose registers used by the NEON code.
my ($in2,$zeros) = ("x16","x17");
# The is_base2_26 flag shares a register with $zeros.
my $is_base2_26 = $zeros; # borrow
# NEON code path.
#
#   poly1305_mult        - multiplies h0:h1:h2 by r0:r1 (with s1) and performs
#                          the final reduction; the same instruction sequence
#                          as the loop body of poly1305_blocks.
#   poly1305_splat       - converts h0:h1:h2 from base 2^64 to five base 2^26
#                          limbs and stores them, with the *5 multiples, as
#                          32-bit words at a 16-byte stride from $ctx.
#   poly1305_blocks_neon - falls through to poly1305_blocks when len < 128
#                          and the hash is not yet in base 2^26; otherwise
#                          builds the r^n table on first use and processes
#                          the input with NEON.
#   poly1305_emit_neon   - falls through to poly1305_emit when is_base2_26 is
#                          clear; otherwise converts the hash back to base
#                          2^64 before the final reduction and nonce addition.
#
# poly1305_blocks_neon signs the return address with paciasp BEFORE it
# allocates its 80-byte frame.  autiasp uses SP as the modifier, so it must
# run only AFTER the frame has been released and SP holds its entry value
# again.  The epilogue at .Lno_data_neon therefore pops the frame first and
# authenticates second.
$code.=<<___;
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul $d0,$h0,$r0 // h0*r0
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
ret
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
.align 5
poly1305_splat:
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,$h0,#26,#26
extr x14,$h1,$h0,#52
and x14,x14,#0x03ffffff
ubfx x15,$h1,#14,#26
extr x16,$h2,$h1,#40
str w12,[$ctx,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[$ctx,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[$ctx,#16*2] // s1
str w14,[$ctx,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[$ctx,#16*4] // s2
str w15,[$ctx,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[$ctx,#16*6] // s3
str w16,[$ctx,#16*7] // r4
str w15,[$ctx,#16*8] // s4
ret
.size poly1305_splat,.-poly1305_splat
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.hs .Lblocks_neon
cbz $is_base2_26,poly1305_blocks
.Lblocks_neon:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
ands $len,$len,#-16
b.eq .Lno_data_neon
cbz $is_base2_26,.Lbase2_64_neon
ldp w10,w11,[$ctx] // load hash value base 2^26
ldp w12,w13,[$ctx,#8]
ldr w14,[$ctx,#16]
tst $len,#31
b.eq .Leven_neon
ldp $r0,$r1,[$ctx,#32] // load key value
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr $h1,x12,#12
adds $h0,$h0,x12,lsl#52
add $h1,$h1,x13,lsl#14
adc $h1,$h1,xzr
lsr $h2,x14,#24
adds $h1,$h1,x14,lsl#40
adc $d2,$h2,xzr // can be partially reduced...
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
and $t0,$d2,#-4 // ... so reduce
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$h0,$t0
adcs $h1,$h1,xzr
adc $h2,$h2,xzr
#ifdef __ARMEB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
ldr x30,[sp,#8]
cbz $padbit,.Lstore_base2_64_neon
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
cbnz $len,.Leven_neon
stp w10,w11,[$ctx] // store hash value base 2^26
stp w12,w13,[$ctx,#8]
str w14,[$ctx,#16]
b .Lno_data_neon
.align 4
.Lstore_base2_64_neon:
stp $h0,$h1,[$ctx] // store hash value base 2^64
stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
b .Lno_data_neon
.align 4
.Lbase2_64_neon:
ldp $r0,$r1,[$ctx,#32] // load key value
ldp $h0,$h1,[$ctx] // load hash value base 2^64
ldr $h2,[$ctx,#16]
tst $len,#31
b.eq .Linit_neon
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __ARMEB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
.Linit_neon:
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
////////////////////////////////// initialize r^n table
mov $h0,$r0 // r^1
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
mov $h1,$r1
mov $h2,xzr
add $ctx,$ctx,#48+12
bl poly1305_splat
bl poly1305_mult // r^2
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^3
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^4
sub $ctx,$ctx,#4
bl poly1305_splat
ldr x30,[sp,#8]
add $in2,$inp,#32
adr $zeros,.Lzeros
subs $len,$len,#64
csel $in2,$zeros,$in2,lo
mov x4,#1
str x4,[$ctx,#-24] // set is_base2_26
sub $ctx,$ctx,#48 // restore original $ctx
b .Ldo_neon
.align 4
.Leven_neon:
add $in2,$inp,#32
adr $zeros,.Lzeros
subs $len,$len,#64
csel $in2,$zeros,$in2,lo
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
.Ldo_neon:
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
ldp x9,x13,[$in2],#48
lsl $padbit,$padbit,#24
add x15,$ctx,#48
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN23_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN23_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov $IN23_2,x8
fmov $IN23_3,x10
fmov $IN23_4,x12
ldp x8,x12,[$inp],#16 // inp[0:1]
ldp x9,x13,[$inp],#48
ld1 {$R0,$R1,$S1,$R2},[x15],#64
ld1 {$S2,$R3,$S3,$R4},[x15],#64
ld1 {$S4},[x15]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN01_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN01_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi $MASK.2d,#-1
fmov $IN01_2,x8
fmov $IN01_3,x10
fmov $IN01_4,x12
ushr $MASK.2d,$MASK.2d,#38
b.ls .Lskip_loop
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// \___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// \___________________/ \____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
subs $len,$len,#64
umull $ACC4,$IN23_0,${R4}[2]
csel $in2,$zeros,$in2,lo
umull $ACC3,$IN23_0,${R3}[2]
umull $ACC2,$IN23_0,${R2}[2]
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
umull $ACC1,$IN23_0,${R1}[2]
ldp x9,x13,[$in2],#48
umull $ACC0,$IN23_0,${R0}[2]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
umlal $ACC4,$IN23_1,${R3}[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC3,$IN23_1,${R2}[2]
and x5,x9,#0x03ffffff
umlal $ACC2,$IN23_1,${R1}[2]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN23_1,${R0}[2]
ubfx x7,x9,#26,#26
umlal $ACC0,$IN23_1,${S4}[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC4,$IN23_2,${R2}[2]
extr x8,x12,x8,#52
umlal $ACC3,$IN23_2,${R1}[2]
extr x9,x13,x9,#52
umlal $ACC2,$IN23_2,${R0}[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC1,$IN23_2,${S4}[2]
fmov $IN23_0,x4
umlal $ACC0,$IN23_2,${S3}[2]
and x8,x8,#0x03ffffff
umlal $ACC4,$IN23_3,${R1}[2]
and x9,x9,#0x03ffffff
umlal $ACC3,$IN23_3,${R0}[2]
ubfx x10,x12,#14,#26
umlal $ACC2,$IN23_3,${S4}[2]
ubfx x11,x13,#14,#26
umlal $ACC1,$IN23_3,${S3}[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC0,$IN23_3,${S2}[2]
fmov $IN23_1,x6
add $IN01_2,$IN01_2,$H2
add x12,$padbit,x12,lsr#40
umlal $ACC4,$IN23_4,${R0}[2]
add x13,$padbit,x13,lsr#40
umlal $ACC3,$IN23_4,${S4}[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC2,$IN23_4,${S3}[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN23_4,${S2}[2]
fmov $IN23_2,x8
umlal $ACC0,$IN23_4,${S1}[2]
fmov $IN23_3,x10
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate
add $IN01_0,$IN01_0,$H0
fmov $IN23_4,x12
umlal $ACC3,$IN01_2,${R1}[0]
ldp x8,x12,[$inp],#16 // inp[0:1]
umlal $ACC0,$IN01_2,${S3}[0]
ldp x9,x13,[$inp],#48
umlal $ACC4,$IN01_2,${R2}[0]
umlal $ACC1,$IN01_2,${S4}[0]
umlal $ACC2,$IN01_2,${R0}[0]
#ifdef __ARMEB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}[0]
umlal $ACC4,$IN01_0,${R4}[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC2,$IN01_0,${R2}[0]
and x5,x9,#0x03ffffff
umlal $ACC0,$IN01_0,${R0}[0]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN01_0,${R1}[0]
ubfx x7,x9,#26,#26
add $IN01_3,$IN01_3,$H3
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC3,$IN01_1,${R2}[0]
extr x8,x12,x8,#52
umlal $ACC4,$IN01_1,${R3}[0]
extr x9,x13,x9,#52
umlal $ACC0,$IN01_1,${S4}[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC2,$IN01_1,${R1}[0]
fmov $IN01_0,x4
umlal $ACC1,$IN01_1,${R0}[0]
and x8,x8,#0x03ffffff
add $IN01_4,$IN01_4,$H4
and x9,x9,#0x03ffffff
umlal $ACC3,$IN01_3,${R0}[0]
ubfx x10,x12,#14,#26
umlal $ACC0,$IN01_3,${S2}[0]
ubfx x11,x13,#14,#26
umlal $ACC4,$IN01_3,${R1}[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC1,$IN01_3,${S3}[0]
fmov $IN01_1,x6
umlal $ACC2,$IN01_3,${S4}[0]
add x12,$padbit,x12,lsr#40
umlal $ACC3,$IN01_4,${S4}[0]
add x13,$padbit,x13,lsr#40
umlal $ACC0,$IN01_4,${S1}[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC4,$IN01_4,${R0}[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN01_4,${S2}[0]
fmov $IN01_2,x8
umlal $ACC2,$IN01_4,${S3}[0]
fmov $IN01_3,x10
fmov $IN01_4,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
ushr $T0.2d,$ACC3,#26
xtn $H3,$ACC3
ushr $T1.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
add $ACC1,$ACC1,$T1.2d // h0 -> h1
ushr $T0.2d,$ACC4,#26
xtn $H4,$ACC4
ushr $T1.2d,$ACC1,#26
xtn $H1,$ACC1
bic $H4,#0xfc,lsl#24
add $ACC2,$ACC2,$T1.2d // h1 -> h2
add $ACC0,$ACC0,$T0.2d
shl $T0.2d,$T0.2d,#2
shrn $T1.2s,$ACC2,#26
xtn $H2,$ACC2
add $ACC0,$ACC0,$T0.2d // h4 -> h0
bic $H1,#0xfc,lsl#24
add $H3,$H3,$T1.2s // h2 -> h3
bic $H2,#0xfc,lsl#24
shrn $T0.2s,$ACC0,#26
xtn $H0,$ACC0
ushr $T1.2s,$H3,#26
bic $H3,#0xfc,lsl#24
bic $H0,#0xfc,lsl#24
add $H1,$H1,$T0.2s // h0 -> h1
add $H4,$H4,$T1.2s // h3 -> h4
b.hi .Loop_neon
.Lskip_loop:
dup $IN23_2,${IN23_2}[0]
add $IN01_2,$IN01_2,$H2
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds $len,$len,#32
b.ne .Long_tail
dup $IN23_2,${IN01_2}[0]
add $IN23_0,$IN01_0,$H0
add $IN23_3,$IN01_3,$H3
add $IN23_1,$IN01_1,$H1
add $IN23_4,$IN01_4,$H4
.Long_tail:
dup $IN23_0,${IN23_0}[0]
umull2 $ACC0,$IN23_2,${S3}
umull2 $ACC3,$IN23_2,${R1}
umull2 $ACC4,$IN23_2,${R2}
umull2 $ACC2,$IN23_2,${R0}
umull2 $ACC1,$IN23_2,${S4}
dup $IN23_1,${IN23_1}[0]
umlal2 $ACC0,$IN23_0,${R0}
umlal2 $ACC2,$IN23_0,${R2}
umlal2 $ACC3,$IN23_0,${R3}
umlal2 $ACC4,$IN23_0,${R4}
umlal2 $ACC1,$IN23_0,${R1}
dup $IN23_3,${IN23_3}[0]
umlal2 $ACC0,$IN23_1,${S4}
umlal2 $ACC3,$IN23_1,${R2}
umlal2 $ACC2,$IN23_1,${R1}
umlal2 $ACC4,$IN23_1,${R3}
umlal2 $ACC1,$IN23_1,${R0}
dup $IN23_4,${IN23_4}[0]
umlal2 $ACC3,$IN23_3,${R0}
umlal2 $ACC4,$IN23_3,${R1}
umlal2 $ACC0,$IN23_3,${S2}
umlal2 $ACC1,$IN23_3,${S3}
umlal2 $ACC2,$IN23_3,${S4}
umlal2 $ACC3,$IN23_4,${S4}
umlal2 $ACC0,$IN23_4,${S1}
umlal2 $ACC4,$IN23_4,${R0}
umlal2 $ACC1,$IN23_4,${S2}
umlal2 $ACC2,$IN23_4,${S3}
b.eq .Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
add $IN01_0,$IN01_0,$H0
umlal $ACC3,$IN01_2,${R1}
umlal $ACC0,$IN01_2,${S3}
umlal $ACC4,$IN01_2,${R2}
umlal $ACC1,$IN01_2,${S4}
umlal $ACC2,$IN01_2,${R0}
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}
umlal $ACC0,$IN01_0,${R0}
umlal $ACC4,$IN01_0,${R4}
umlal $ACC1,$IN01_0,${R1}
umlal $ACC2,$IN01_0,${R2}
add $IN01_3,$IN01_3,$H3
umlal $ACC3,$IN01_1,${R2}
umlal $ACC0,$IN01_1,${S4}
umlal $ACC4,$IN01_1,${R3}
umlal $ACC1,$IN01_1,${R0}
umlal $ACC2,$IN01_1,${R1}
add $IN01_4,$IN01_4,$H4
umlal $ACC3,$IN01_3,${R0}
umlal $ACC0,$IN01_3,${S2}
umlal $ACC4,$IN01_3,${R1}
umlal $ACC1,$IN01_3,${S3}
umlal $ACC2,$IN01_3,${S4}
umlal $ACC3,$IN01_4,${S4}
umlal $ACC0,$IN01_4,${S1}
umlal $ACC4,$IN01_4,${R0}
umlal $ACC1,$IN01_4,${S2}
umlal $ACC2,$IN01_4,${S3}
.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
addp $ACC3,$ACC3,$ACC3
ldp d8,d9,[sp,#16] // meet ABI requirements
addp $ACC0,$ACC0,$ACC0
ldp d10,d11,[sp,#32]
addp $ACC4,$ACC4,$ACC4
ldp d12,d13,[sp,#48]
addp $ACC1,$ACC1,$ACC1
ldp d14,d15,[sp,#64]
addp $ACC2,$ACC2,$ACC2
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr $T0.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
ushr $T1.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
add $ACC1,$ACC1,$T1.2d // h0 -> h1
ushr $T0.2d,$ACC4,#26
and $ACC4,$ACC4,$MASK.2d
ushr $T1.2d,$ACC1,#26
and $ACC1,$ACC1,$MASK.2d
add $ACC2,$ACC2,$T1.2d // h1 -> h2
add $ACC0,$ACC0,$T0.2d
shl $T0.2d,$T0.2d,#2
ushr $T1.2d,$ACC2,#26
and $ACC2,$ACC2,$MASK.2d
add $ACC0,$ACC0,$T0.2d // h4 -> h0
add $ACC3,$ACC3,$T1.2d // h2 -> h3
ushr $T0.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
ushr $T1.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
add $ACC1,$ACC1,$T0.2d // h0 -> h1
add $ACC4,$ACC4,$T1.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
st1 {$ACC4}[0],[$ctx]
.Lno_data_neon:
ldr x29,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
ldr $is_base2_26,[$ctx,#24]
cbz $is_base2_26,poly1305_emit
ldp w10,w11,[$ctx] // load hash value base 2^26
ldp w12,w13,[$ctx,#8]
ldr w14,[$ctx,#16]
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr $h1,x12,#12
adds $h0,$h0,x12,lsl#52
add $h1,$h1,x13,lsl#14
adc $h1,$h1,xzr
lsr $h2,x14,#24
adds $h1,$h1,x14,lsl#40
adc $h2,$h2,xzr // can be partially reduced...
ldp $t0,$t1,[$nonce] // load nonce
and $d0,$h2,#-4 // ... so reduce
add $d0,$d0,$h2,lsr#2
and $h2,$h2,#3
adds $h0,$h0,$d0
adcs $h1,$h1,xzr
adc $h2,$h2,xzr
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr
adc $d2,$h2,xzr
tst $d2,#-4 // see if it's carried/borrowed
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
#ifdef __ARMEB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
#ifdef __ARMEB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
stp $h0,$h1,[$mac] // write result
ret
.size poly1305_emit_neon,.-poly1305_emit_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long OPENSSL_armcap_P-.
#else
.quad OPENSSL_armcap_P-.
#endif
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Post-process the accumulated assembly line by line and print it.
#
# The register variables above carry a fixed arrangement suffix (.4s, .2s,
# .2d), which is not the right one for every instruction they appear in.
# The rules below rewrite the suffix per mnemonic.  They are chained with
# "or", so at most one of them is applied to a line; the "(... or 1)" idiom
# stops the chain once the mnemonic matched, whether or not anything was
# actually substituted.
foreach (split("\n",$code)) {
    # shrn: destination arrangement .2d/.4d -> .2s
    s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
    # fmov vN.<anything>,xM -> fmov dN,xM
    s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
    # dup: .2s/.4s -> .2d
    (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
    # eor/and: any vector arrangement -> .16b
    (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
    # umull/umlal: .4s -> .2s
    (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
    # umull2/umlal2: .2s -> .4s
    (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
    # st1..st4 with a lane index: .2d/.4d -> .s
    (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));

    # Applied to every line: drop the lane count in front of an element
    # index, e.g. ".4s[" -> ".s[".
    s/\.[124]([sd])\[/.$1\[/;

    print $_,"\n";
}

# STDOUT is the pipe to arm-xlate.pl.  Closing it flushes the buffer and
# waits for the child, so a write error or a non-zero exit status of the
# translator surfaces here and must not be ignored.
close STDOUT or die "error closing STDOUT: $!";

View file

@ -0,0 +1,331 @@
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Poly1305 hash for C64x+.
#
# October 2015
#
# Performance is [incredible for a 32-bit processor] 1.82 cycles per
# processed byte. Comparison to compiler-generated code is problematic,
# because results were observed to vary from 2.1 to 7.6 cpb depending
# on compiler's ability to inline small functions. Compiler also
# disables interrupts for some reason, thus making interrupt response
# time dependent on input length. This module on the other hand is free
# from such limitation.
# The last command-line argument names the output file.  Without it the
# generated code goes to the inherited STDOUT; the previous unconditional
# open would have closed STDOUT and failed silently in that case.
$output=pop;
if (defined $output) {
    open STDOUT,">",$output or die "can't open $output: $!";
}
# Register names substituted into the assembly templates below.
# Argument registers.
($CTXA,$INPB,$LEN,$PADBIT) = qw(A4 B4 A6 B6);
# Hash limbs; $H4a is the same register as $LEN.
($H0,$H1,$H2,$H3,$H4) = qw(A8 B8 A10 B10 B2);
$H4a = $LEN;
# Input words / carries.
($D0,$D1,$D2,$D3) = qw(A9 B9 A11 B11);
# Key words r0..r3 and their s1..s3 companions ($S3b is a second s3).
($R0,$R1,$R2,$R3) = qw(A0 B0 A1 B1);
($S1,$S2,$S3,$S3b) = qw(A12 B12 A13 B13);
# Constant 3 and additional copies of r0 and s2.
($THREE,$R0b,$S2a) = qw(B7 B5 A5);
# _poly1305_init and _poly1305_blocks for C64x+.
#
# Context layout as accessed by the code below (32-bit words):
#   words 0-3   h0..h3 (stored as the pairs h1:h0 and h3:h2)
#   word  4     h4
#   double-word slots 3..6: r2:r0, r3:r1, s3:s1, s3:s2
#
# _poly1305_init zeroes the hash, clamps the key, stores r and
# s = r + (r>>2), and returns 0 in A4.
# _poly1305_blocks takes the block count from LEN>>4, returns immediately
# when it is zero, and otherwise saves A10-A13/B10-B13 and FP in a 40-byte
# frame around the loop.
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.asg poly1305_init,_poly1305_init
.asg poly1305_blocks,_poly1305_blocks
.asg poly1305_emit,_poly1305_emit
.endif
.asg B3,RA
.asg A15,FP
.asg B15,SP
.if .LITTLE_ENDIAN
.asg MV,SWAP2
.asg MV.L,SWAP4
.endif
.global _poly1305_init
_poly1305_init:
.asmfunc
LDNDW *${INPB}[0],B17:B16 ; load key material
LDNDW *${INPB}[1],A17:A16
|| ZERO B9:B8
|| MVK -1,B0
STDW B9:B8,*${CTXA}[0] ; initialize h1:h0
|| SHRU B0,4,B0 ; 0x0fffffff
|| MVK -4,B1
STDW B9:B8,*${CTXA}[1] ; initialize h3:h2
|| AND B0,B1,B1 ; 0x0ffffffc
STW B8,*${CTXA}[4] ; initialize h4
.if .BIG_ENDIAN
SWAP2 B16,B17
|| SWAP2 B17,B16
SWAP2 A16,A17
|| SWAP2 A17,A16
SWAP4 B16,B16
|| SWAP4 A16,A16
SWAP4 B17,B17
|| SWAP4 A17,A17
.endif
AND B16,B0,B20 ; r0 = key[0] & 0x0fffffff
|| AND B17,B1,B22 ; r1 = key[1] & 0x0ffffffc
|| EXTU B17,4,6,B16 ; r1>>2
AND A16,B1,B21 ; r2 = key[2] & 0x0ffffffc
|| AND A17,B1,A23 ; r3 = key[3] & 0x0ffffffc
|| BNOP RA
SHRU B21,2,B18
|| ADD B22,B16,B16 ; s1 = r1 + r1>>2
STDW B21:B20,*${CTXA}[3] ; save r2:r0
|| ADD B21,B18,B18 ; s2 = r2 + r2>>2
|| SHRU A23,2,B17
|| MV A23,B23
STDW B23:B22,*${CTXA}[4] ; save r3:r1
|| ADD B23,B17,B19 ; s3 = r3 + r3>>2
|| ADD B23,B17,B17 ; s3 = r3 + r3>>2
STDW B17:B16,*${CTXA}[5] ; save s3:s1
STDW B19:B18,*${CTXA}[6] ; save s3:s2
|| ZERO A4 ; return 0
.endasmfunc
.global _poly1305_blocks
.align 32
_poly1305_blocks:
.asmfunc stack_usage(40)
SHRU $LEN,4,A2 ; A2 is loop counter, number of blocks
[!A2] BNOP RA ; no data
|| [A2] STW FP,*SP--(40) ; save frame pointer and alloca(40)
|| [A2] MV SP,FP
[A2] STDW B13:B12,*SP[4] ; ABI says so
|| [A2] MV $CTXA,$S3b ; borrow $S3b
[A2] STDW B11:B10,*SP[3]
|| [A2] STDW A13:A12,*FP[-3]
[A2] STDW A11:A10,*FP[-4]
|| [A2] LDDW *${S3b}[0],B25:B24 ; load h1:h0
[A2] LDNW *${INPB}++[4],$D0 ; load inp[0]
[A2] LDNW *${INPB}[-3],$D1 ; load inp[1]
LDDW *${CTXA}[1],B29:B28 ; load h3:h2, B28 is h2
LDNW *${INPB}[-2],$D2 ; load inp[2]
LDNW *${INPB}[-1],$D3 ; load inp[3]
LDDW *${CTXA}[3],$R2:$R0 ; load r2:r0
|| LDDW *${S3b}[4],$R3:$R1 ; load r3:r1
|| SWAP2 $D0,$D0
LDDW *${CTXA}[5],$S3:$S1 ; load s3:s1
|| LDDW *${S3b}[6],$S3b:$S2 ; load s3:s2
|| SWAP4 $D0,$D0
|| SWAP2 $D1,$D1
ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|| ADD $D0,B24,B27 ; B-copy of h0+inp[0]
|| SWAP4 $D1,$D1
ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
|| MVK 3,$THREE
|| SWAP2 $D2,$D2
LDW *${CTXA}[4],$H4 ; load h4
|| SWAP4 $D2,$D2
|| MV B29,B30 ; B30 is h3
MV $R0,$R0b
loop?:
MPY32U $H0,$R0,A17:A16
|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
|| SWAP2 $D3,$D3
MPY32U $H0,$R2,A19:A18
|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|| ADD $D0,$H1,A24 ; A-copy of B24
|| SWAP4 $D3,$D3
|| [A2] SUB A2,1,A2 ; decrement loop counter
MPY32U A24,$S3,A21:A20 ; MPY32U $H1,$S3,A21:A20
|| MPY32U B24,$R0b,B21:B20 ; MPY32U $H1,$R0,B21:B20
|| ADDU B25,$D2:$H2,$D2:$H2 ; ADDU $D1,$D2:$H2,$D2:$H2
|| ADDU $D3,B30,$D3:$H3 ; h3+=inp[3]
|| ADD B25,$H2,B25 ; B-copy of $H2
MPY32U A24,$R1,A23:A22 ; MPY32U $H1,$R1,A23:A22
|| MPY32U B24,$R2,B23:B22 ; MPY32U $H1,$R2,B23:B22
MPY32U $H2,$S2,A25:A24
|| MPY32U B25,$S3b,B25:B24 ; MPY32U $H2,$S3,B25:B24
|| ADDU $D2,$D3:$H3,$D3:$H3
|| ADD $PADBIT,$H4,$H4 ; h4+=padbit
MPY32U $H2,$R0,A27:A26
|| MPY32U $H2,$R1,B27:B26
|| ADD $D3,$H4,$H4
|| MV $S2,$S2a
MPY32U $H3,$S1,A29:A28
|| MPY32U $H3,$S2,B29:B28
|| ADD A21,A17,A21 ; start accumulating "d3:d0"
|| ADD B21,B17,B21
|| ADDU A20,A16,A17:A16
|| ADDU B20,B16,B17:B16
|| [A2] LDNW *${INPB}++[4],$D0 ; load inp[0]
MPY32U $H3,$S3,A31:A30
|| MPY32U $H3,$R0b,B31:B30
|| ADD A23,A19,A23
|| ADD B23,B19,B23
|| ADDU A22,A18,A19:A18
|| ADDU B22,B18,B19:B18
|| [A2] LDNW *${INPB}[-3],$D1 ; load inp[1]
MPY32 $H4,$S1,B20
|| MPY32 $H4,$S2a,A20
|| ADD A25,A21,A21
|| ADD B25,B21,B21
|| ADDU A24,A17:A16,A17:A16
|| ADDU B24,B17:B16,B17:B16
|| [A2] LDNW *${INPB}[-2],$D2 ; load inp[2]
MPY32 $H4,$S3b,B22
|| ADD A27,A23,A23
|| ADD B27,B23,B23
|| ADDU A26,A19:A18,A19:A18
|| ADDU B26,B19:B18,B19:B18
|| [A2] LDNW *${INPB}[-1],$D3 ; load inp[3]
MPY32 $H4,$R0b,$H4
|| ADD A29,A21,A21 ; final hi("d0")
|| ADD B29,B21,B21 ; final hi("d1")
|| ADDU A28,A17:A16,A17:A16 ; final lo("d0")
|| ADDU B28,B17:B16,B17:B16
ADD A31,A23,A23 ; final hi("d2")
|| ADD B31,B23,B23 ; final hi("d3")
|| ADDU A30,A19:A18,A19:A18
|| ADDU B30,B19:B18,B19:B18
ADDU B20,B17:B16,B17:B16 ; final lo("d1")
|| ADDU A20,A19:A18,A19:A18 ; final lo("d2")
ADDU B22,B19:B18,B19:B18 ; final lo("d3")
|| ADD A17,A21,A21 ; "flatten" "d3:d0"
MV A19,B29 ; move to avoid cross-path stalls
ADDU A21,B17:B16,B27:B26 ; B26 is h1
ADD B21,B27,B27
|| DMV B29,A18,B29:B28 ; move to avoid cross-path stalls
ADDU B27,B29:B28,B29:B28 ; B28 is h2
|| [A2] SWAP2 $D0,$D0
ADD A23,B29,B29
|| [A2] SWAP4 $D0,$D0
ADDU B29,B19:B18,B31:B30 ; B30 is h3
ADD B23,B31,B31
|| MV A16,B24 ; B24 is h0
|| [A2] SWAP2 $D1,$D1
ADD B31,$H4,$H4
|| [A2] SWAP4 $D1,$D1
SHRU $H4,2,B16 ; last reduction step
|| AND $H4,$THREE,$H4
ADDAW B16,B16,B16 ; 5*(h4>>2)
|| [A2] BNOP loop?
ADDU B24,B16,B25:B24 ; B24 is h0
|| [A2] SWAP2 $D2,$D2
ADDU B26,B25,B27:B26 ; B26 is h1
|| [A2] SWAP4 $D2,$D2
ADDU B28,B27,B29:B28 ; B28 is h2
|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0]
ADDU B30,B29,B31:B30 ; B30 is h3
ADD B31,$H4,$H4
|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
;;===== branch to loop? is taken here
LDDW *FP[-4],A11:A10 ; ABI says so
LDDW *FP[-3],A13:A12
|| LDDW *SP[3],B11:B10
LDDW *SP[4],B13:B12
|| MV B26,B25
|| BNOP RA
LDW *++SP(40),FP ; restore frame pointer
|| MV B30,B29
STDW B25:B24,*${CTXA}[0] ; save h1:h0
STDW B29:B28,*${CTXA}[1] ; save h3:h2
STW $H4,*${CTXA}[4] ; save h4
NOP 1
.endasmfunc
___
{
# _poly1305_emit: final reduction, nonce addition and output of the tag.
# The argument registers are reused under names that match their role here:
# the 2nd argument is the MAC output pointer, the 3rd is the nonce pointer,
# and the 4th argument register serves as a B-side copy of the nonce pointer.
#
# "Compare to modulus" adds 5 to h0 and propagates the carry through
# h1, h2, h3 and h4.  h3:h2 is loaded into A19:A18, so the third step must
# add A18 (h2): its result A26 is what gets selected back into A18 when the
# sum overflows the 130-th bit.
my ($MAC,$NONCEA,$NONCEB)=($INPB,$LEN,$PADBIT);
$code.=<<___;
.global _poly1305_emit
.align 32
_poly1305_emit:
.asmfunc
LDDW *${CTXA}[0],A17:A16 ; load h1:h0
LDDW *${CTXA}[1],A19:A18 ; load h3:h2
LDW *${CTXA}[4],A20 ; load h4
MV $NONCEA,$NONCEB
MVK 5,A22 ; compare to modulus
ADDU A16,A22,A23:A22
|| LDW *${NONCEA}[0],A8
|| LDW *${NONCEB}[1],B8
ADDU A17,A23,A25:A24
|| LDW *${NONCEA}[2],A9
|| LDW *${NONCEB}[3],B9
ADDU A18,A25,A27:A26
ADDU A19,A27,A29:A28
ADD A20,A29,A29
SHRU A29,2,A2 ; check for overflow in 130-th bit
[A2] MV A22,A16 ; select
|| [A2] MV A24,A17
[A2] MV A26,A18
|| [A2] MV A28,A19
|| ADDU A8,A16,A23:A22 ; accumulate nonce
ADDU B8,A17,A25:A24
|| SWAP2 A22,A22
ADDU A23,A25:A24,A25:A24
ADDU A9,A18,A27:A26
|| SWAP2 A24,A24
ADDU A25,A27:A26,A27:A26
|| ADD B9,A19,A28
ADD A27,A28,A28
|| SWAP2 A26,A26
.if .BIG_ENDIAN
SWAP2 A28,A28
|| SWAP4 A22,A22
|| SWAP4 A24,B24
SWAP4 A26,A26
SWAP4 A28,A28
|| MV B24,A24
.endif
BNOP RA,1
STNW A22,*${MAC}[0] ; write the result
STNW A24,*${MAC}[1]
STNW A26,*${MAC}[2]
STNW A28,*${MAC}[3]
.endasmfunc
___
}
# Identification string placed in the .const section.
$code.=<<___;
.sect .const
.cstring "Poly1305 for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

# Emit the generated module.  Buffered write errors (for example a full
# disk) only surface when the handle is closed, so check the close to
# avoid leaving a truncated file behind unnoticed.
print $code;
close STDOUT or die "error closing STDOUT: $!";

View file

@ -0,0 +1,437 @@
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Poly1305 hash for MIPS64.
#
# May 2016
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc
# R1x000 5.64/+120% (big-endian)
# Octeon II 3.80/+280% (little-endian)
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
# NUBI register layout: each name maps to "$<n>" for the listed number.
($zero,$at,$t0,$t1,$t2) = map { '$' . $_ } 0 .. 2, 24, 25;
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map { '$' . $_ } 4 .. 11;
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) = map { '$' . $_ } 12 .. 23;
($gp,$tp,$sp,$fp,$ra) = map { '$' . $_ } 3, 28 .. 31;
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
# excluded from the rule, because it's specified volatile];
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
# <appro@openssl.org>
#
######################################################################
# First command-line argument selects the ABI flavour; supported flavours
# are o32,n32,64,nubi32,nubi64.  A missing/empty argument means "o32",
# which is then rejected below because this module is 64-bit only.
$flavour = shift;
$flavour = "o32" unless $flavour;
die "MIPS64 only" if ($flavour !~ /64|n32/i);

# Return-value register and callee-saved register mask depend on whether
# a NUBI flavour was requested.
if ($flavour =~ /nubi/i) {
    $v0 = $a0;
    $SAVED_REGS_MASK = "0x0003f000";
}
else {
    $v0 = $t0;
    $SAVED_REGS_MASK = "0x00030000";
}

# Function arguments.
$ctx    = $a0;
$inp    = $a1;
$len    = $a2;
$padbit = $a3;

# Scratch registers used by poly1305_init.
$in0  = $a4;
$in1  = $a5;
$tmp0 = $a6;
$tmp1 = $a7;
$tmp2 = $at;
$tmp3 = $t0;
$tmp4 = $t1;
# poly1305_init for MIPS64.
#
# Zeroes the 24 bytes of hash state at 0($ctx)..16($ctx).  If a key pointer
# is supplied, the 16-byte key is loaded (unaligned ldl/ldr pair unless
# building for MIPS64R6), byte-swapped on big-endian targets, clamped with
# 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stored as r0 at 24($ctx),
# r1 at 32($ctx) and s1 = r1 + (r1 >> 2) at 40($ctx).
# The function returns 0 in either case.
$code.=<<___;
#include "mips_arch.h"
#ifdef MIPSEB
# define MSB 0
# define LSB 7
#else
# define MSB 7
# define LSB 0
#endif
.text
.set noat
.set noreorder
.align 5
.globl poly1305_init
.ent poly1305_init
poly1305_init:
.frame $sp,0,$ra
.set reorder
sd $zero,0($ctx)
sd $zero,8($ctx)
sd $zero,16($ctx)
beqz $inp,.Lno_key
#if defined(_MIPS_ARCH_MIPS64R6)
ld $in0,0($inp)
ld $in1,8($inp)
#else
ldl $in0,0+MSB($inp)
ldl $in1,8+MSB($inp)
ldr $in0,0+LSB($inp)
ldr $in1,8+LSB($inp)
#endif
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
dsbh $in0,$in0 # byte swap
dsbh $in1,$in1
dshd $in0,$in0
dshd $in1,$in1
# else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
and $tmp1,$in0,$tmp0 # byte swap
and $tmp3,$in1,$tmp0
dsrl $tmp2,$in0,24
dsrl $tmp4,$in1,24
dsll $tmp1,24
dsll $tmp3,24
and $tmp2,$tmp0
and $tmp4,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
or $tmp3,$tmp4
and $tmp2,$in0,$tmp0
and $tmp4,$in1,$tmp0
dsrl $in0,8
dsrl $in1,8
dsll $tmp2,8
dsll $tmp4,8
and $in0,$tmp0
and $in1,$tmp0
or $tmp1,$tmp2
or $tmp3,$tmp4
or $in0,$tmp1
or $in1,$tmp3
dsrl $tmp1,$in0,32
dsrl $tmp3,$in1,32
dsll $in0,32
dsll $in1,32
or $in0,$tmp1
or $in1,$tmp3
# endif
#endif
li $tmp0,1
dsll $tmp0,32
daddiu $tmp0,-63
dsll $tmp0,28
daddiu $tmp0,-1 # 0ffffffc0fffffff
and $in0,$tmp0
daddiu $tmp0,-3 # 0ffffffc0ffffffc
and $in1,$tmp0
sd $in0,24($ctx)
dsrl $tmp0,$in1,2
sd $in1,32($ctx)
daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
sd $tmp0,40($ctx)
.Lno_key:
li $v0,0 # return 0
jr $ra
.end poly1305_init
___
# Emit poly1305_blocks and poly1305_blocks_internal.
#
# poly1305_blocks converts $len to a count of complete 16-byte blocks and
# returns immediately when that count is zero; otherwise it branches to
# poly1305_blocks_internal, which loads the hash and key from the context,
# processes one block per .Loop iteration and stores the hash back.
{
# Hash words, key words and s1 live in $s0-$s5; the product accumulators
# d0/d1 reuse the input registers and d2 uses $t2.
my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) =
($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
# Entry stub plus the part of the prologue common to all flavours: a 48-byte
# frame in which $s5 and $s4 are always saved.
$code.=<<___;
.align 5
.globl poly1305_blocks
.ent poly1305_blocks
poly1305_blocks:
.set noreorder
dsrl $len,4 # number of complete blocks
bnez $len,poly1305_blocks_internal
nop
jr $ra
nop
.end poly1305_blocks
.align 5
.ent poly1305_blocks_internal
poly1305_blocks_internal:
.frame $sp,6*8,$ra
.mask $SAVED_REGS_MASK,-8
.set noreorder
dsubu $sp,6*8
sd $s5,40($sp)
sd $s4,32($sp)
___
# $s3-$s0 are saved only for NUBI flavours (matches $SAVED_REGS_MASK).
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
sd $s3,24($sp)
sd $s2,16($sp)
sd $s1,8($sp)
sd $s0,0($sp)
___
# Main loop: load a block (byte-swapped on big-endian builds), add it and
# $padbit to the hash, multiply by the key using s1 for the wrapped terms,
# and fold the bits above 2^130 back into the low words.
$code.=<<___;
.set reorder
ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)
ld $r0,24($ctx) # load key
ld $r1,32($ctx)
ld $s1,40($ctx)
.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
ld $in0,0($inp) # load input
ld $in1,8($inp)
#else
ldl $in0,0+MSB($inp) # load input
ldl $in1,8+MSB($inp)
ldr $in0,0+LSB($inp)
ldr $in1,8+LSB($inp)
#endif
daddiu $len,-1
daddiu $inp,16
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
dsbh $in0,$in0 # byte swap
dsbh $in1,$in1
dshd $in0,$in0
dshd $in1,$in1
# else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF
and $tmp1,$in0,$tmp0 # byte swap
and $tmp3,$in1,$tmp0
dsrl $tmp2,$in0,24
dsrl $tmp4,$in1,24
dsll $tmp1,24
dsll $tmp3,24
and $tmp2,$tmp0
and $tmp4,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
or $tmp3,$tmp4
and $tmp2,$in0,$tmp0
and $tmp4,$in1,$tmp0
dsrl $in0,8
dsrl $in1,8
dsll $tmp2,8
dsll $tmp4,8
and $in0,$tmp0
and $in1,$tmp0
or $tmp1,$tmp2
or $tmp3,$tmp4
or $in0,$tmp1
or $in1,$tmp3
dsrl $tmp1,$in0,32
dsrl $tmp3,$in1,32
dsll $in0,32
dsll $in1,32
or $in0,$tmp1
or $in1,$tmp3
# endif
#endif
daddu $h0,$in0 # accumulate input
daddu $h1,$in1
sltu $tmp0,$h0,$in0
sltu $tmp1,$h1,$in1
daddu $h1,$tmp0
dmultu ($r0,$h0) # h0*r0
daddu $h2,$padbit
sltu $tmp0,$h1,$tmp0
mflo ($d0,$r0,$h0)
mfhi ($d1,$r0,$h0)
dmultu ($s1,$h1) # h1*5*r1
daddu $tmp0,$tmp1
daddu $h2,$tmp0
mflo ($tmp0,$s1,$h1)
mfhi ($tmp1,$s1,$h1)
dmultu ($r1,$h0) # h0*r1
daddu $d0,$tmp0
daddu $d1,$tmp1
mflo ($tmp2,$r1,$h0)
mfhi ($d2,$r1,$h0)
sltu $tmp0,$d0,$tmp0
daddu $d1,$tmp0
dmultu ($r0,$h1) # h1*r0
daddu $d1,$tmp2
sltu $tmp2,$d1,$tmp2
mflo ($tmp0,$r0,$h1)
mfhi ($tmp1,$r0,$h1)
daddu $d2,$tmp2
dmultu ($s1,$h2) # h2*5*r1
daddu $d1,$tmp0
daddu $d2,$tmp1
mflo ($tmp2,$s1,$h2)
dmultu ($r0,$h2) # h2*r0
sltu $tmp0,$d1,$tmp0
daddu $d2,$tmp0
mflo ($tmp3,$r0,$h2)
daddu $d1,$tmp2
daddu $d2,$tmp3
sltu $tmp2,$d1,$tmp2
daddu $d2,$tmp2
li $tmp0,-4 # final reduction
and $tmp0,$d2
dsrl $tmp1,$d2,2
andi $h2,$d2,3
daddu $tmp0,$tmp1
daddu $h0,$d0,$tmp0
sltu $tmp0,$h0,$tmp0
daddu $h1,$d1,$tmp0
sltu $tmp0,$h1,$tmp0
daddu $h2,$h2,$tmp0
bnez $len,.Loop
sd $h0,0($ctx) # store hash value
sd $h1,8($ctx)
sd $h2,16($ctx)
.set noreorder
ld $s5,40($sp) # epilogue
ld $s4,32($sp)
___
# Mirror of the prologue: restore $s3-$s0 only for NUBI flavours.
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
ld $s3,24($sp)
ld $s2,16($sp)
ld $s1,8($sp)
ld $s0,0($sp)
___
# Return; the stack pointer is restored in the branch delay slot.
$code.=<<___;
jr $ra
daddu $sp,6*8
.end poly1305_blocks_internal
___
}
# Emit poly1305_emit and the trailing identification string.
#
# poly1305_emit loads the hash from ctx+0/8/16, computes hash+5 and selects
# it instead of the hash when that addition carries into bit 2 of the top
# word, adds the 128-bit nonce (read as four 32-bit words) and writes the
# 16-byte result to $mac one byte at a time, least-significant byte first.
{
# Arguments of poly1305_emit; $mac and $nonce take the place of $inp/$len.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
$code.=<<___;
.align 5
.globl poly1305_emit
.ent poly1305_emit
poly1305_emit:
.frame $sp,0,$ra
.set reorder
ld $tmp0,0($ctx)
ld $tmp1,8($ctx)
ld $tmp2,16($ctx)
daddiu $in0,$tmp0,5 # compare to modulus
sltiu $tmp3,$in0,5
daddu $in1,$tmp1,$tmp3
sltu $tmp3,$in1,$tmp3
daddu $tmp2,$tmp2,$tmp3
dsrl $tmp2,2 # see if it carried/borrowed
dsubu $tmp2,$zero,$tmp2
nor $tmp3,$zero,$tmp2
and $in0,$tmp2
and $tmp0,$tmp3
and $in1,$tmp2
and $tmp1,$tmp3
or $in0,$tmp0
or $in1,$tmp1
lwu $tmp0,0($nonce) # load nonce
lwu $tmp1,4($nonce)
lwu $tmp2,8($nonce)
lwu $tmp3,12($nonce)
dsll $tmp1,32
dsll $tmp3,32
or $tmp0,$tmp1
or $tmp2,$tmp3
daddu $in0,$tmp0 # accumulate nonce
daddu $in1,$tmp2
sltu $tmp0,$in0,$tmp0
daddu $in1,$tmp0
dsrl $tmp0,$in0,8 # write mac value
dsrl $tmp1,$in0,16
dsrl $tmp2,$in0,24
sb $in0,0($mac)
dsrl $tmp3,$in0,32
sb $tmp0,1($mac)
dsrl $tmp0,$in0,40
sb $tmp1,2($mac)
dsrl $tmp1,$in0,48
sb $tmp2,3($mac)
dsrl $tmp2,$in0,56
sb $tmp3,4($mac)
dsrl $tmp3,$in1,8
sb $tmp0,5($mac)
dsrl $tmp0,$in1,16
sb $tmp1,6($mac)
dsrl $tmp1,$in1,24
sb $tmp2,7($mac)
sb $in1,8($mac)
dsrl $tmp2,$in1,32
sb $tmp3,9($mac)
dsrl $tmp3,$in1,40
sb $tmp0,10($mac)
dsrl $tmp0,$in1,48
sb $tmp1,11($mac)
dsrl $tmp1,$in1,56
sb $tmp2,12($mac)
sb $tmp3,13($mac)
sb $tmp0,14($mac)
sb $tmp1,15($mac)
jr $ra
.end poly1305_emit
.rdata
.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
}
# Write the generated assembly. The output file is the LAST command-line
# argument; when it is absent the code goes to the inherited STDOUT.
# Failures to open the file or to flush it are fatal, so the build cannot
# silently continue with a missing or truncated assembly file.
$output = pop;
if ($output) {
    # Two-argument form kept on purpose so existing invocations behave the same.
    open STDOUT,">$output" or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";

View file

@ -0,0 +1,645 @@
#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# -m32 -m64
#
# Freescale e300 14.8/+80% -
# PPC74x0 7.60/+60% -
# PPC970 7.00/+114% 3.51/+205%
# POWER7 3.75/+260% 1.93/+100%
# POWER8 - 2.03/+200%
# POWER9 - 2.00/+150%
#
# Do we need floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
# compiler-generated code. On the other hand it's known that floating-
# point performance can be dominated by FPU latency, which means that
# there is limit even for ideally optimized (and even vectorized) code.
# And this limit is estimated to be higher than above -m64 results. Or
# in other words floating-point implementation can be meaningful to
# consider only in 32-bit application context. We probably have to
# recognize that 32-bit builds are getting less popular on high-end
# systems and therefore tend to target embedded ones, which might not
# even have FPU...
#
# On side note, Power ISA 2.07 enables vector base 2^26 implementation,
# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
# Command line: <flavour> [<output>]. The flavour (it must contain "32" or
# "64") selects the pointer size and the matching compare/load/store
# mnemonics used for pointer tests and for the stack frame.
$flavour = shift;
if ($flavour =~ /64/) {
    $SIZE_T =8;
    $LRSAVE =2*$SIZE_T;
    $UCMP ="cmpld";
    $STU ="stdu";
    $POP ="ld";
    $PUSH ="std";
} elsif ($flavour =~ /32/) {
    $SIZE_T =4;
    $LRSAVE =$SIZE_T;
    $UCMP ="cmplw";
    $STU ="stwu";
    $POP ="lwz";
    $PUSH ="stw";
} else { die "nonsense $flavour"; }

# Define endianness based on flavour
# i.e.: linux64le
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed to STDOUT through the translator. Low-precedence
# "or" is required here: with "||" the die would attach to the command
# string (always true) and a failed open would go unnoticed.
my $output = shift;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Stack frame size for poly1305_blocks/poly1305_emit, stack pointer, and the
# argument registers r3-r6. poly1305_emit receives (ctx, mac, nonce), so
# $mac/$nonce alias $inp/$len.
$FRAME=24*$SIZE_T;
$sp="r1";
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($mac,$nonce)=($inp,$len);
my $mask = "r0";
$code=<<___;
.machine "any"
.text
___
# 64-bit flavours: the hash is kept as three 64-bit words at ctx+0/8/16 and
# the clamped key as two 64-bit words at ctx+32/40.
if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation
# r7-r12 hold the hash and the product accumulators; r27-r31 hold the key,
# s1 and temporaries and are saved/restored by poly1305_blocks.
my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
# poly1305_init_int: zero the hash; skip key setup if the key pointer is NULL.
$code.=<<___;
.globl .poly1305_init_int
.align 4
.poly1305_init_int:
xor r0,r0,r0
std r0,0($ctx) # zero hash value
std r0,8($ctx)
std r0,16($ctx)
$UCMP $inp,r0
beq- Lno_key
___
# Little-endian: the key words can be loaded directly.
$code.=<<___ if ($LITTLE_ENDIAN);
ld $d0,0($inp) # load key material
ld $d1,8($inp)
___
# Big-endian: build each 64-bit word from two byte-reversed 32-bit loads.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $h0,4
lwbrx $d0,0,$inp # load key material
li $d1,8
lwbrx $h0,$h0,$inp
li $h1,12
lwbrx $d1,$d1,$inp
lwbrx $h1,$h1,$inp
insrdi $d0,$h0,32,0
insrdi $d1,$h1,32,0
___
# Clamp and store the key, return 0; then the poly1305_blocks prologue
# (returns at once when fewer than 16 bytes are given) up to the Loop label.
$code.=<<___;
lis $h1,0xfff # 0x0fff0000
ori $h1,$h1,0xfffc # 0x0ffffffc
insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc
ori $h0,$h1,3 # 0x0ffffffc0fffffff
and $d0,$d0,$h0
and $d1,$d1,$h1
std $d0,32($ctx) # store key
std $d1,40($ctx)
Lno_key:
xor r3,r3,r3
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.size .poly1305_init_int,.-.poly1305_init_int
.globl .poly1305_blocks
.align 4
.poly1305_blocks:
srdi. $len,$len,4
beq- Labort
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
ld $r0,32($ctx) # load key
ld $r1,40($ctx)
ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)
srdi $s1,$r1,2
mtctr $len
add $s1,$s1,$r1 # s1 = r1 + r1>>2
li $mask,3
b Loop
.align 4
Loop:
___
# Per-block input load, little-endian variant.
$code.=<<___ if ($LITTLE_ENDIAN);
ld $t0,0($inp) # load input
ld $t1,8($inp)
___
# Per-block input load, big-endian variant (byte-reversed 32-bit loads).
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d0,4
lwbrx $t0,0,$inp # load input
li $t1,8
lwbrx $d0,$d0,$inp
li $d1,12
lwbrx $t1,$t1,$inp
lwbrx $d1,$d1,$inp
insrdi $t0,$d0,32,0
insrdi $t1,$d1,32,0
___
# Loop body: add the block and $padbit to the hash, multiply by the key
# (s1 covers the wrapped terms) and fold the bits above 2^130 back in.
# Followed by the epilogue and the first half of poly1305_emit, which
# selects hash+5 over hash when that addition carries into bit 2 of h2.
$code.=<<___;
addi $inp,$inp,16
addc $h0,$h0,$t0 # accumulate input
adde $h1,$h1,$t1
mulld $d0,$h0,$r0 # h0*r0
mulhdu $d1,$h0,$r0
adde $h2,$h2,$padbit
mulld $t0,$h1,$s1 # h1*5*r1
mulhdu $t1,$h1,$s1
addc $d0,$d0,$t0
adde $d1,$d1,$t1
mulld $t0,$h0,$r1 # h0*r1
mulhdu $d2,$h0,$r1
addc $d1,$d1,$t0
addze $d2,$d2
mulld $t0,$h1,$r0 # h1*r0
mulhdu $t1,$h1,$r0
addc $d1,$d1,$t0
adde $d2,$d2,$t1
mulld $t0,$h2,$s1 # h2*5*r1
mulld $t1,$h2,$r0 # h2*r0
addc $d1,$d1,$t0
adde $d2,$d2,$t1
andc $t0,$d2,$mask # final reduction step
and $h2,$d2,$mask
srdi $t1,$t0,2
add $t0,$t0,$t1
addc $h0,$d0,$t0
addze $h1,$d1
addze $h2,$h2
bdnz Loop
std $h0,0($ctx) # store hash value
std $h1,8($ctx)
std $h2,16($ctx)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
Labort:
blr
.long 0
.byte 0,12,4,1,0x80,5,4,0
.size .poly1305_blocks,.-.poly1305_blocks
.globl .poly1305_emit
.align 4
.poly1305_emit:
ld $h0,0($ctx) # load hash
ld $h1,8($ctx)
ld $h2,16($ctx)
ld $padbit,0($nonce) # load nonce
ld $nonce,8($nonce)
addic $d0,$h0,5 # compare to modulus
addze $d1,$h1
addze $d2,$h2
srdi $mask,$d2,2 # did it carry/borrow?
neg $mask,$mask
andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h1,$h1,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
or $h1,$h1,$d1
___
# Big-endian only: swap the 32-bit halves of the nonce words read with ld.
$code.=<<___ if (!$LITTLE_ENDIAN);
rotldi $padbit,$padbit,32 # flip nonce words
rotldi $nonce,$nonce,32
___
$code.=<<___;
addc $h0,$h0,$padbit # accumulate nonce
adde $h1,$h1,$nonce
___
# Little-endian: store the two result words directly.
$code.=<<___ if ($LITTLE_ENDIAN);
std $h0,0($mac) # write result
std $h1,8($mac)
___
# Big-endian: store the result as four byte-reversed 32-bit words.
$code.=<<___ if (!$LITTLE_ENDIAN);
extrdi r0,$h0,32,0
li $d0,4
stwbrx $h0,0,$mac # write result
extrdi $h0,$h1,32,0
li $d1,8
stwbrx r0,$d0,$mac
li $d2,12
stwbrx $h1,$d1,$mac
stwbrx $h0,$d2,$mac
___
$code.=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.size .poly1305_emit,.-.poly1305_emit
___
# 32-bit flavours: the hash is kept as five 32-bit words at ctx+0..16 and the
# clamped key as four 32-bit words at ctx+32..44.
} else {
###############################################################################
# base 2^32 implementation
# r7-r12 hold h0-h4 and r0 (key word); r14-r31 hold the remaining key words,
# s1-s3, temporaries and product accumulators.
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
$t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
) = map("r$_",(7..12,14..31));
# poly1305_init_int: zero the hash; skip key setup if the key pointer is NULL.
$code.=<<___;
.globl .poly1305_init_int
.align 4
.poly1305_init_int:
xor r0,r0,r0
stw r0,0($ctx) # zero hash value
stw r0,4($ctx)
stw r0,8($ctx)
stw r0,12($ctx)
stw r0,16($ctx)
$UCMP $inp,r0
beq- Lno_key
___
# Little-endian flavours: the key words are already in the required byte
# order, so plain word loads suffice. The PowerPC mnemonic is "lwz" (load
# word and zero), as used for the little-endian input loads elsewhere in
# this file; "lw" is not a PowerPC instruction and would not assemble.
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $h0,0($inp) # load key material
lwz $h1,4($inp)
lwz $h2,8($inp)
lwz $h3,12($inp)
___
# Big-endian key load: four byte-reversed 32-bit loads.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $h1,4
lwbrx $h0,0,$inp # load key material
li $h2,8
lwbrx $h1,$h1,$inp
li $h3,12
lwbrx $h2,$h2,$inp
lwbrx $h3,$h3,$inp
___
# Clamp and store the key, return 0; then the poly1305_blocks prologue
# (returns at once when fewer than 16 bytes are given, saves r14-r31,
# derives si = ri + ri>>2) up to the Loop label.
$code.=<<___;
lis $mask,0xf000 # 0xf0000000
li $r0,-4
andc $r0,$r0,$mask # 0x0ffffffc
andc $h0,$h0,$mask
and $h1,$h1,$r0
and $h2,$h2,$r0
and $h3,$h3,$r0
stw $h0,32($ctx) # store key
stw $h1,36($ctx)
stw $h2,40($ctx)
stw $h3,44($ctx)
Lno_key:
xor r3,r3,r3
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.size .poly1305_init_int,.-.poly1305_init_int
.globl .poly1305_blocks
.align 4
.poly1305_blocks:
srwi. $len,$len,4
beq- Labort
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $r0,32($ctx) # load key
lwz $r1,36($ctx)
lwz $r2,40($ctx)
lwz $r3,44($ctx)
lwz $h0,0($ctx) # load hash value
lwz $h1,4($ctx)
lwz $h2,8($ctx)
lwz $h3,12($ctx)
lwz $h4,16($ctx)
srwi $s1,$r1,2
srwi $s2,$r2,2
srwi $s3,$r3,2
add $s1,$s1,$r1 # si = ri + ri>>2
add $s2,$s2,$r2
add $s3,$s3,$r3
mtctr $len
li $mask,3
b Loop
.align 4
Loop:
___
# Per-block input load, little-endian variant.
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $d0,0($inp) # load input
lwz $d1,4($inp)
lwz $d2,8($inp)
lwz $d3,12($inp)
___
# Per-block input load, big-endian variant (byte-reversed loads).
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d1,4
lwbrx $d0,0,$inp # load input
li $d2,8
lwbrx $d1,$d1,$inp
li $d3,12
lwbrx $d2,$d2,$inp
lwbrx $d3,$d3,$inp
___
# Loop body: add the block and $padbit to the hash, form the 32x32-bit
# partial products against r0-r3 and s1-s3, and fold the bits above 2^130
# back in. Followed by the epilogue and poly1305_emit up to the point where
# the nonce has been added.
$code.=<<___;
addi $inp,$inp,16
addc $h0,$h0,$d0 # accumulate input
adde $h1,$h1,$d1
adde $h2,$h2,$d2
mullw $d0,$h0,$r0 # h0*r0
mulhwu $D0,$h0,$r0
mullw $d1,$h0,$r1 # h0*r1
mulhwu $D1,$h0,$r1
mullw $d2,$h0,$r2 # h0*r2
mulhwu $D2,$h0,$r2
adde $h3,$h3,$d3
adde $h4,$h4,$padbit
mullw $d3,$h0,$r3 # h0*r3
mulhwu $D3,$h0,$r3
mullw $t0,$h1,$s3 # h1*s3
mulhwu $t1,$h1,$s3
mullw $t2,$h1,$r0 # h1*r0
mulhwu $t3,$h1,$r0
addc $d0,$d0,$t0
adde $D0,$D0,$t1
mullw $t0,$h1,$r1 # h1*r1
mulhwu $t1,$h1,$r1
addc $d1,$d1,$t2
adde $D1,$D1,$t3
mullw $t2,$h1,$r2 # h1*r2
mulhwu $t3,$h1,$r2
addc $d2,$d2,$t0
adde $D2,$D2,$t1
mullw $t0,$h2,$s2 # h2*s2
mulhwu $t1,$h2,$s2
addc $d3,$d3,$t2
adde $D3,$D3,$t3
mullw $t2,$h2,$s3 # h2*s3
mulhwu $t3,$h2,$s3
addc $d0,$d0,$t0
adde $D0,$D0,$t1
mullw $t0,$h2,$r0 # h2*r0
mulhwu $t1,$h2,$r0
addc $d1,$d1,$t2
adde $D1,$D1,$t3
mullw $t2,$h2,$r1 # h2*r1
mulhwu $t3,$h2,$r1
addc $d2,$d2,$t0
adde $D2,$D2,$t1
mullw $t0,$h3,$s1 # h3*s1
mulhwu $t1,$h3,$s1
addc $d3,$d3,$t2
adde $D3,$D3,$t3
mullw $t2,$h3,$s2 # h3*s2
mulhwu $t3,$h3,$s2
addc $d0,$d0,$t0
adde $D0,$D0,$t1
mullw $t0,$h3,$s3 # h3*s3
mulhwu $t1,$h3,$s3
addc $d1,$d1,$t2
adde $D1,$D1,$t3
mullw $t2,$h3,$r0 # h3*r0
mulhwu $t3,$h3,$r0
addc $d2,$d2,$t0
adde $D2,$D2,$t1
mullw $t0,$h4,$s1 # h4*s1
addc $d3,$d3,$t2
adde $D3,$D3,$t3
addc $d1,$d1,$t0
mullw $t1,$h4,$s2 # h4*s2
addze $D1,$D1
addc $d2,$d2,$t1
addze $D2,$D2
mullw $t2,$h4,$s3 # h4*s3
addc $d3,$d3,$t2
addze $D3,$D3
mullw $h4,$h4,$r0 # h4*r0
addc $h1,$d1,$D0
adde $h2,$d2,$D1
adde $h3,$d3,$D2
adde $h4,$h4,$D3
andc $D0,$h4,$mask # final reduction step
and $h4,$h4,$mask
srwi $D1,$D0,2
add $D0,$D0,$D1
addc $h0,$d0,$D0
addze $h1,$h1
addze $h2,$h2
addze $h3,$h3
addze $h4,$h4
bdnz Loop
stw $h0,0($ctx) # store hash value
stw $h1,4($ctx)
stw $h2,8($ctx)
stw $h3,12($ctx)
stw $h4,16($ctx)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
Labort:
blr
.long 0
.byte 0,12,4,1,0x80,18,4,0
.size .poly1305_blocks,.-.poly1305_blocks
.globl .poly1305_emit
.align 4
.poly1305_emit:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $h0,0($ctx) # load hash
lwz $h1,4($ctx)
lwz $h2,8($ctx)
lwz $h3,12($ctx)
lwz $h4,16($ctx)
addic $d0,$h0,5 # compare to modulus
addze $d1,$h1
addze $d2,$h2
addze $d3,$h3
addze $mask,$h4
srwi $mask,$mask,2 # did it carry/borrow?
neg $mask,$mask
andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h1,$h1,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
lwz $d0,0($nonce) # load nonce
andc $h2,$h2,$mask
and $d2,$d2,$mask
or $h1,$h1,$d1
lwz $d1,4($nonce)
andc $h3,$h3,$mask
and $d3,$d3,$mask
or $h2,$h2,$d2
lwz $d2,8($nonce)
or $h3,$h3,$d3
lwz $d3,12($nonce)
addc $h0,$h0,$d0 # accumulate nonce
adde $h1,$h1,$d1
adde $h2,$h2,$d2
adde $h3,$h3,$d3
___
# Little-endian: store the four result words directly.
$code.=<<___ if ($LITTLE_ENDIAN);
stw $h0,0($mac) # write result
stw $h1,4($mac)
stw $h2,8($mac)
stw $h3,12($mac)
___
# Big-endian: store the four result words byte-reversed.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d1,4
stwbrx $h0,0,$mac # write result
li $d2,8
stwbrx $h1,$d1,$mac
li $d3,12
stwbrx $h2,$d2,$mac
stwbrx $h3,$d3,$mac
___
# poly1305_emit epilogue: restore r28-r31 and release the frame.
$code.=<<___;
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,4,3,0
.size .poly1305_emit,.-.poly1305_emit
___
}
# Append the identification string.
$code.=<<___;
.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Evaluate every `...` expression embedded in the generated text (frame
# offsets such as `$FRAME-$SIZE_T*5`) and substitute its value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the translator: close reports both write errors and a
# non-zero exit of the translator, so make either one fatal.
close STDOUT or die "error closing STDOUT: $!";

View file

@ -0,0 +1,739 @@
#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC FPU.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300 9.78/+30%
# PPC74x0 6.92/+50%
# PPC970 6.03/+80%
# POWER7 3.50/+30%
# POWER8 3.75/+10%
# Command line: <flavour> [<output>]. The flavour (it must contain "32" or
# "64") selects the pointer size and the matching compare/load/store
# mnemonics used for pointer tests and for the stack frame.
$flavour = shift;
if ($flavour =~ /64/) {
    $SIZE_T =8;
    $LRSAVE =2*$SIZE_T;
    $UCMP ="cmpld";
    $STU ="stdu";
    $POP ="ld";
    $PUSH ="std";
} elsif ($flavour =~ /32/) {
    $SIZE_T =4;
    $LRSAVE =$SIZE_T;
    $UCMP ="cmplw";
    $STU ="stwu";
    $POP ="lwz";
    $PUSH ="stw";
} else { die "nonsense $flavour"; }

# $LITTLE_ENDIAN is 4 or 0 so it can be XOR-ed into offsets to pick the
# correct 32-bit half of a double; $LWXLE is the indexed load that always
# yields a little-endian 32-bit word.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed to STDOUT through the translator. Low-precedence
# "or" is required here: with "||" the die would attach to the command
# string (always true) and a failed open would go unnoticed.
my $output = shift;
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Frame layout for poly1305_blocks_fpu: $LOCALS bytes of linkage area,
# 6*8 bytes of scratch and 18*8 bytes for saving f14-f31.
$LOCALS=6*$SIZE_T;
$FRAME=$LOCALS+6*8+18*8;

# Argument registers, integer scratch registers and the floating-point
# register allocation. $i3 shares r6 with $padbit.
my $sp="r1";
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));
my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);
# Emit poly1305_init_fpu and poly1305_blocks_fpu.
#
# poly1305_init_fpu calls LPICmeup to get the address of the constant table
# in $len, stores the biased zero hash at ctx+8*0..8*3 and, if the key
# pointer is non-NULL, clamps the four key words, turns them into doubles by
# planting them in the low halves of the 2^(52+32*i) constants, splits r0-r3
# and s1-s3 (= r*5/2^130) into hi/lo parts stored at ctx+8*4..8*17, and
# copies six constants to ctx+8*18..8*23. FPSCR is switched to the table's
# value for the computation and restored afterwards. Returns 0 in r3.
#
# poly1305_blocks_fpu returns at once when fewer than 16 bytes are given.
# Otherwise it saves f14-f31, switches FPSCR, and runs one iteration per
# block with the next block's input loaded ahead of time ("modulo-scheduled");
# the "conditional rewind" steps $inp back by 16 once the block counter
# reaches zero -- presumably so the look-ahead load never reads past the
# input. The biased hash is stored back and FPSCR restored on exit.
$code.=<<___;
.machine "any"
.text
.globl .poly1305_init_fpu
.align 6
.poly1305_init_fpu:
$STU $sp,-$LOCALS($sp) # minimal frame
mflr $padbit
$PUSH $padbit,`$LOCALS+$LRSAVE`($sp)
bl LPICmeup
xor r0,r0,r0
mtlr $padbit # restore lr
lfd $two0,8*0($len) # load constants
lfd $two32,8*1($len)
lfd $two64,8*2($len)
lfd $two96,8*3($len)
lfd $two130,8*4($len)
lfd $five_two130,8*5($len)
stfd $two0,8*0($ctx) # initial hash value, biased 0
stfd $two32,8*1($ctx)
stfd $two64,8*2($ctx)
stfd $two96,8*3($ctx)
$UCMP $inp,r0
beq- Lno_key
lfd $h3lo,8*13($len) # new fpscr
mffs $h3hi # old fpscr
stfd $two0,8*4($ctx) # key "template"
stfd $two32,8*5($ctx)
stfd $two64,8*6($ctx)
stfd $two96,8*7($ctx)
li $in1,4
li $in2,8
li $in3,12
$LWXLE $in0,0,$inp # load key
$LWXLE $in1,$in1,$inp
$LWXLE $in2,$in2,$inp
$LWXLE $in3,$in3,$inp
lis $i1,0xf000 # 0xf0000000
ori $i2,$i1,3 # 0xf0000003
andc $in0,$in0,$i1 # &=0x0fffffff
andc $in1,$in1,$i2 # &=0x0ffffffc
andc $in2,$in2,$i2
andc $in3,$in3,$i2
stw $in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx) # fill "template"
stw $in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
stw $in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
stw $in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)
mtfsf 255,$h3lo # fpscr
stfd $two0,8*18($ctx) # copy constants to context
stfd $two32,8*19($ctx)
stfd $two64,8*20($ctx)
stfd $two96,8*21($ctx)
stfd $two130,8*22($ctx)
stfd $five_two130,8*23($ctx)
lfd $h0lo,8*4($ctx) # load [biased] key
lfd $h1lo,8*5($ctx)
lfd $h2lo,8*6($ctx)
lfd $h3lo,8*7($ctx)
fsub $h0lo,$h0lo,$two0 # r0
fsub $h1lo,$h1lo,$two32 # r1
fsub $h2lo,$h2lo,$two64 # r2
fsub $h3lo,$h3lo,$two96 # r3
lfd $two0,8*6($len) # more constants
lfd $two32,8*7($len)
lfd $two64,8*8($len)
lfd $two96,8*9($len)
fmul $h1hi,$h1lo,$five_two130 # s1
fmul $h2hi,$h2lo,$five_two130 # s2
stfd $h3hi,8*15($ctx) # borrow slot for original fpscr
fmul $h3hi,$h3lo,$five_two130 # s3
fadd $h0hi,$h0lo,$two0
stfd $h1hi,8*12($ctx) # put aside for now
fadd $h1hi,$h1lo,$two32
stfd $h2hi,8*13($ctx)
fadd $h2hi,$h2lo,$two64
stfd $h3hi,8*14($ctx)
fadd $h3hi,$h3lo,$two96
fsub $h0hi,$h0hi,$two0
fsub $h1hi,$h1hi,$two32
fsub $h2hi,$h2hi,$two64
fsub $h3hi,$h3hi,$two96
lfd $two0,8*10($len) # more constants
lfd $two32,8*11($len)
lfd $two64,8*12($len)
fsub $h0lo,$h0lo,$h0hi
fsub $h1lo,$h1lo,$h1hi
fsub $h2lo,$h2lo,$h2hi
fsub $h3lo,$h3lo,$h3hi
stfd $h0hi,8*5($ctx) # r0hi
stfd $h1hi,8*7($ctx) # r1hi
stfd $h2hi,8*9($ctx) # r2hi
stfd $h3hi,8*11($ctx) # r3hi
stfd $h0lo,8*4($ctx) # r0lo
stfd $h1lo,8*6($ctx) # r1lo
stfd $h2lo,8*8($ctx) # r2lo
stfd $h3lo,8*10($ctx) # r3lo
lfd $h1lo,8*12($ctx) # s1
lfd $h2lo,8*13($ctx) # s2
lfd $h3lo,8*14($ctx) # s3
lfd $h0lo,8*15($ctx) # pull original fpscr
fadd $h1hi,$h1lo,$two0
fadd $h2hi,$h2lo,$two32
fadd $h3hi,$h3lo,$two64
fsub $h1hi,$h1hi,$two0
fsub $h2hi,$h2hi,$two32
fsub $h3hi,$h3hi,$two64
fsub $h1lo,$h1lo,$h1hi
fsub $h2lo,$h2lo,$h2hi
fsub $h3lo,$h3lo,$h3hi
stfd $h1hi,8*13($ctx) # s1hi
stfd $h2hi,8*15($ctx) # s2hi
stfd $h3hi,8*17($ctx) # s3hi
stfd $h1lo,8*12($ctx) # s1lo
stfd $h2lo,8*14($ctx) # s2lo
stfd $h3lo,8*16($ctx) # s3lo
mtfsf 255,$h0lo # restore fpscr
Lno_key:
xor r3,r3,r3
addi $sp,$sp,$LOCALS
blr
.long 0
.byte 0,12,4,1,0x80,0,2,0
.size .poly1305_init_fpu,.-.poly1305_init_fpu
.globl .poly1305_blocks_fpu
.align 4
.poly1305_blocks_fpu:
srwi. $len,$len,4
beq- Labort
$STU $sp,-$FRAME($sp)
mflr r0
stfd f14,`$FRAME-8*18`($sp)
stfd f15,`$FRAME-8*17`($sp)
stfd f16,`$FRAME-8*16`($sp)
stfd f17,`$FRAME-8*15`($sp)
stfd f18,`$FRAME-8*14`($sp)
stfd f19,`$FRAME-8*13`($sp)
stfd f20,`$FRAME-8*12`($sp)
stfd f21,`$FRAME-8*11`($sp)
stfd f22,`$FRAME-8*10`($sp)
stfd f23,`$FRAME-8*9`($sp)
stfd f24,`$FRAME-8*8`($sp)
stfd f25,`$FRAME-8*7`($sp)
stfd f26,`$FRAME-8*6`($sp)
stfd f27,`$FRAME-8*5`($sp)
stfd f28,`$FRAME-8*4`($sp)
stfd f29,`$FRAME-8*3`($sp)
stfd f30,`$FRAME-8*2`($sp)
stfd f31,`$FRAME-8*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
xor r0,r0,r0
li $in3,1
mtctr $len
neg $len,$len
stw r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
stw $in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)
lfd $two0,8*18($ctx) # load constants
lfd $two32,8*19($ctx)
lfd $two64,8*20($ctx)
lfd $two96,8*21($ctx)
lfd $two130,8*22($ctx)
lfd $five_two130,8*23($ctx)
lfd $h0lo,8*0($ctx) # load [biased] hash value
lfd $h1lo,8*1($ctx)
lfd $h2lo,8*2($ctx)
lfd $h3lo,8*3($ctx)
stfd $two0,`$LOCALS+8*0`($sp) # input "template"
oris $in3,$padbit,`(1023+52+96)<<4`
stfd $two32,`$LOCALS+8*1`($sp)
stfd $two64,`$LOCALS+8*2`($sp)
stw $in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)
li $i1,4
li $i2,8
li $i3,12
$LWXLE $in0,0,$inp # load input
$LWXLE $in1,$i1,$inp
$LWXLE $in2,$i2,$inp
$LWXLE $in3,$i3,$inp
addi $inp,$inp,16
stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
mffs $x0 # original fpscr
lfd $x1,`$LOCALS+8*4`($sp) # new fpscr
lfd $r0lo,8*4($ctx) # load key
lfd $r0hi,8*5($ctx)
lfd $r1lo,8*6($ctx)
lfd $r1hi,8*7($ctx)
lfd $r2lo,8*8($ctx)
lfd $r2hi,8*9($ctx)
lfd $r3lo,8*10($ctx)
lfd $r3hi,8*11($ctx)
lfd $s1lo,8*12($ctx)
lfd $s1hi,8*13($ctx)
lfd $s2lo,8*14($ctx)
lfd $s2hi,8*15($ctx)
lfd $s3lo,8*16($ctx)
lfd $s3hi,8*17($ctx)
stfd $x0,`$LOCALS+8*4`($sp) # save original fpscr
mtfsf 255,$x1
addic $len,$len,1
addze r0,r0
slwi. r0,r0,4
sub $inp,$inp,r0 # conditional rewind
lfd $x0,`$LOCALS+8*0`($sp)
lfd $x1,`$LOCALS+8*1`($sp)
lfd $x2,`$LOCALS+8*2`($sp)
lfd $x3,`$LOCALS+8*3`($sp)
fsub $h0lo,$h0lo,$two0 # de-bias hash value
$LWXLE $in0,0,$inp # modulo-scheduled input load
fsub $h1lo,$h1lo,$two32
$LWXLE $in1,$i1,$inp
fsub $h2lo,$h2lo,$two64
$LWXLE $in2,$i2,$inp
fsub $h3lo,$h3lo,$two96
$LWXLE $in3,$i3,$inp
fsub $x0,$x0,$two0 # de-bias input
addi $inp,$inp,16
fsub $x1,$x1,$two32
fsub $x2,$x2,$two64
fsub $x3,$x3,$two96
fadd $x0,$x0,$h0lo # accumulate input
stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
fadd $x1,$x1,$h1lo
stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
fadd $x2,$x2,$h2lo
stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
fadd $x3,$x3,$h3lo
stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
b Lentry
.align 4
Loop:
fsub $y0,$y0,$two0 # de-bias input
addic $len,$len,1
fsub $y1,$y1,$two32
addze r0,r0
fsub $y2,$y2,$two64
slwi. r0,r0,4
fsub $y3,$y3,$two96
sub $inp,$inp,r0 # conditional rewind
fadd $h0lo,$h0lo,$y0 # accumulate input
fadd $h0hi,$h0hi,$y1
fadd $h2lo,$h2lo,$y2
fadd $h2hi,$h2hi,$y3
######################################### base 2^48 -> base 2^32
fadd $c1lo,$h1lo,$two64
$LWXLE $in0,0,$inp # modulo-scheduled input load
fadd $c1hi,$h1hi,$two64
$LWXLE $in1,$i1,$inp
fadd $c3lo,$h3lo,$two130
$LWXLE $in2,$i2,$inp
fadd $c3hi,$h3hi,$two130
$LWXLE $in3,$i3,$inp
fadd $c0lo,$h0lo,$two32
addi $inp,$inp,16
fadd $c0hi,$h0hi,$two32
fadd $c2lo,$h2lo,$two96
fadd $c2hi,$h2hi,$two96
fsub $c1lo,$c1lo,$two64
stw $in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp) # fill "template"
fsub $c1hi,$c1hi,$two64
stw $in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
fsub $c3lo,$c3lo,$two130
stw $in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
fsub $c3hi,$c3hi,$two130
stw $in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
fsub $c0lo,$c0lo,$two32
fsub $c0hi,$c0hi,$two32
fsub $c2lo,$c2lo,$two96
fsub $c2hi,$c2hi,$two96
fsub $h1lo,$h1lo,$c1lo
fsub $h1hi,$h1hi,$c1hi
fsub $h3lo,$h3lo,$c3lo
fsub $h3hi,$h3hi,$c3hi
fsub $h2lo,$h2lo,$c2lo
fsub $h2hi,$h2hi,$c2hi
fsub $h0lo,$h0lo,$c0lo
fsub $h0hi,$h0hi,$c0hi
fadd $h1lo,$h1lo,$c0lo
fadd $h1hi,$h1hi,$c0hi
fadd $h3lo,$h3lo,$c2lo
fadd $h3hi,$h3hi,$c2hi
fadd $h2lo,$h2lo,$c1lo
fadd $h2hi,$h2hi,$c1hi
fmadd $h0lo,$c3lo,$five_two130,$h0lo
fmadd $h0hi,$c3hi,$five_two130,$h0hi
fadd $x1,$h1lo,$h1hi
lfd $s1lo,8*12($ctx) # reload constants
fadd $x3,$h3lo,$h3hi
lfd $s1hi,8*13($ctx)
fadd $x2,$h2lo,$h2hi
lfd $r3lo,8*10($ctx)
fadd $x0,$h0lo,$h0hi
lfd $r3hi,8*11($ctx)
Lentry:
fmul $h0lo,$s3lo,$x1
fmul $h0hi,$s3hi,$x1
fmul $h2lo,$r1lo,$x1
fmul $h2hi,$r1hi,$x1
fmul $h1lo,$r0lo,$x1
fmul $h1hi,$r0hi,$x1
fmul $h3lo,$r2lo,$x1
fmul $h3hi,$r2hi,$x1
fmadd $h0lo,$s1lo,$x3,$h0lo
fmadd $h0hi,$s1hi,$x3,$h0hi
fmadd $h2lo,$s3lo,$x3,$h2lo
fmadd $h2hi,$s3hi,$x3,$h2hi
fmadd $h1lo,$s2lo,$x3,$h1lo
fmadd $h1hi,$s2hi,$x3,$h1hi
fmadd $h3lo,$r0lo,$x3,$h3lo
fmadd $h3hi,$r0hi,$x3,$h3hi
fmadd $h0lo,$s2lo,$x2,$h0lo
fmadd $h0hi,$s2hi,$x2,$h0hi
fmadd $h2lo,$r0lo,$x2,$h2lo
fmadd $h2hi,$r0hi,$x2,$h2hi
fmadd $h1lo,$s3lo,$x2,$h1lo
fmadd $h1hi,$s3hi,$x2,$h1hi
fmadd $h3lo,$r1lo,$x2,$h3lo
fmadd $h3hi,$r1hi,$x2,$h3hi
fmadd $h0lo,$r0lo,$x0,$h0lo
lfd $y0,`$LOCALS+8*0`($sp) # load [biased] input
fmadd $h0hi,$r0hi,$x0,$h0hi
lfd $y1,`$LOCALS+8*1`($sp)
fmadd $h2lo,$r2lo,$x0,$h2lo
lfd $y2,`$LOCALS+8*2`($sp)
fmadd $h2hi,$r2hi,$x0,$h2hi
lfd $y3,`$LOCALS+8*3`($sp)
fmadd $h1lo,$r1lo,$x0,$h1lo
fmadd $h1hi,$r1hi,$x0,$h1hi
fmadd $h3lo,$r3lo,$x0,$h3lo
fmadd $h3hi,$r3hi,$x0,$h3hi
bdnz Loop
######################################### base 2^48 -> base 2^32
fadd $c0lo,$h0lo,$two32
fadd $c0hi,$h0hi,$two32
fadd $c2lo,$h2lo,$two96
fadd $c2hi,$h2hi,$two96
fadd $c1lo,$h1lo,$two64
fadd $c1hi,$h1hi,$two64
fadd $c3lo,$h3lo,$two130
fadd $c3hi,$h3hi,$two130
fsub $c0lo,$c0lo,$two32
fsub $c0hi,$c0hi,$two32
fsub $c2lo,$c2lo,$two96
fsub $c2hi,$c2hi,$two96
fsub $c1lo,$c1lo,$two64
fsub $c1hi,$c1hi,$two64
fsub $c3lo,$c3lo,$two130
fsub $c3hi,$c3hi,$two130
fsub $h1lo,$h1lo,$c1lo
fsub $h1hi,$h1hi,$c1hi
fsub $h3lo,$h3lo,$c3lo
fsub $h3hi,$h3hi,$c3hi
fsub $h2lo,$h2lo,$c2lo
fsub $h2hi,$h2hi,$c2hi
fsub $h0lo,$h0lo,$c0lo
fsub $h0hi,$h0hi,$c0hi
fadd $h1lo,$h1lo,$c0lo
fadd $h1hi,$h1hi,$c0hi
fadd $h3lo,$h3lo,$c2lo
fadd $h3hi,$h3hi,$c2hi
fadd $h2lo,$h2lo,$c1lo
fadd $h2hi,$h2hi,$c1hi
fmadd $h0lo,$c3lo,$five_two130,$h0lo
fmadd $h0hi,$c3hi,$five_two130,$h0hi
fadd $x1,$h1lo,$h1hi
fadd $x3,$h3lo,$h3hi
fadd $x2,$h2lo,$h2hi
fadd $x0,$h0lo,$h0hi
lfd $h0lo,`$LOCALS+8*4`($sp) # pull saved fpscr
fadd $x1,$x1,$two32 # bias
fadd $x3,$x3,$two96
fadd $x2,$x2,$two64
fadd $x0,$x0,$two0
stfd $x1,8*1($ctx) # store [biased] hash value
stfd $x3,8*3($ctx)
stfd $x2,8*2($ctx)
stfd $x0,8*0($ctx)
mtfsf 255,$h0lo # restore original fpscr
lfd f14,`$FRAME-8*18`($sp)
lfd f15,`$FRAME-8*17`($sp)
lfd f16,`$FRAME-8*16`($sp)
lfd f17,`$FRAME-8*15`($sp)
lfd f18,`$FRAME-8*14`($sp)
lfd f19,`$FRAME-8*13`($sp)
lfd f20,`$FRAME-8*12`($sp)
lfd f21,`$FRAME-8*11`($sp)
lfd f22,`$FRAME-8*10`($sp)
lfd f23,`$FRAME-8*9`($sp)
lfd f24,`$FRAME-8*8`($sp)
lfd f25,`$FRAME-8*7`($sp)
lfd f26,`$FRAME-8*6`($sp)
lfd f27,`$FRAME-8*5`($sp)
lfd f28,`$FRAME-8*4`($sp)
lfd f29,`$FRAME-8*3`($sp)
lfd f30,`$FRAME-8*2`($sp)
lfd f31,`$FRAME-8*1`($sp)
addi $sp,$sp,$FRAME
Labort:
blr
.long 0
.byte 0,12,4,1,0x80,0,4,0
.size .poly1305_blocks_fpu,.-.poly1305_blocks_fpu
___
# Emit poly1305_emit_fpu, which works on the integer view of the biased
# doubles stored in the context: it strips the exponent bits from the high
# words, recombines the four limbs into a base 2^32 value, performs the
# final comparison against the modulus, adds the nonce and writes the
# 16-byte tag.
{
# Arguments are (ctx, mac, nonce); integer registers only. This block uses
# its own, smaller frame that just holds the saved r28-r31.
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
) = map("r$_",(7..11,28..31));
my $mask = "r0";
my $FRAME = (6+4)*$SIZE_T;
# Prologue, load of both 32-bit halves of each hash double, exponent
# masking and reduction of the bits above 2^130 held in the top limb.
$code.=<<___;
.globl .poly1305_emit_fpu
.align 4
.poly1305_emit_fpu:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx) # load hash
lwz $h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
lwz $d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
lwz $h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
lwz $d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
lwz $h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
lwz $d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
lwz $h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)
lis $mask,0xfff0
andc $d0,$d0,$mask # mask exponent
andc $d1,$d1,$mask
andc $d2,$d2,$mask
andc $d3,$d3,$mask # can be partially reduced...
li $mask,3
srwi $padbit,$d3,2 # ... so reduce
and $h4,$d3,$mask
andc $d3,$d3,$mask
add $d3,$d3,$padbit
___
# 32-bit build: carry-chained 32-bit arithmetic throughout.
if ($SIZE_T==4) {
$code.=<<___;
addc $h0,$h0,$d3
adde $h1,$h1,$d0
adde $h2,$h2,$d1
adde $h3,$h3,$d2
addze $h4,$h4
addic $d0,$h0,5 # compare to modulus
addze $d1,$h1
addze $d2,$h2
addze $d3,$h3
addze $mask,$h4
srwi $mask,$mask,2 # did it carry/borrow?
neg $mask,$mask
srawi $mask,$mask,31 # mask
andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h1,$h1,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
lwz $d0,0($nonce) # load nonce
andc $h2,$h2,$mask
and $d2,$d2,$mask
or $h1,$h1,$d1
lwz $d1,4($nonce)
andc $h3,$h3,$mask
and $d3,$d3,$mask
or $h2,$h2,$d2
lwz $d2,8($nonce)
or $h3,$h3,$d3
lwz $d3,12($nonce)
addc $h0,$h0,$d0 # accumulate nonce
adde $h1,$h1,$d1
adde $h2,$h2,$d2
adde $h3,$h3,$d3
___
# 64-bit build: propagate carries with 64-bit adds and shifts, pack the
# limbs into two 64-bit words, then split them again for the word stores.
} else {
$code.=<<___;
add $h0,$h0,$d3
add $h1,$h1,$d0
add $h2,$h2,$d1
add $h3,$h3,$d2
srdi $d0,$h0,32
add $h1,$h1,$d0
srdi $d1,$h1,32
add $h2,$h2,$d1
srdi $d2,$h2,32
add $h3,$h3,$d2
srdi $d3,$h3,32
add $h4,$h4,$d3
insrdi $h0,$h1,32,0
insrdi $h2,$h3,32,0
addic $d0,$h0,5 # compare to modulus
addze $d1,$h2
addze $d2,$h4
srdi $mask,$d2,2 # did it carry/borrow?
neg $mask,$mask
sradi $mask,$mask,63 # mask
ld $d2,0($nonce) # load nonce
ld $d3,8($nonce)
andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h2,$h2,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
or $h2,$h2,$d1
___
# Big-endian only: swap the 32-bit halves of the nonce words read with ld.
$code.=<<___ if (!$LITTLE_ENDIAN);
rotldi $d2,$d2,32 # flip nonce words
rotldi $d3,$d3,32
___
$code.=<<___;
addc $h0,$h0,$d2 # accumulate nonce
adde $h2,$h2,$d3
srdi $h1,$h0,32
srdi $h3,$h2,32
___
}
# Little-endian: store the four result words directly.
$code.=<<___ if ($LITTLE_ENDIAN);
stw $h0,0($mac) # write result
stw $h1,4($mac)
stw $h2,8($mac)
stw $h3,12($mac)
___
# Big-endian: store the four result words byte-reversed.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d1,4
stwbrx $h0,0,$mac # write result
li $d2,8
stwbrx $h1,$d1,$mac
li $d3,12
stwbrx $h2,$d2,$mac
stwbrx $h3,$d3,$mac
___
# Epilogue: restore r28-r31 and release the frame.
$code.=<<___;
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,4,3,0
.size .poly1305_emit_fpu,.-.poly1305_emit_fpu
___
}
# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# LPICmeup returns, in $len, the address of the constant table that follows
# it: "bcl 20,31,\$+4" / "mflr" captures the address of the mflr instruction
# and `64-8` is added to reach the first .quad (the code is aligned to 64
# bytes and padded with .space up to that boundary). The caller's link
# register value is preserved through r0.
#
# Table layout, as indexed by poly1305_init_fpu: entries 0-5 are the bias
# constants and 5/2^130, entries 6-12 are further powers of two, and entry
# 13 is the FPSCR image loaded for the computation.
$code.=<<___;
.align 6
LPICmeup:
mflr r0
bcl 20,31,\$+4
mflr $len # vvvvvv "distance" between . and 1st data entry
addi $len,$len,`64-8` # borrow $len
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
.quad 0x4330000000000000 # 2^(52+0)
.quad 0x4530000000000000 # 2^(52+32)
.quad 0x4730000000000000 # 2^(52+64)
.quad 0x4930000000000000 # 2^(52+96)
.quad 0x4b50000000000000 # 2^(52+130)
.quad 0x37f4000000000000 # 5/2^130
.quad 0x4430000000000000 # 2^(52+16+0)
.quad 0x4630000000000000 # 2^(52+16+32)
.quad 0x4830000000000000 # 2^(52+16+64)
.quad 0x4a30000000000000 # 2^(52+16+96)
.quad 0x3e30000000000000 # 2^(52+16+0-96)
.quad 0x4030000000000000 # 2^(52+16+32-96)
.quad 0x4230000000000000 # 2^(52+16+64-96)
.quad 0x0000000000000001 # fpscr: truncate, no exceptions
.asciz "Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Evaluate every `...` expression embedded in the generated text (frame and
# context offsets) and substitute its value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to the translator: close reports both write errors and a
# non-zero exit of the translator, so make either one fatal.
close STDOUT or die "error closing STDOUT: $!";

View file

@ -0,0 +1,227 @@
#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for s390x.
#
# June 2015
#
# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compiler improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
# On side note, z13 enables vector base 2^26 implementation...
# First argument selects the ABI flavour: anything matching /3[12]/ is the
# 32-bit (31-bit addressing) build, everything else is the 64-bit build.
# $SIZE_T is the size of a pointer/stack slot and $g the mnemonic suffix
# used for pointer-sized instructions (e.g. "stm" vs "stmg").
$flavour = shift;
if ($flavour =~ /3[12]/) {
    $SIZE_T=4;
    $g="";
} else {
    $SIZE_T=8;
    $g="g";
}

# Skip remaining arguments until one looks like a file name with an
# extension; that one is the output file.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually supplied, and fail
# loudly if it cannot be created instead of carrying on with the old handle.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

$sp="%r15";

# Argument registers of the generated functions, in call order.
my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
# poly1305_init($ctx = %r2, $inp = %r3)
#
# Zeroes the three 64-bit hash words at 0/8/16($ctx).  If $inp (the key
# pointer) is non-zero, its first 16 bytes are loaded as two little-endian
# 64-bit words, clamped with 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc
# respectively, and stored at 32/40($ctx).  %r2 is always set to 0 on
# return.
$code.=<<___;
.text
.globl poly1305_init
.type poly1305_init,\@function
.align 16
poly1305_init:
lghi %r0,0
lghi %r1,-1
stg %r0,0($ctx) # zero hash value
stg %r0,8($ctx)
stg %r0,16($ctx)
cl${g}r $inp,%r0
je .Lno_key
lrvg %r4,0($inp) # load little-endian key
lrvg %r5,8($inp)
nihl %r1,0xffc0 # 0xffffffc0ffffffff
srlg %r0,%r1,4 # 0x0ffffffc0fffffff
srlg %r1,%r1,4
nill %r1,0xfffc # 0x0ffffffc0ffffffc
ngr %r4,%r0
ngr %r5,%r1
stg %r4,32($ctx)
stg %r5,40($ctx)
.Lno_key:
lghi %r2,0
br %r14
.size poly1305_init,.-poly1305_init
___
# poly1305_blocks($ctx = %r2, $inp = %r3, $len = %r4, $padbit = %r5)
#
# Processes $len/16 blocks of 16 bytes; returns immediately when that
# count is zero.  The hash h0..h2 is loaded from 0/8/16($ctx) and the key
# r0,r1 from 32/40($ctx).  Per iteration one little-endian block and
# $padbit are added to the hash, the sum is multiplied by r, and everything
# above bit 130 is folded back in multiplied by 5.
#
# Register notes:
#  - $s1 shares %r2 with $ctx, which is why $ctx is spilled to the stack
#    before the loop and reloaded after it.
#  - the (hi,lo) names below are allocated as consecutive registers so the
#    128-bit mlgr products land in $d0hi:$d0lo, $d1hi:$d1lo, $t0:$h0 and
#    $t1:$h1, as annotated on the mlgr lines.
#  - $h2 is %r14, the return-address register; it is covered by the
#    stm/lm of %r6..%r14.
#  - "srl${g} $len,4" is written with two operands; in the 64-bit build the
#    srlg substitution applied to $code at the end of this script expands
#    it to the three-operand form.
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));
$code.=<<___;
.globl poly1305_blocks
.type poly1305_blocks,\@function
.align 16
poly1305_blocks:
srl${g} $len,4 # fixed-up in 64-bit build
lghi %r0,0
cl${g}r $len,%r0
je .Lno_data
stm${g} %r6,%r14,`6*$SIZE_T`($sp)
llgfr $padbit,$padbit # clear upper half, much needed with
# non-64-bit ABI
lg $r0,32($ctx) # load key
lg $r1,40($ctx)
lg $h0,0($ctx) # load hash value
lg $h1,8($ctx)
lg $h2,16($ctx)
st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx
srlg $s1,$r1,2
algr $s1,$r1 # s1 = r1 + r1>>2
j .Loop
.align 16
.Loop:
lrvg $d0lo,0($inp) # load little-endian input
lrvg $d1lo,8($inp)
la $inp,16($inp)
algr $d0lo,$h0 # accumulate input
alcgr $d1lo,$h1
lgr $h0,$d0lo
mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo
lgr $h1,$d1lo
mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo
mlgr $t0,$r1 # h0*r1 -> $t0:$h0
mlgr $t1,$r0 # h1*r0 -> $t1:$h1
alcgr $h2,$padbit
algr $d0lo,$d1lo
lgr $d1lo,$h2
alcgr $d0hi,$d1hi
lghi $d1hi,0
algr $h1,$h0
alcgr $t1,$t0
msgr $d1lo,$s1 # h2*s1
msgr $h2,$r0 # h2*r0
algr $h1,$d1lo
alcgr $t1,$d1hi # $d1hi is zero
algr $h1,$d0hi
alcgr $h2,$t1
lghi $h0,-4 # final reduction step
ngr $h0,$h2
srlg $t0,$h2,2
algr $h0,$t0
lghi $t1,3
ngr $h2,$t1
algr $h0,$d0lo
alcgr $h1,$d1hi # $d1hi is still zero
alcgr $h2,$d1hi # $d1hi is still zero
brct$g $len,.Loop
l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx
stg $h0,0($ctx) # store hash value
stg $h1,8($ctx)
stg $h2,16($ctx)
lm${g} %r6,%r14,`6*$SIZE_T`($sp)
.Lno_data:
br %r14
.size poly1305_blocks,.-poly1305_blocks
___
}
# poly1305_emit($ctx = %r2, $mac = %r3, $nonce = %r4)
#
# Loads the hash from 0/8/16($ctx), adds 5 and tests whether the sum
# carried past bit 130.  Using masks rather than branches, it keeps h+5
# when there was such a carry and the original h otherwise.  The two
# 64-bit nonce words are then loaded, each rotated by 32 bits, and added
# with carry; the low 128 bits are stored byte-reversed (little-endian)
# to $mac.
#
# $mac/$nonce alias $inp/$len, and $ctx (%r2) is reused as scratch for the
# second nonce word once the hash has been loaded.  %r6..%r9 are saved and
# restored around the body.
{
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
$code.=<<___;
.globl poly1305_emit
.type poly1305_emit,\@function
.align 16
poly1305_emit:
stm${g} %r6,%r9,`6*$SIZE_T`($sp)
lg $h0,0($ctx)
lg $h1,8($ctx)
lg $h2,16($ctx)
lghi %r0,5
lghi %r1,0
lgr $d0,$h0
lgr $d1,$h1
algr $h0,%r0 # compare to modulus
alcgr $h1,%r1
alcgr $h2,%r1
srlg $h2,$h2,2 # did it borrow/carry?
slgr %r1,$h2 # 0-$h2>>2
lg $h2,0($nonce) # load nonce
lghi %r0,-1
lg $ctx,8($nonce)
xgr %r0,%r1 # ~%r1
ngr $h0,%r1
ngr $d0,%r0
ngr $h1,%r1
ngr $d1,%r0
ogr $h0,$d0
rllg $d0,$h2,32 # flip nonce words
ogr $h1,$d1
rllg $d1,$ctx,32
algr $h0,$d0 # accumulate nonce
alcgr $h1,$d1
strvg $h0,0($mac) # write little-endian result
strvg $h1,8($mac)
lm${g} %r6,%r9,`6*$SIZE_T`($sp)
br %r14
.size poly1305_emit,.-poly1305_emit
.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
}
# Post-process the generated text before printing it:
#  1. evaluate every `...` expression embedded in the assembly;
#  2. expand a two-operand "srlg %rN,count" into the three-operand form by
#     repeating the register ("srlg %rN,%rN,count").
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;
print $code;
# Buffered write errors only surface at close time; abort rather than
# silently leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,24 @@
LIBS=../../libcrypto
SOURCE[../../libcrypto]=\
poly1305_pmeth.c \
poly1305_ameth.c \
poly1305.c {- $target{poly1305_asm_src} -}
GENERATE[poly1305-sparcv9.S]=asm/poly1305-sparcv9.pl $(PERLASM_SCHEME)
INCLUDE[poly1305-sparcv9.o]=..
GENERATE[poly1305-x86.s]=asm/poly1305-x86.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
GENERATE[poly1305-x86_64.s]=asm/poly1305-x86_64.pl $(PERLASM_SCHEME)
GENERATE[poly1305-ppc.s]=asm/poly1305-ppc.pl $(PERLASM_SCHEME)
GENERATE[poly1305-ppcfp.s]=asm/poly1305-ppcfp.pl $(PERLASM_SCHEME)
GENERATE[poly1305-armv4.S]=asm/poly1305-armv4.pl $(PERLASM_SCHEME)
INCLUDE[poly1305-armv4.o]=..
GENERATE[poly1305-armv8.S]=asm/poly1305-armv8.pl $(PERLASM_SCHEME)
INCLUDE[poly1305-armv8.o]=..
GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME)
INCLUDE[poly1305-mips.o]=..
BEGINRAW[Makefile(unix)]
{- $builddir -}/poly1305-%.S: {- $sourcedir -}/asm/poly1305-%.pl
CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
ENDRAW[Makefile(unix)]

View file

@ -0,0 +1,531 @@
/*
* Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdlib.h>
#include <string.h>
#include <openssl/crypto.h>
#include "internal/poly1305.h"
#include "poly1305_local.h"
/* Report the size in bytes of struct poly1305_context. */
size_t Poly1305_ctx_size(void)
{
return sizeof(struct poly1305_context);
}
/* Assemble a 32-bit unsigned value from four bytes stored low byte first. */
static unsigned int U8TOU32(const unsigned char *p)
{
    unsigned int word = 0;
    int idx;

    /* Start with the most significant byte; each step shifts in 8 more bits. */
    for (idx = 3; idx >= 0; idx--)
        word = (word << 8) | (unsigned int)(p[idx] & 0xff);

    return word;
}
/*
* Implementations can be classified by amount of significant bits in
* words making up the multi-precision value, or in other words radix
* or base of numerical representation, e.g. base 2^64, base 2^32,
* base 2^26. Complementary characteristic is how wide is the result of
* multiplication of pair of digits, e.g. it would take 128 bits to
* accommodate multiplication result in base 2^64 case. These are used
* interchangeably. To describe implementation that is. But interface
* is designed to isolate this so that low-level primitives implemented
* in assembly can be self-contained/self-coherent.
*/
#ifndef POLY1305_ASM
/*
* Even though there is __int128 reference implementation targeting
* 64-bit platforms provided below, it's not obvious that it's optimal
* choice for every one of them. Depending on instruction set overall
* amount of instructions can be comparable to one in __int64
* implementation. Amount of multiplication instructions would be lower,
* but not necessarily overall. And in out-of-order execution context,
* it is the latter that can be crucial...
*
* On related note. Poly1305 author, D. J. Bernstein, discusses and
* provides floating-point implementations of the algorithm in question.
* It made a lot of sense by the time of introduction, because most
* then-modern processors didn't have pipelined integer multiplier.
* [Not to mention that some had non-constant timing for integer
* multiplications.] Floating-point instructions on the other hand could
* be issued every cycle, which allowed to achieve better performance.
* Nowadays, with SIMD and/or out-of-order execution, shared or
* even emulated FPU, it's more complicated, and floating-point
* implementation is not necessarily optimal choice in every situation,
* rather contrary...
*
* <appro@openssl.org>
*/
typedef unsigned int u32;
/*
* poly1305_blocks processes a multiple of POLY1305_BLOCK_SIZE blocks
* of |inp| no longer than |len|. Behaviour for |len| not divisible by
* block size is unspecified in general case, even though in reference
* implementation the trailing chunk is simply ignored. Per algorithm
* specification, every input block, complete or last partial, is to be
* padded with a bit past most significant byte. The latter kind is then
* padded with zeros till block size. This last partial block padding
* is caller(*)'s responsibility, and because of this the last partial
* block is always processed with separate call with |len| set to
* POLY1305_BLOCK_SIZE and |padbit| to 0. In all other cases |padbit|
* should be set to 1 to perform implicit padding with 128th bit.
* poly1305_blocks does not actually check for this constraint though,
* it's caller(*)'s responsibility to comply.
*
* (*) In the context "caller" is not application code, but higher
* level Poly1305_* from this very module, so that quirks are
* handled locally.
*/
static void
poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit);
/*
 * Type-agnostic "rip-off" from constant_time_locl.h
 *
 * Evaluates, without branching, to 1 if unsigned |a| is less than unsigned
 * |b| and to 0 otherwise.  After "a += b" this is exactly the carry out of
 * the addition.  Both arguments are expected to have the same unsigned
 * type, at least as wide as int; |a| and |b| are evaluated more than once.
 * Arguments are parenthesized so that expressions can be passed safely.
 */
# define CONSTANT_TIME_CARRY(a,b) ( \
         ((a) ^ (((a) ^ (b)) | (((a) - (b)) ^ (b)))) >> (sizeof(a) * 8 - 1) \
         )
# if (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) && \
(defined(__SIZEOF_LONG__) && __SIZEOF_LONG__==8)
typedef unsigned long u64;
typedef __uint128_t u128;
typedef struct {
u64 h[3];
u64 r[2];
} poly1305_internal;
/* Assemble a 64-bit unsigned value from eight bytes stored low byte first. */
static u64 U8TOU64(const unsigned char *p)
{
    u64 word = 0;
    int idx;

    /* Start with the most significant byte; each step shifts in 8 more bits. */
    for (idx = 7; idx >= 0; idx--)
        word = (word << 8) | (u64)(p[idx] & 0xff);

    return word;
}
/* Serialise a 64-bit unsigned value into eight bytes, low byte first. */
static void U64TO8(unsigned char *p, u64 v)
{
    int idx;

    for (idx = 0; idx < 8; idx++) {
        p[idx] = (unsigned char)(v & 0xff);
        v >>= 8;
    }
}
/*
 * Reference key setup, base 2^64 flavour: zero the accumulator h and store
 * the first 16 key bytes as the clamped multiplier r in two 64-bit words.
 * |ctx| is treated as a poly1305_internal.
 */
static void poly1305_init(void *ctx, const unsigned char key[16])
{
poly1305_internal *st = (poly1305_internal *) ctx;
/* h = 0 */
st->h[0] = 0;
st->h[1] = 0;
st->h[2] = 0;
/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
st->r[0] = U8TOU64(&key[0]) & 0x0ffffffc0fffffff;
st->r[1] = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc;
}
/*
 * Reference block function, base 2^64 flavour.  For every complete
 * POLY1305_BLOCK_SIZE chunk of |inp| it adds the chunk (plus |padbit| as
 * the 129th bit) to the accumulator, multiplies by r and partially
 * reduces.  A trailing chunk shorter than a block is ignored.  The
 * accumulator is read from and written back to |ctx|.
 */
static void
poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
{
poly1305_internal *st = (poly1305_internal *)ctx;
u64 r0, r1;
u64 s1;
u64 h0, h1, h2, c;
u128 d0, d1;
r0 = st->r[0];
r1 = st->r[1];
/* s1 = r1 * 5/4; used for product terms that wrap around 2^130 */
s1 = r1 + (r1 >> 2);
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
while (len >= POLY1305_BLOCK_SIZE) {
/* h += m[i] */
h0 = (u64)(d0 = (u128)h0 + U8TOU64(inp + 0));
h1 = (u64)(d1 = (u128)h1 + (d0 >> 64) + U8TOU64(inp + 8));
/*
* padbit can be zero only when original len was
* POLY1305_BLOCK_SIZE, but we don't check
*/
h2 += (u64)(d1 >> 64) + padbit;
/* h *= r "%" p, where "%" stands for "partial remainder" */
d0 = ((u128)h0 * r0) +
((u128)h1 * s1);
d1 = ((u128)h0 * r1) +
((u128)h1 * r0) +
(h2 * s1);
h2 = (h2 * r0);
/* last reduction step: */
/* a) h2:h0 = h2<<128 + d1<<64 + d0 */
h0 = (u64)d0;
h1 = (u64)(d1 += d0 >> 64);
h2 += (u64)(d1 >> 64);
/* b) (h2:h0 += (h2:h0>>130) * 5) %= 2^130 */
/* c = (h2>>2) + 4*(h2>>2) = 5*(h2>>2) */
c = (h2 >> 2) + (h2 & ~3UL);
h2 &= 3;
h0 += c;
/* propagate the carries of the two additions without branching */
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
h2 += CONSTANT_TIME_CARRY(h1,c);
/*
* Occasional overflows to 3rd bit of h2 are taken care of
* "naturally". If after this point we end up at the top of
* this loop, then the overflow bit will be accounted for
* in next iteration. If we end up in poly1305_emit, then
* comparison to modulus below will still count as "carry
* into 131st bit", so that properly reduced value will be
* picked in conditional move.
*/
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
}
st->h[0] = h0;
st->h[1] = h1;
st->h[2] = h2;
}
/*
 * Reference finalisation, base 2^64 flavour: fully reduce the accumulator
 * held in |ctx|, add the 128-bit nonce (four 32-bit words, word 0 least
 * significant) modulo 2^128 and write the 16-byte tag to |mac| in
 * little-endian order.  The selection between h and h+5 is done with a
 * mask rather than a branch.
 */
static void poly1305_emit(void *ctx, unsigned char mac[16],
const u32 nonce[4])
{
poly1305_internal *st = (poly1305_internal *) ctx;
u64 h0, h1, h2;
u64 g0, g1, g2;
u128 t;
u64 mask;
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
/* compare to modulus by computing h + -p */
g0 = (u64)(t = (u128)h0 + 5);
g1 = (u64)(t = (u128)h1 + (t >> 64));
g2 = h2 + (u64)(t >> 64);
/* if there was carry into 131st bit, h1:h0 = g1:g0 */
mask = 0 - (g2 >> 2);
g0 &= mask;
g1 &= mask;
mask = ~mask;
h0 = (h0 & mask) | g0;
h1 = (h1 & mask) | g1;
/* mac = (h + nonce) % (2^128) */
h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32));
h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64));
U64TO8(mac + 0, h0);
U64TO8(mac + 8, h1);
}
# else
# if defined(_WIN32) && !defined(__MINGW32__)
typedef unsigned __int64 u64;
# elif defined(__arch64__)
typedef unsigned long u64;
# else
typedef unsigned long long u64;
# endif
typedef struct {
u32 h[5];
u32 r[4];
} poly1305_internal;
/* Serialise a 32-bit unsigned value into four bytes, low byte first. */
static void U32TO8(unsigned char *p, unsigned int v)
{
    int idx;

    for (idx = 0; idx < 4; idx++) {
        p[idx] = (unsigned char)(v & 0xff);
        v >>= 8;
    }
}
/*
 * Reference key setup, base 2^32 flavour: zero the five accumulator words
 * and store the first 16 key bytes as the clamped multiplier r in four
 * 32-bit words.  |ctx| is treated as a poly1305_internal.
 */
static void poly1305_init(void *ctx, const unsigned char key[16])
{
poly1305_internal *st = (poly1305_internal *) ctx;
/* h = 0 */
st->h[0] = 0;
st->h[1] = 0;
st->h[2] = 0;
st->h[3] = 0;
st->h[4] = 0;
/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
st->r[0] = U8TOU32(&key[0]) & 0x0fffffff;
st->r[1] = U8TOU32(&key[4]) & 0x0ffffffc;
st->r[2] = U8TOU32(&key[8]) & 0x0ffffffc;
st->r[3] = U8TOU32(&key[12]) & 0x0ffffffc;
}
/*
 * Reference block function, base 2^32 flavour.  For every complete
 * POLY1305_BLOCK_SIZE chunk of |inp| it adds the chunk (plus |padbit| as
 * the 129th bit) to the accumulator, multiplies by r and partially
 * reduces.  A trailing chunk shorter than a block is ignored.  The
 * accumulator is read from and written back to |ctx|.
 */
static void
poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
{
poly1305_internal *st = (poly1305_internal *)ctx;
u32 r0, r1, r2, r3;
u32 s1, s2, s3;
u32 h0, h1, h2, h3, h4, c;
u64 d0, d1, d2, d3;
r0 = st->r[0];
r1 = st->r[1];
r2 = st->r[2];
r3 = st->r[3];
/* sN = rN * 5/4; used for product terms that wrap around 2^130 */
s1 = r1 + (r1 >> 2);
s2 = r2 + (r2 >> 2);
s3 = r3 + (r3 >> 2);
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
h3 = st->h[3];
h4 = st->h[4];
while (len >= POLY1305_BLOCK_SIZE) {
/* h += m[i] */
h0 = (u32)(d0 = (u64)h0 + U8TOU32(inp + 0));
h1 = (u32)(d1 = (u64)h1 + (d0 >> 32) + U8TOU32(inp + 4));
h2 = (u32)(d2 = (u64)h2 + (d1 >> 32) + U8TOU32(inp + 8));
h3 = (u32)(d3 = (u64)h3 + (d2 >> 32) + U8TOU32(inp + 12));
h4 += (u32)(d3 >> 32) + padbit;
/* h *= r "%" p, where "%" stands for "partial remainder" */
d0 = ((u64)h0 * r0) +
((u64)h1 * s3) +
((u64)h2 * s2) +
((u64)h3 * s1);
d1 = ((u64)h0 * r1) +
((u64)h1 * r0) +
((u64)h2 * s3) +
((u64)h3 * s2) +
(h4 * s1);
d2 = ((u64)h0 * r2) +
((u64)h1 * r1) +
((u64)h2 * r0) +
((u64)h3 * s3) +
(h4 * s2);
d3 = ((u64)h0 * r3) +
((u64)h1 * r2) +
((u64)h2 * r1) +
((u64)h3 * r0) +
(h4 * s3);
h4 = (h4 * r0);
/* last reduction step: */
/* a) h4:h0 = h4<<128 + d3<<96 + d2<<64 + d1<<32 + d0 */
h0 = (u32)d0;
h1 = (u32)(d1 += d0 >> 32);
h2 = (u32)(d2 += d1 >> 32);
h3 = (u32)(d3 += d2 >> 32);
h4 += (u32)(d3 >> 32);
/* b) (h4:h0 += (h4:h0>>130) * 5) %= 2^130 */
/* c = (h4>>2) + 4*(h4>>2) = 5*(h4>>2) */
c = (h4 >> 2) + (h4 & ~3U);
h4 &= 3;
h0 += c;
/* propagate the carries of the additions without branching */
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
h2 += (c = CONSTANT_TIME_CARRY(h1,c));
h3 += (c = CONSTANT_TIME_CARRY(h2,c));
h4 += CONSTANT_TIME_CARRY(h3,c);
/*
* Occasional overflows to 3rd bit of h4 are taken care of
* "naturally". If after this point we end up at the top of
* this loop, then the overflow bit will be accounted for
* in next iteration. If we end up in poly1305_emit, then
* comparison to modulus below will still count as "carry
* into 131st bit", so that properly reduced value will be
* picked in conditional move.
*/
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
}
st->h[0] = h0;
st->h[1] = h1;
st->h[2] = h2;
st->h[3] = h3;
st->h[4] = h4;
}
/*
 * Reference finalisation, base 2^32 flavour: fully reduce the accumulator
 * held in |ctx|, add the four 32-bit nonce words (word 0 least
 * significant) modulo 2^128 and write the 16-byte tag to |mac| in
 * little-endian order.  The selection between h and h+5 is done with a
 * mask rather than a branch.
 */
static void poly1305_emit(void *ctx, unsigned char mac[16],
const u32 nonce[4])
{
poly1305_internal *st = (poly1305_internal *) ctx;
u32 h0, h1, h2, h3, h4;
u32 g0, g1, g2, g3, g4;
u64 t;
u32 mask;
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
h3 = st->h[3];
h4 = st->h[4];
/* compare to modulus by computing h + -p */
g0 = (u32)(t = (u64)h0 + 5);
g1 = (u32)(t = (u64)h1 + (t >> 32));
g2 = (u32)(t = (u64)h2 + (t >> 32));
g3 = (u32)(t = (u64)h3 + (t >> 32));
g4 = h4 + (u32)(t >> 32);
/* if there was carry into 131st bit, h3:h0 = g3:g0 */
mask = 0 - (g4 >> 2);
g0 &= mask;
g1 &= mask;
g2 &= mask;
g3 &= mask;
mask = ~mask;
h0 = (h0 & mask) | g0;
h1 = (h1 & mask) | g1;
h2 = (h2 & mask) | g2;
h3 = (h3 & mask) | g3;
/* mac = (h + nonce) % (2^128) */
h0 = (u32)(t = (u64)h0 + nonce[0]);
h1 = (u32)(t = (u64)h1 + (t >> 32) + nonce[1]);
h2 = (u32)(t = (u64)h2 + (t >> 32) + nonce[2]);
h3 = (u32)(t = (u64)h3 + (t >> 32) + nonce[3]);
U32TO8(mac + 0, h0);
U32TO8(mac + 4, h1);
U32TO8(mac + 8, h2);
U32TO8(mac + 12, h3);
}
# endif
#else
int poly1305_init(void *ctx, const unsigned char key[16], void *func);
void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
unsigned int padbit);
void poly1305_emit(void *ctx, unsigned char mac[16],
const unsigned int nonce[4]);
#endif
/*
 * Initialise |ctx| from a 32-byte one-time key.  Bytes 16..31 are saved as
 * four little-endian 32-bit nonce words; the key pointer is handed to the
 * low-level poly1305_init.  The count of buffered bytes is reset to zero.
 */
void Poly1305_Init(POLY1305 *ctx, const unsigned char key[32])
{
ctx->nonce[0] = U8TOU32(&key[16]);
ctx->nonce[1] = U8TOU32(&key[20]);
ctx->nonce[2] = U8TOU32(&key[24]);
ctx->nonce[3] = U8TOU32(&key[28]);
#ifndef POLY1305_ASM
poly1305_init(ctx->opaque, key);
#else
/*
* Unlike reference poly1305_init assembly counterpart is expected
* to return a value: non-zero if it initializes ctx->func, and zero
* otherwise. Latter is to simplify assembly in cases when there no
* multiple code paths to switch between.
*/
if (!poly1305_init(ctx->opaque, key, &ctx->func)) {
/* assembly left ctx->func alone: install the default entry points */
ctx->func.blocks = poly1305_blocks;
ctx->func.emit = poly1305_emit;
}
#endif
ctx->num = 0;
}
#ifdef POLY1305_ASM
/*
* This "eclipses" poly1305_blocks and poly1305_emit, but it's
* conscious choice imposed by -Wshadow compiler warnings.
*/
# define poly1305_blocks (*poly1305_blocks_p)
# define poly1305_emit (*poly1305_emit_p)
#endif
/*
 * Absorb |len| bytes from |inp|.  Bytes left over from a previous call
 * (ctx->num of them, held in ctx->data) are completed to a full block
 * first; then all whole blocks of the remaining input are processed in a
 * single poly1305_blocks call, and any tail shorter than a block is
 * buffered in ctx->data for the next call.  Every block processed here is
 * passed with padbit 1.
 */
void Poly1305_Update(POLY1305 *ctx, const unsigned char *inp, size_t len)
{
#ifdef POLY1305_ASM
/*
* As documented, poly1305_blocks is never called with input
* longer than single block and padbit argument set to 0. This
* property is fluently used in assembly modules to optimize
* padbit handling on loop boundary.
*/
poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;
#endif
size_t rem, num;
if ((num = ctx->num)) {
/* bytes still missing from the buffered partial block */
rem = POLY1305_BLOCK_SIZE - num;
if (len >= rem) {
memcpy(ctx->data + num, inp, rem);
poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 1);
inp += rem;
len -= rem;
} else {
/* Still not enough data to process a block. */
memcpy(ctx->data + num, inp, len);
ctx->num = num + len;
return;
}
}
/* split what is left into whole blocks and a tail */
rem = len % POLY1305_BLOCK_SIZE;
len -= rem;
if (len >= POLY1305_BLOCK_SIZE) {
poly1305_blocks(ctx->opaque, inp, len, 1);
inp += len;
}
if (rem)
memcpy(ctx->data, inp, rem);
ctx->num = rem;
}
/*
 * Finish the computation and write the 16-byte tag to |mac|.  A buffered
 * partial block, if any, is terminated with a 1 byte, zero-filled to the
 * block size and processed with padbit 0 (the padding is already in the
 * data).  The whole context is wiped afterwards, so |ctx| must be
 * re-initialised before it is used again.
 */
void Poly1305_Final(POLY1305 *ctx, unsigned char mac[16])
{
#ifdef POLY1305_ASM
poly1305_blocks_f poly1305_blocks_p = ctx->func.blocks;
poly1305_emit_f poly1305_emit_p = ctx->func.emit;
#endif
size_t num;
if ((num = ctx->num)) {
ctx->data[num++] = 1; /* pad bit */
while (num < POLY1305_BLOCK_SIZE)
ctx->data[num++] = 0;
poly1305_blocks(ctx->opaque, ctx->data, POLY1305_BLOCK_SIZE, 0);
}
poly1305_emit(ctx->opaque, mac, ctx->nonce);
/* zero out the state */
OPENSSL_cleanse(ctx, sizeof(*ctx));
}

View file

@ -0,0 +1,122 @@
/*
* Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include "internal/cryptlib.h"
#include <openssl/evp.h>
#include "internal/asn1_int.h"
#include "internal/poly1305.h"
#include "poly1305_local.h"
#include "internal/evp_int.h"
/*
* POLY1305 "ASN1" method. This is just here to indicate the maximum
* POLY1305 output length and to free up a POLY1305 key.
*/
/* Size callback: the output length is always POLY1305_DIGEST_SIZE; |pkey| is unused. */
static int poly1305_size(const EVP_PKEY *pkey)
{
return POLY1305_DIGEST_SIZE;
}
/*
 * Free callback: wipe the key bytes held in the octet string attached to
 * |pkey| (if any) before releasing the string itself.
 */
static void poly1305_key_free(EVP_PKEY *pkey)
{
    ASN1_OCTET_STRING *key = EVP_PKEY_get0(pkey);

    if (key == NULL)
        return;

    /* scrub the secret before the memory goes back to the allocator */
    if (key->data != NULL)
        OPENSSL_cleanse(key->data, key->length);

    ASN1_OCTET_STRING_free(key);
}
/* Control callback: no operation is implemented, so every request yields -2. */
static int poly1305_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
{
/* nothing, (including ASN1_PKEY_CTRL_DEFAULT_MD_NID), is supported */
return -2;
}
static int poly1305_pkey_public_cmp(const EVP_PKEY *a, const EVP_PKEY *b)
{
return ASN1_OCTET_STRING_cmp(EVP_PKEY_get0(a), EVP_PKEY_get0(b));
}
/*
 * Install a raw key of exactly POLY1305_KEY_SIZE bytes into |pkey|.
 * Returns 1 on success; returns 0 if |pkey| already carries key material,
 * if |len| is wrong, or if allocating/copying the octet string fails.
 */
static int poly1305_set_priv_key(EVP_PKEY *pkey, const unsigned char *priv,
                                 size_t len)
{
    ASN1_OCTET_STRING *octets;

    if (pkey->pkey.ptr != NULL)
        return 0;
    if (len != POLY1305_KEY_SIZE)
        return 0;

    if ((octets = ASN1_OCTET_STRING_new()) == NULL)
        return 0;

    if (ASN1_OCTET_STRING_set(octets, priv, len)) {
        pkey->pkey.ptr = octets;
        return 1;
    }

    /* the copy failed: do not leak the empty container */
    ASN1_OCTET_STRING_free(octets);
    return 0;
}
/*
 * Export the raw key held by |pkey|.
 *
 * With |priv| == NULL only the required buffer size is reported in |*len|.
 * Otherwise |*len| is the capacity of |priv| on entry and the number of
 * bytes written on success.  Returns 1 on success and 0 if there is no
 * key, the buffer is too small, or the stored key does not have the
 * expected POLY1305_KEY_SIZE length.
 *
 * The number of bytes copied is tied to the same constant the capacity
 * check uses, so a stored string of unexpected length can neither
 * overflow |priv| nor leave part of the reported output unwritten.
 */
static int poly1305_get_priv_key(const EVP_PKEY *pkey, unsigned char *priv,
                                 size_t *len)
{
    ASN1_OCTET_STRING *os = (ASN1_OCTET_STRING *)pkey->pkey.ptr;

    if (priv == NULL) {
        *len = POLY1305_KEY_SIZE;
        return 1;
    }

    if (os == NULL
            || *len < POLY1305_KEY_SIZE
            || ASN1_STRING_length(os) != POLY1305_KEY_SIZE)
        return 0;

    memcpy(priv, ASN1_STRING_get0_data(os), POLY1305_KEY_SIZE);
    *len = POLY1305_KEY_SIZE;

    return 1;
}
/*
 * Method table for EVP_PKEY_POLY1305.  The initialiser is positional; the
 * only callbacks populated are the comparison, size, free, ctrl and raw
 * private-key set/get functions defined above.  Every other slot is left
 * 0/NULL.
 */
const EVP_PKEY_ASN1_METHOD poly1305_asn1_meth = {
EVP_PKEY_POLY1305,
EVP_PKEY_POLY1305,
0,
"POLY1305",
"OpenSSL POLY1305 method",
0, 0, poly1305_pkey_public_cmp, 0,
0, 0, 0,
poly1305_size,
0, 0,
0, 0, 0, 0, 0, 0, 0,
poly1305_key_free,
poly1305_pkey_ctrl,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
NULL,
poly1305_set_priv_key,
NULL,
poly1305_get_priv_key,
NULL,
};

View file

@ -0,0 +1,171 @@
/*
* Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/*
* This module is meant to be used as template for base 2^44 assembly
* implementation[s]. On side note compiler-generated code is not
* slower than compiler-generated base 2^64 code on [high-end] x86_64,
* even though amount of multiplications is 50% higher. Go figure...
*/
#include <stdlib.h>
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long u64;
typedef unsigned __int128 u128;
typedef struct {
u64 h[3];
u64 s[2];
u64 r[3];
} poly1305_internal;
#define POLY1305_BLOCK_SIZE 16
/* Assemble a 64-bit unsigned value from eight bytes stored low byte first. */
static u64 U8TOU64(const unsigned char *p)
{
    u64 word = 0;
    int idx;

    /* Start with the most significant byte; each step shifts in 8 more bits. */
    for (idx = 7; idx >= 0; idx--)
        word = (word << 8) | (u64)(p[idx] & 0xff);

    return word;
}
/* Serialise a 64-bit unsigned value into eight bytes, low byte first. */
static void U64TO8(unsigned char *p, u64 v)
{
    int idx;

    for (idx = 0; idx < 8; idx++) {
        p[idx] = (unsigned char)(v & 0xff);
        v >>= 8;
    }
}
/*
 * Key setup for the base 2^44 template: zero the accumulator, clamp the
 * first 16 key bytes and split the resulting 128-bit r into digits of
 * 44, 44 and 40 bits.  s[0] and s[1] cache 20*r[1] and 20*r[2].  Always
 * returns 0.
 */
int poly1305_init(void *ctx, const unsigned char key[16])
{
poly1305_internal *st = (poly1305_internal *)ctx;
u64 r0, r1;
/* h = 0 */
st->h[0] = 0;
st->h[1] = 0;
st->h[2] = 0;
r0 = U8TOU64(&key[0]) & 0x0ffffffc0fffffff;
r1 = U8TOU64(&key[8]) & 0x0ffffffc0ffffffc;
/* break r1:r0 to three 44-bit digits, masks are 1<<44-1 */
st->r[0] = r0 & 0x0fffffffffff;
st->r[1] = ((r0 >> 44) | (r1 << 20)) & 0x0fffffffffff;
st->r[2] = (r1 >> 24);
/* (x + 4x) << 2 = 20x */
st->s[0] = (st->r[1] + (st->r[1] << 2)) << 2;
st->s[1] = (st->r[2] + (st->r[2] << 2)) << 2;
return 0;
}
/*
 * Block function for the base 2^44 template.  For every complete
 * POLY1305_BLOCK_SIZE chunk of |inp| it splits the chunk into 44/44/40-bit
 * digits, adds them (and |padbit|, positioned at bit 40 of the top digit)
 * to the accumulator, multiplies by r and performs a lazy reduction.  A
 * trailing chunk shorter than a block is ignored.
 */
void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
u32 padbit)
{
poly1305_internal *st = (poly1305_internal *)ctx;
u64 r0, r1, r2;
u64 s1, s2;
u64 h0, h1, h2, c;
u128 d0, d1, d2;
/* bit 128 of the block is bit 40 of the third digit (128 - 2*44) */
u64 pad = (u64)padbit << 40;
r0 = st->r[0];
r1 = st->r[1];
r2 = st->r[2];
s1 = st->s[0];
s2 = st->s[1];
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
while (len >= POLY1305_BLOCK_SIZE) {
u64 m0, m1;
m0 = U8TOU64(inp + 0);
m1 = U8TOU64(inp + 8);
/* h += m[i], m[i] is broken to 44-bit digits */
h0 += m0 & 0x0fffffffffff;
h1 += ((m0 >> 44) | (m1 << 20)) & 0x0fffffffffff;
h2 += (m1 >> 24) + pad;
/* h *= r "%" p, where "%" stands for "partial remainder" */
d0 = ((u128)h0 * r0) + ((u128)h1 * s2) + ((u128)h2 * s1);
d1 = ((u128)h0 * r1) + ((u128)h1 * r0) + ((u128)h2 * s2);
d2 = ((u128)h0 * r2) + ((u128)h1 * r1) + ((u128)h2 * r0);
/* "lazy" reduction step */
h0 = (u64)d0 & 0x0fffffffffff;
h1 = (u64)(d1 += (u64)(d0 >> 44)) & 0x0fffffffffff;
h2 = (u64)(d2 += (u64)(d1 >> 44)) & 0x03ffffffffff; /* last 42 bits */
c = (d2 >> 42);
/* fold the overflow above 2^130 back in, multiplied by 5 */
h0 += c + (c << 2);
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
}
st->h[0] = h0;
st->h[1] = h1;
st->h[2] = h2;
}
/*
 * Finalisation for the base 2^44 template: repack the accumulator digits
 * into 64-bit words, fully reduce, add the four 32-bit nonce words (word
 * 0 least significant) modulo 2^128 and write the 16-byte tag to |mac|
 * in little-endian order.  The selection between h and h+5 is done with a
 * mask rather than a branch.
 */
void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4])
{
poly1305_internal *st = (poly1305_internal *) ctx;
u64 h0, h1, h2;
u64 g0, g1, g2;
u128 t;
u64 mask;
h0 = st->h[0];
h1 = st->h[1];
h2 = st->h[2];
/* after "lazy" reduction, convert 44+bit digits to 64-bit ones */
h0 = (u64)(t = (u128)h0 + (h1 << 44)); h1 >>= 20;
h1 = (u64)(t = (u128)h1 + (h2 << 24) + (t >> 64)); h2 >>= 40;
h2 += (u64)(t >> 64);
/* compare to modulus by computing h + -p */
g0 = (u64)(t = (u128)h0 + 5);
g1 = (u64)(t = (u128)h1 + (t >> 64));
g2 = h2 + (u64)(t >> 64);
/* if there was carry into 131st bit, h1:h0 = g1:g0 */
mask = 0 - (g2 >> 2);
g0 &= mask;
g1 &= mask;
mask = ~mask;
h0 = (h0 & mask) | g0;
h1 = (h1 & mask) | g1;
/* mac = (h + nonce) % (2^128) */
h0 = (u64)(t = (u128)h0 + nonce[0] + ((u64)nonce[1]<<32));
h1 = (u64)(t = (u128)h1 + nonce[2] + ((u64)nonce[3]<<32) + (t >> 64));
U64TO8(mac + 0, h0);
U64TO8(mac + 8, h1);
}

View file

@ -0,0 +1,488 @@
/*
* Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/*
* This module is meant to be used as template for non-x87 floating-
* point assembly modules. The template itself is x86_64-specific
* though, as it was debugged on x86_64. So that implementor would
* have to recognize platform-specific parts, UxTOy and inline asm,
* and act accordingly.
*
* Huh? x86_64-specific code as template for non-x87? Note seven, which
* is not a typo, but reference to 80-bit precision. This module on the
* other hand relies on 64-bit precision operations, which are default
* for x86_64 code. And since we are at it, just for sense of it,
* large-block performance in cycles per processed byte for *this* code
* is:
* gcc-4.8 icc-15.0 clang-3.4(*)
*
* Westmere 4.96 5.09 4.37
* Sandy Bridge 4.95 4.90 4.17
* Haswell 4.92 4.87 3.78
* Bulldozer 4.67 4.49 4.68
* VIA Nano 7.07 7.05 5.98
* Silvermont 10.6 9.61 12.6
*
* (*) clang managed to discover parallelism and deployed SIMD;
*
* And for range of other platforms with unspecified gcc versions:
*
* Freescale e300 12.5
* PPC74x0 10.8
* POWER6 4.92
* POWER7 4.50
* POWER8 4.10
*
* z10 11.2
* z196+ 7.30
*
* UltraSPARC III 16.0
* SPARC T4 16.1
*/
#if !(defined(__GNUC__) && __GNUC__>=2)
# error "this is gcc-specific template"
#endif
#include <stdlib.h>
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef union { double d; u64 u; } elem64;
#define TWO(p) ((double)(1ULL<<(p)))
#define TWO0 TWO(0)
#define TWO32 TWO(32)
#define TWO64 (TWO32*TWO(32))
#define TWO96 (TWO64*TWO(32))
#define TWO130 (TWO96*TWO(34))
#define EXP(p) ((1023ULL+(p))<<52)
#if defined(__x86_64__) || (defined(__PPC__) && defined(__LITTLE_ENDIAN__))
# define U8TOU32(p) (*(const u32 *)(p))
# define U32TO8(p,v) (*(u32 *)(p) = (v))
#elif defined(__PPC__)
# define U8TOU32(p) ({u32 ret; asm ("lwbrx %0,0,%1":"=r"(ret):"b"(p)); ret; })
# define U32TO8(p,v) asm ("stwbrx %0,0,%1"::"r"(v),"b"(p):"memory")
#elif defined(__s390x__)
# define U8TOU32(p) ({u32 ret; asm ("lrv %0,%1":"=d"(ret):"m"(*(u32 *)(p))); ret; })
# define U32TO8(p,v) asm ("strv %1,%0":"=m"(*(u32 *)(p)):"d"(v))
#endif
#ifndef U8TOU32
# define U8TOU32(p) ((u32)(p)[0] | (u32)(p)[1]<<8 | \
(u32)(p)[2]<<16 | (u32)(p)[3]<<24 )
#endif
#ifndef U32TO8
# define U32TO8(p,v) ((p)[0] = (u8)(v), (p)[1] = (u8)((v)>>8), \
(p)[2] = (u8)((v)>>16), (p)[3] = (u8)((v)>>24) )
#endif
typedef struct {
elem64 h[4];
double r[8];
double s[6];
} poly1305_internal;
/* "round toward zero (truncate), mask all exceptions" */
#if defined(__x86_64__)
static const u32 mxcsr = 0x7f80;
#elif defined(__PPC__)
static const u64 one = 1;
#elif defined(__s390x__)
static const u32 fpc = 1;
#elif defined(__sparc__)
static const u64 fsr = 1ULL<<30;
#elif defined(__mips__)
static const u32 fcsr = 1;
#else
#error "unrecognized platform"
#endif
/*
 * Key setup for the floating-point template.  The four accumulator limbs
 * are stored as biased doubles (exponent patterns for 2^(52+0), 2^(52+32),
 * 2^(52+64) and 2^(52+96)) representing zero.  If |key| is non-NULL the
 * clamped 32-bit key words are converted to doubles, r[] and the
 * 5/2^130-scaled s[] are derived and each is split into a low and a high
 * 16-bit part (even/odd array slots).  The conversion runs with the FPU
 * switched to "truncate" rounding; the original control register is
 * restored before returning.  Always returns 0.
 */
int poly1305_init(void *ctx, const unsigned char key[16])
{
poly1305_internal *st = (poly1305_internal *) ctx;
elem64 r0, r1, r2, r3;
/* h = 0, biased */
#if 0
st->h[0].d = TWO(52)*TWO0;
st->h[1].d = TWO(52)*TWO32;
st->h[2].d = TWO(52)*TWO64;
st->h[3].d = TWO(52)*TWO96;
#else
st->h[0].u = EXP(52+0);
st->h[1].u = EXP(52+32);
st->h[2].u = EXP(52+64);
st->h[3].u = EXP(52+96);
#endif
if (key) {
/*
* set "truncate" rounding mode
*/
#if defined(__x86_64__)
u32 mxcsr_orig;
asm volatile ("stmxcsr %0":"=m"(mxcsr_orig));
asm volatile ("ldmxcsr %0"::"m"(mxcsr));
#elif defined(__PPC__)
double fpscr_orig, fpscr = *(double *)&one;
asm volatile ("mffs %0":"=f"(fpscr_orig));
asm volatile ("mtfsf 255,%0"::"f"(fpscr));
#elif defined(__s390x__)
u32 fpc_orig;
asm volatile ("stfpc %0":"=m"(fpc_orig));
asm volatile ("lfpc %0"::"m"(fpc));
#elif defined(__sparc__)
u64 fsr_orig;
asm volatile ("stx %%fsr,%0":"=m"(fsr_orig));
asm volatile ("ldx %0,%%fsr"::"m"(fsr));
#elif defined(__mips__)
u32 fcsr_orig;
asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig));
asm volatile ("ctc1 %0,$31"::"r"(fcsr));
#endif
/* r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
/* place each clamped word in the mantissa of a biased double ... */
r0.u = EXP(52+0) | (U8TOU32(&key[0]) & 0x0fffffff);
r1.u = EXP(52+32) | (U8TOU32(&key[4]) & 0x0ffffffc);
r2.u = EXP(52+64) | (U8TOU32(&key[8]) & 0x0ffffffc);
r3.u = EXP(52+96) | (U8TOU32(&key[12]) & 0x0ffffffc);
/* ... then subtract the bias to obtain the value itself */
st->r[0] = r0.d - TWO(52)*TWO0;
st->r[2] = r1.d - TWO(52)*TWO32;
st->r[4] = r2.d - TWO(52)*TWO64;
st->r[6] = r3.d - TWO(52)*TWO96;
st->s[0] = st->r[2] * (5.0/TWO130);
st->s[2] = st->r[4] * (5.0/TWO130);
st->s[4] = st->r[6] * (5.0/TWO130);
/*
* base 2^32 -> base 2^16
*/
/* adding and subtracting a large constant isolates the high part */
st->r[1] = (st->r[0] + TWO(52)*TWO(16)*TWO0) -
TWO(52)*TWO(16)*TWO0;
st->r[0] -= st->r[1];
st->r[3] = (st->r[2] + TWO(52)*TWO(16)*TWO32) -
TWO(52)*TWO(16)*TWO32;
st->r[2] -= st->r[3];
st->r[5] = (st->r[4] + TWO(52)*TWO(16)*TWO64) -
TWO(52)*TWO(16)*TWO64;
st->r[4] -= st->r[5];
st->r[7] = (st->r[6] + TWO(52)*TWO(16)*TWO96) -
TWO(52)*TWO(16)*TWO96;
st->r[6] -= st->r[7];
st->s[1] = (st->s[0] + TWO(52)*TWO(16)*TWO0/TWO96) -
TWO(52)*TWO(16)*TWO0/TWO96;
st->s[0] -= st->s[1];
st->s[3] = (st->s[2] + TWO(52)*TWO(16)*TWO32/TWO96) -
TWO(52)*TWO(16)*TWO32/TWO96;
st->s[2] -= st->s[3];
st->s[5] = (st->s[4] + TWO(52)*TWO(16)*TWO64/TWO96) -
TWO(52)*TWO(16)*TWO64/TWO96;
st->s[4] -= st->s[5];
/*
* restore original FPU control register
*/
#if defined(__x86_64__)
asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig));
#elif defined(__PPC__)
asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig));
#elif defined(__s390x__)
asm volatile ("lfpc %0"::"m"(fpc_orig));
#elif defined(__sparc__)
asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig));
#elif defined(__mips__)
asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig));
#endif
}
return 0;
}
/*
 * Absorb |len| bytes from |inp| into the accumulator stored in |ctx|,
 * consuming 16 bytes per iteration.
 *
 * The loop is a do/while that subtracts 16 from |len| and repeats while
 * len >= 16, so one block is always processed; callers presumably pass a
 * non-zero multiple of 16 (confirm against the caller).
 *
 * |padbit| is OR-ed into bit 32 of the fourth 32-bit input word, i.e. it
 * lands at bit 128 of every block.
 *
 * The FPU control register is switched on entry and put back on exit on
 * the architectures listed in the #if chains; the replacement values
 * (mxcsr, one, fpc, fsr, fcsr) are defined outside this function.
 */
void poly1305_blocks(void *ctx, const unsigned char *inp, size_t len,
int padbit)
{
poly1305_internal *st = (poly1305_internal *)ctx;
elem64 in0, in1, in2, in3;
/* padbit moved to bit 32 so it can be OR-ed next to a 32-bit word */
u64 pad = (u64)padbit<<32;
double x0, x1, x2, x3;
double h0lo, h0hi, h1lo, h1hi, h2lo, h2hi, h3lo, h3hi;
double c0lo, c0hi, c1lo, c1hi, c2lo, c2hi, c3lo, c3hi;
/*
 * Key-derived multipliers prepared by poly1305_init: r[] holds each
 * 32-bit limb of the key split into a lo/hi pair, s[] holds the limbs
 * r1..r3 pre-multiplied by 5.0/TWO130 and split the same way.
 */
const double r0lo = st->r[0];
const double r0hi = st->r[1];
const double r1lo = st->r[2];
const double r1hi = st->r[3];
const double r2lo = st->r[4];
const double r2hi = st->r[5];
const double r3lo = st->r[6];
const double r3hi = st->r[7];
const double s1lo = st->s[0];
const double s1hi = st->s[1];
const double s2lo = st->s[2];
const double s2hi = st->s[3];
const double s3lo = st->s[4];
const double s3hi = st->s[5];
/*
* set "truncate" rounding mode
*/
#if defined(__x86_64__)
u32 mxcsr_orig;
asm volatile ("stmxcsr %0":"=m"(mxcsr_orig));
asm volatile ("ldmxcsr %0"::"m"(mxcsr));
#elif defined(__PPC__)
double fpscr_orig, fpscr = *(double *)&one;
asm volatile ("mffs %0":"=f"(fpscr_orig));
asm volatile ("mtfsf 255,%0"::"f"(fpscr));
#elif defined(__s390x__)
u32 fpc_orig;
asm volatile ("stfpc %0":"=m"(fpc_orig));
asm volatile ("lfpc %0"::"m"(fpc));
#elif defined(__sparc__)
u64 fsr_orig;
asm volatile ("stx %%fsr,%0":"=m"(fsr_orig));
asm volatile ("ldx %0,%%fsr"::"m"(fsr));
#elif defined(__mips__)
u32 fcsr_orig;
asm volatile ("cfc1 %0,$31":"=r"(fcsr_orig));
asm volatile ("ctc1 %0,$31"::"r"(fcsr));
#endif
/*
* load base 2^32 and de-bias
*/
h0lo = st->h[0].d - TWO(52)*TWO0;
h1lo = st->h[1].d - TWO(52)*TWO32;
h2lo = st->h[2].d - TWO(52)*TWO64;
h3lo = st->h[3].d - TWO(52)*TWO96;
/*
 * Two ways into the loop. Under clang the hi halves are zeroed and the
 * loop is entered from the top, so the first block goes through the
 * carry/reduction step like every other block. Otherwise the first
 * block is loaded here, added straight onto the stored accumulator and
 * control jumps to fast_entry, skipping that step for the first block.
 * The reason clang is singled out is not visible in this part of the
 * file.
 */
#ifdef __clang__
h0hi = 0;
h1hi = 0;
h2hi = 0;
h3hi = 0;
#else
in0.u = EXP(52+0) | U8TOU32(&inp[0]);
in1.u = EXP(52+32) | U8TOU32(&inp[4]);
in2.u = EXP(52+64) | U8TOU32(&inp[8]);
in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;
x0 = in0.d - TWO(52)*TWO0;
x1 = in1.d - TWO(52)*TWO32;
x2 = in2.d - TWO(52)*TWO64;
x3 = in3.d - TWO(52)*TWO96;
x0 += h0lo;
x1 += h1lo;
x2 += h2lo;
x3 += h3lo;
goto fast_entry;
#endif
do {
/*
 * Load the four 32-bit words of the block into biased doubles, then
 * subtract the bias again to obtain their numeric values.
 */
in0.u = EXP(52+0) | U8TOU32(&inp[0]);
in1.u = EXP(52+32) | U8TOU32(&inp[4]);
in2.u = EXP(52+64) | U8TOU32(&inp[8]);
in3.u = EXP(52+96) | U8TOU32(&inp[12]) | pad;
x0 = in0.d - TWO(52)*TWO0;
x1 = in1.d - TWO(52)*TWO32;
x2 = in2.d - TWO(52)*TWO64;
x3 = in3.d - TWO(52)*TWO96;
/*
* note that there are multiple ways to accumulate input, e.g.
* one can as well accumulate to h0lo-h1lo-h1hi-h2hi...
*/
h0lo += x0;
h0hi += x1;
h2lo += x2;
h2hi += x3;
/*
* carries that cross 32n-bit (and 130-bit) boundaries
*/
/*
 * Adding and then subtracting a large constant leaves only the part of
 * h that the wider exponent can still represent, i.e. the carry above
 * the named boundary (presumably TWO(52)*TWOn is 2^(52+n); relies on
 * the truncating rounding mode set above).
 */
c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32;
c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64;
c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96;
c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;
c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32;
c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64;
c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96;
c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;
/*
* base 2^48 -> base 2^32 with last reduction step
*/
/*
 * Each limb keeps its own remainder plus the carry of the limb below;
 * the carry out of the top limb re-enters limb 0 scaled by 5.0/TWO130.
 */
x1 = (h1lo - c1lo) + c0lo;
x2 = (h2lo - c2lo) + c1lo;
x3 = (h3lo - c3lo) + c2lo;
x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130);
x1 += (h1hi - c1hi) + c0hi;
x2 += (h2hi - c2hi) + c1hi;
x3 += (h3hi - c3hi) + c2hi;
x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);
#ifndef __clang__
fast_entry:
#endif
/*
* base 2^32 * base 2^16 = base 2^48
*/
/*
 * Multiply the accumulator x0..x3 by the key; the lo and hi halves of
 * the key produce two separate sets of partial sums.
 */
h0lo = s3lo * x1 + s2lo * x2 + s1lo * x3 + r0lo * x0;
h1lo = r0lo * x1 + s3lo * x2 + s2lo * x3 + r1lo * x0;
h2lo = r1lo * x1 + r0lo * x2 + s3lo * x3 + r2lo * x0;
h3lo = r2lo * x1 + r1lo * x2 + r0lo * x3 + r3lo * x0;
h0hi = s3hi * x1 + s2hi * x2 + s1hi * x3 + r0hi * x0;
h1hi = r0hi * x1 + s3hi * x2 + s2hi * x3 + r1hi * x0;
h2hi = r1hi * x1 + r0hi * x2 + s3hi * x3 + r2hi * x0;
h3hi = r2hi * x1 + r1hi * x2 + r0hi * x3 + r3hi * x0;
inp += 16;
len -= 16;
} while (len >= 16);
/*
 * The product of the last block has not been carried yet: repeat the
 * same carry and reduction sequence once more before storing.
 */
/*
* carries that cross 32n-bit (and 130-bit) boundaries
*/
c0lo = (h0lo + TWO(52)*TWO32) - TWO(52)*TWO32;
c1lo = (h1lo + TWO(52)*TWO64) - TWO(52)*TWO64;
c2lo = (h2lo + TWO(52)*TWO96) - TWO(52)*TWO96;
c3lo = (h3lo + TWO(52)*TWO130) - TWO(52)*TWO130;
c0hi = (h0hi + TWO(52)*TWO32) - TWO(52)*TWO32;
c1hi = (h1hi + TWO(52)*TWO64) - TWO(52)*TWO64;
c2hi = (h2hi + TWO(52)*TWO96) - TWO(52)*TWO96;
c3hi = (h3hi + TWO(52)*TWO130) - TWO(52)*TWO130;
/*
* base 2^48 -> base 2^32 with last reduction step
*/
x1 = (h1lo - c1lo) + c0lo;
x2 = (h2lo - c2lo) + c1lo;
x3 = (h3lo - c3lo) + c2lo;
x0 = (h0lo - c0lo) + c3lo * (5.0/TWO130);
x1 += (h1hi - c1hi) + c0hi;
x2 += (h2hi - c2hi) + c1hi;
x3 += (h3hi - c3hi) + c2hi;
x0 += (h0hi - c0hi) + c3hi * (5.0/TWO130);
/*
* store base 2^32, with bias
*/
st->h[1].d = x1 + TWO(52)*TWO32;
st->h[2].d = x2 + TWO(52)*TWO64;
st->h[3].d = x3 + TWO(52)*TWO96;
st->h[0].d = x0 + TWO(52)*TWO0;
/*
* restore original FPU control register
*/
#if defined(__x86_64__)
asm volatile ("ldmxcsr %0"::"m"(mxcsr_orig));
#elif defined(__PPC__)
asm volatile ("mtfsf 255,%0"::"f"(fpscr_orig));
#elif defined(__s390x__)
asm volatile ("lfpc %0"::"m"(fpc_orig));
#elif defined(__sparc__)
asm volatile ("ldx %0,%%fsr"::"m"(fsr_orig));
#elif defined(__mips__)
asm volatile ("ctc1 %0,$31"::"r"(fcsr_orig));
#endif
}
void poly1305_emit(void *ctx, unsigned char mac[16], const u32 nonce[4])
{
poly1305_internal *st = (poly1305_internal *) ctx;
u64 h0, h1, h2, h3, h4;
u32 g0, g1, g2, g3, g4;
u64 t;
u32 mask;
/*
* thanks to bias masking exponent gives integer result
*/
h0 = st->h[0].u & 0x000fffffffffffffULL;
h1 = st->h[1].u & 0x000fffffffffffffULL;
h2 = st->h[2].u & 0x000fffffffffffffULL;
h3 = st->h[3].u & 0x000fffffffffffffULL;
/*
* can be partially reduced, so reduce...
*/
h4 = h3>>32; h3 &= 0xffffffffU;
g4 = h4&-4;
h4 &= 3;
g4 += g4>>2;
h0 += g4;
h1 += h0>>32; h0 &= 0xffffffffU;
h2 += h1>>32; h1 &= 0xffffffffU;
h3 += h2>>32; h2 &= 0xffffffffU;
/* compute h + -p */
g0 = (u32)(t = h0 + 5);
g1 = (u32)(t = h1 + (t >> 32));
g2 = (u32)(t = h2 + (t >> 32));
g3 = (u32)(t = h3 + (t >> 32));
g4 = h4 + (u32)(t >> 32);
/* if there was carry, select g0-g3 */
mask = 0 - (g4 >> 2);
g0 &= mask;
g1 &= mask;
g2 &= mask;
g3 &= mask;
mask = ~mask;
g0 |= (h0 & mask);
g1 |= (h1 & mask);
g2 |= (h2 & mask);
g3 |= (h3 & mask);
/* mac = (h + nonce) % (2^128) */
g0 = (u32)(t = (u64)g0 + nonce[0]);
g1 = (u32)(t = (u64)g1 + (t >> 32) + nonce[1]);
g2 = (u32)(t = (u64)g2 + (t >> 32) + nonce[2]);
g3 = (u32)(t = (u64)g3 + (t >> 32) + nonce[3]);
U32TO8(mac + 0, g0);
U32TO8(mac + 4, g1);
U32TO8(mac + 8, g2);
U32TO8(mac + 12, g3);
}

View file

@ -0,0 +1,27 @@
/*
* Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/*
 * Signature of a block-processing routine: consumes |len| bytes at |inp|
 * into the state behind |ctx|; |padbit| is passed through to the
 * implementation.
 */
typedef void (*poly1305_blocks_f) (void *ctx, const unsigned char *inp,
size_t len, unsigned int padbit);
/*
 * Signature of a finalisation routine: writes the 16-byte tag to |mac|
 * from the state behind |ctx| and the four-word |nonce|.
 */
typedef void (*poly1305_emit_f) (void *ctx, unsigned char mac[16],
const unsigned int nonce[4]);
/*
 * Poly1305 context: implementation-private state plus the bookkeeping
 * fields declared below.
 */
struct poly1305_context {
double opaque[24]; /* large enough to hold internal state, declared
* 'double' to ensure at least 64-bit invariant
* alignment across all platforms and
* configurations */
/* four 32-bit words, same shape as the nonce argument of poly1305_emit_f */
unsigned int nonce[4];
/* one block of bytes; presumably buffers a partial block - confirm in poly1305.c */
unsigned char data[POLY1305_BLOCK_SIZE];
/* presumably the number of bytes currently held in data[] - confirm */
size_t num;
/* implementation entry points, stored per context */
struct {
poly1305_blocks_f blocks;
poly1305_emit_f emit;
} func;
};

View file

@ -0,0 +1,194 @@
/*
* Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include "internal/cryptlib.h"
#include <openssl/x509.h>
#include <openssl/x509v3.h>
#include <openssl/evp.h>
#include <openssl/err.h>
#include "internal/poly1305.h"
#include "poly1305_local.h"
#include "internal/evp_int.h"
/*
 * POLY1305 pkey context structure: the per-EVP_PKEY_CTX private data
 * allocated by pkey_poly1305_init() and released by
 * pkey_poly1305_cleanup().
 */
typedef struct {
ASN1_OCTET_STRING ktmp; /* Temp storage for key */
/* running MAC state, (re)initialised from the key by the ctrl/signctx_init callbacks */
POLY1305 ctx;
} POLY1305_PKEY_CTX;
/*
 * init callback: allocate a zeroed POLY1305_PKEY_CTX, mark its key holder
 * as an octet string and attach it to |ctx|.
 *
 * Returns 1 on success; on allocation failure raises
 * ERR_R_MALLOC_FAILURE and returns 0.
 */
static int pkey_poly1305_init(EVP_PKEY_CTX *ctx)
{
    POLY1305_PKEY_CTX *fresh = OPENSSL_zalloc(sizeof(*fresh));

    if (fresh != NULL) {
        fresh->ktmp.type = V_ASN1_OCTET_STRING;

        EVP_PKEY_CTX_set_data(ctx, fresh);
        EVP_PKEY_CTX_set0_keygen_info(ctx, NULL, 0);
        return 1;
    }

    CRYPTOerr(CRYPTO_F_PKEY_POLY1305_INIT, ERR_R_MALLOC_FAILURE);
    return 0;
}
/*
 * cleanup callback: wipe and free the stored key bytes and the private
 * context itself, then detach it from |ctx|. Does nothing when no
 * private context is attached.
 */
static void pkey_poly1305_cleanup(EVP_PKEY_CTX *ctx)
{
    POLY1305_PKEY_CTX *priv = EVP_PKEY_CTX_get_data(ctx);

    if (priv == NULL)
        return;

    /* key material first, then the structure that owned it */
    OPENSSL_clear_free(priv->ktmp.data, priv->ktmp.length);
    OPENSSL_clear_free(priv, sizeof(*priv));
    EVP_PKEY_CTX_set_data(ctx, NULL);
}
/*
 * copy callback: give |dst| its own private context and duplicate the
 * key (when one is set) and the running MAC state from |src|.
 *
 * Returns 1 on success, 0 when the allocation or the key copy fails; in
 * the latter case the context just attached to |dst| is released again.
 */
static int pkey_poly1305_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
{
    POLY1305_PKEY_CTX *from, *to;

    /* attaches a fresh, zeroed POLY1305_PKEY_CTX to dst */
    if (!pkey_poly1305_init(dst))
        return 0;

    from = EVP_PKEY_CTX_get_data(src);
    to = EVP_PKEY_CTX_get_data(dst);

    if (ASN1_STRING_get0_data(&from->ktmp) != NULL) {
        if (!ASN1_STRING_copy(&to->ktmp, &from->ktmp)) {
            /* undo the init above so dst carries no half-built context */
            pkey_poly1305_cleanup(dst);
            return 0;
        }
    }

    memcpy(&to->ctx, &from->ctx, sizeof(POLY1305));
    return 1;
}
/*
 * keygen callback: hand |pkey| a private duplicate of the MAC key held in
 * the context's ktmp.
 *
 * Returns 1 on success; 0 when no key has been set, when the duplicate
 * cannot be allocated, or when assigning it to |pkey| fails.
 */
static int pkey_poly1305_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
{
    ASN1_OCTET_STRING *key;
    POLY1305_PKEY_CTX *pctx = EVP_PKEY_CTX_get_data(ctx);

    if (ASN1_STRING_get0_data(&pctx->ktmp) == NULL)
        return 0;

    key = ASN1_OCTET_STRING_dup(&pctx->ktmp);
    if (key == NULL)
        return 0;

    if (!EVP_PKEY_assign_POLY1305(pkey, key)) {
        /*
         * Ownership moves to |pkey| only on success; on failure the
         * duplicate is still ours, so wipe the secret and release it
         * instead of leaking it.
         */
        OPENSSL_cleanse(key->data, key->length);
        ASN1_OCTET_STRING_free(key);
        return 0;
    }
    return 1;
}
/*
 * Digest update hook installed by poly1305_signctx_init(): feeds |count|
 * bytes at |data| into the Poly1305 state of the pkey context bound to
 * |ctx|. Always returns 1.
 */
static int int_update(EVP_MD_CTX *ctx, const void *data, size_t count)
{
    EVP_PKEY_CTX *pkey_ctx = EVP_MD_CTX_pkey_ctx(ctx);
    POLY1305_PKEY_CTX *priv = EVP_PKEY_CTX_get_data(pkey_ctx);

    Poly1305_Update(&priv->ctx, data, count);

    return 1;
}
/*
 * signctx_init callback: route the digest context's update calls to
 * int_update() and start a MAC computation with the key stored in the
 * EVP_PKEY bound to |ctx|.
 *
 * Returns 1 on success, 0 when the key is not POLY1305_KEY_SIZE bytes.
 */
static int poly1305_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx)
{
    ASN1_OCTET_STRING *mac_key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr;
    POLY1305_PKEY_CTX *priv = ctx->data;

    if (mac_key->length == POLY1305_KEY_SIZE) {
        EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT);
        EVP_MD_CTX_set_update_fn(mctx, int_update);
        Poly1305_Init(&priv->ctx, mac_key->data);
        return 1;
    }

    return 0;
}
/*
 * signctx callback: report the tag length through |siglen| and, when
 * |sig| is non-NULL, finalise the MAC into it. A NULL |sig| therefore
 * acts as a pure length query. |mctx| is not used. Always returns 1.
 */
static int poly1305_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
                            EVP_MD_CTX *mctx)
{
    POLY1305_PKEY_CTX *priv = ctx->data;

    *siglen = POLY1305_DIGEST_SIZE;

    if (sig == NULL)
        return 1;

    Poly1305_Final(&priv->ctx, sig);
    return 1;
}
/*
 * ctrl callback.
 *
 *   EVP_PKEY_CTRL_MD           accepted and ignored.
 *   EVP_PKEY_CTRL_SET_MAC_KEY  key is |p2|, its length is |p1|.
 *   EVP_PKEY_CTRL_DIGESTINIT   key is taken from the EVP_PKEY bound to
 *                              |ctx|.
 *
 * For the two key-bearing operations the key is copied into ktmp and the
 * Poly1305 state is initialised from that copy.
 *
 * Returns 1 on success, 0 when the key is missing, is not
 * POLY1305_KEY_SIZE bytes or cannot be stored, and -2 for any other
 * |type|.
 */
static int pkey_poly1305_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
{
    POLY1305_PKEY_CTX *priv = EVP_PKEY_CTX_get_data(ctx);
    const unsigned char *raw;
    size_t raw_len;

    if (type == EVP_PKEY_CTRL_MD)
        return 1;                       /* ignore */

    if (type == EVP_PKEY_CTRL_SET_MAC_KEY) {
        /* user explicitly setting the key */
        raw = p2;
        raw_len = p1;
    } else if (type == EVP_PKEY_CTRL_DIGESTINIT) {
        /* user indirectly setting the key via EVP_DigestSignInit */
        raw = EVP_PKEY_get0_poly1305(EVP_PKEY_CTX_get0_pkey(ctx), &raw_len);
    } else {
        return -2;
    }

    if (raw == NULL)
        return 0;
    if (raw_len != POLY1305_KEY_SIZE)
        return 0;
    if (!ASN1_OCTET_STRING_set(&priv->ktmp, raw, raw_len))
        return 0;

    Poly1305_Init(&priv->ctx, ASN1_STRING_get0_data(&priv->ktmp));
    return 1;
}
/*
 * ctrl_str callback: set the MAC key from text. |type| "key" forwards
 * |value| through EVP_PKEY_CTX_str2ctrl(), "hexkey" through
 * EVP_PKEY_CTX_hex2ctrl(); both issue EVP_PKEY_CTRL_SET_MAC_KEY.
 *
 * Returns 0 when |value| is NULL, -2 for any other |type|, otherwise the
 * result of the forwarding call.
 */
static int pkey_poly1305_ctrl_str(EVP_PKEY_CTX *ctx,
                                  const char *type, const char *value)
{
    int outcome = -2;

    if (value == NULL)
        return 0;

    if (strcmp(type, "key") == 0)
        outcome = EVP_PKEY_CTX_str2ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, value);
    else if (strcmp(type, "hexkey") == 0)
        outcome = EVP_PKEY_CTX_hex2ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, value);

    return outcome;
}
/*
 * EVP_PKEY method table for EVP_PKEY_POLY1305. It installs the init,
 * copy, cleanup, keygen, signctx_init, signctx, ctrl and ctrl_str
 * callbacks defined in this file; every other slot is 0.
 *
 * The initialiser is positional, so each entry must line up with the
 * field order of EVP_PKEY_METHOD (declared outside this file -
 * presumably in internal/evp_int.h; confirm before adding or removing
 * entries).
 */
const EVP_PKEY_METHOD poly1305_pkey_meth = {
EVP_PKEY_POLY1305,
EVP_PKEY_FLAG_SIGCTX_CUSTOM, /* we don't deal with a separate MD */
pkey_poly1305_init,
pkey_poly1305_copy,
pkey_poly1305_cleanup,
0, 0,
0,
pkey_poly1305_keygen,
0, 0,
0, 0,
0, 0,
poly1305_signctx_init,
poly1305_signctx,
0, 0,
0, 0,
0, 0,
0, 0,
pkey_poly1305_ctrl,
pkey_poly1305_ctrl_str
};