1
0
Fork 0
mirror of https://github.com/ossrs/srs.git synced 2025-03-09 15:49:59 +00:00

Upgrade openssl from 1.1.0e to 1.1.1b, with source code. 4.0.78

This commit is contained in:
winlin 2021-03-01 20:47:57 +08:00
parent 8f1c992379
commit 96dbd7bced
1476 changed files with 616554 additions and 4 deletions

View file

@ -0,0 +1,241 @@
=pod
=head1 NAME
bn_mul_words, bn_mul_add_words, bn_sqr_words, bn_div_words,
bn_add_words, bn_sub_words, bn_mul_comba4, bn_mul_comba8,
bn_sqr_comba4, bn_sqr_comba8, bn_cmp_words, bn_mul_normal,
bn_mul_low_normal, bn_mul_recursive, bn_mul_part_recursive,
bn_mul_low_recursive, bn_sqr_normal, bn_sqr_recursive,
bn_expand, bn_wexpand, bn_expand2, bn_fix_top, bn_check_top,
bn_print, bn_dump, bn_set_max, bn_set_high, bn_set_low - BIGNUM
library internal functions
=head1 SYNOPSIS
#include <openssl/bn.h>
BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w);
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num,
BN_ULONG w);
void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num);
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,
int num);
BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,
int num);
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a);
void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a);
int bn_cmp_words(BN_ULONG *a, BN_ULONG *b, int n);
void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b,
int nb);
void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n);
void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
int dna, int dnb, BN_ULONG *tmp);
void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
int n, int tna, int tnb, BN_ULONG *tmp);
void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
int n2, BN_ULONG *tmp);
void bn_sqr_normal(BN_ULONG *r, BN_ULONG *a, int n, BN_ULONG *tmp);
void bn_sqr_recursive(BN_ULONG *r, BN_ULONG *a, int n2, BN_ULONG *tmp);
void mul(BN_ULONG r, BN_ULONG a, BN_ULONG w, BN_ULONG c);
void mul_add(BN_ULONG r, BN_ULONG a, BN_ULONG w, BN_ULONG c);
void sqr(BN_ULONG r0, BN_ULONG r1, BN_ULONG a);
BIGNUM *bn_expand(BIGNUM *a, int bits);
BIGNUM *bn_wexpand(BIGNUM *a, int n);
BIGNUM *bn_expand2(BIGNUM *a, int n);
void bn_fix_top(BIGNUM *a);
void bn_check_top(BIGNUM *a);
void bn_print(BIGNUM *a);
void bn_dump(BN_ULONG *d, int n);
void bn_set_max(BIGNUM *a);
void bn_set_high(BIGNUM *r, BIGNUM *a, int n);
void bn_set_low(BIGNUM *r, BIGNUM *a, int n);
=head1 DESCRIPTION
This page documents the internal functions used by the OpenSSL
B<BIGNUM> implementation. They are described here to facilitate
debugging and extending the library. They are I<not> to be used by
applications.
=head2 The BIGNUM structure
typedef struct bignum_st BIGNUM;
struct bignum_st
{
BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks. */
int top; /* Index of last used d +1. */
/* The next are internal book keeping for bn_expand. */
int dmax; /* Size of the d array. */
int neg; /* one if the number is negative */
int flags;
};
The integer value is stored in B<d>, a malloc()ed array of words (B<BN_ULONG>),
least significant word first. A B<BN_ULONG> can be either 16, 32 or 64 bits
in size, depending on the 'number of bits' (B<BITS2>) specified in
C<openssl/bn.h>.
B<dmax> is the size of the B<d> array that has been allocated. B<top>
is the number of words being used, so for a value of 4, bn.d[0]=4 and
bn.top=1. B<neg> is 1 if the number is negative. When a B<BIGNUM> is
B<0>, the B<d> field can be B<NULL> and B<top> == B<0>.
B<flags> is a bit field of flags which are defined in C<openssl/bn.h>. The
flags begin with B<BN_FLG_>. The macros BN_set_flags(b, n) and
BN_get_flags(b, n) exist to enable or fetch flag(s) B<n> from B<BIGNUM>
structure B<b>.
Various routines in this library require the use of temporary
B<BIGNUM> variables during their execution. Since dynamic memory
allocation to create B<BIGNUM>s is rather expensive when used in
conjunction with repeated subroutine calls, the B<BN_CTX> structure is
used. This structure contains B<BN_CTX_NUM> B<BIGNUM>s, see
L<BN_CTX_start(3)>.
=head2 Low-level arithmetic operations
These functions are implemented in C and for several platforms in
assembly language:
bn_mul_words(B<rp>, B<ap>, B<num>, B<w>) operates on the B<num> word
arrays B<rp> and B<ap>. It computes B<ap> * B<w>, places the result
in B<rp>, and returns the high word (carry).
bn_mul_add_words(B<rp>, B<ap>, B<num>, B<w>) operates on the B<num>
word arrays B<rp> and B<ap>. It computes B<ap> * B<w> + B<rp>, places
the result in B<rp>, and returns the high word (carry).
bn_sqr_words(B<rp>, B<ap>, B<n>) operates on the B<num> word array
B<ap> and the 2*B<num> word array B<ap>. It computes B<ap> * B<ap>
word-wise, and places the low and high bytes of the result in B<rp>.
bn_div_words(B<h>, B<l>, B<d>) divides the two word number (B<h>, B<l>)
by B<d> and returns the result.
bn_add_words(B<rp>, B<ap>, B<bp>, B<num>) operates on the B<num> word
arrays B<ap>, B<bp> and B<rp>. It computes B<ap> + B<bp>, places the
result in B<rp>, and returns the high word (carry).
bn_sub_words(B<rp>, B<ap>, B<bp>, B<num>) operates on the B<num> word
arrays B<ap>, B<bp> and B<rp>. It computes B<ap> - B<bp>, places the
result in B<rp>, and returns the carry (1 if B<bp> E<gt> B<ap>, 0
otherwise).
bn_mul_comba4(B<r>, B<a>, B<b>) operates on the 4 word arrays B<a> and
B<b> and the 8 word array B<r>. It computes B<a>*B<b> and places the
result in B<r>.
bn_mul_comba8(B<r>, B<a>, B<b>) operates on the 8 word arrays B<a> and
B<b> and the 16 word array B<r>. It computes B<a>*B<b> and places the
result in B<r>.
bn_sqr_comba4(B<r>, B<a>, B<b>) operates on the 4 word arrays B<a> and
B<b> and the 8 word array B<r>.
bn_sqr_comba8(B<r>, B<a>, B<b>) operates on the 8 word arrays B<a> and
B<b> and the 16 word array B<r>.
The following functions are implemented in C:
bn_cmp_words(B<a>, B<b>, B<n>) operates on the B<n> word arrays B<a>
and B<b>. It returns 1, 0 and -1 if B<a> is greater than, equal and
less than B<b>.
bn_mul_normal(B<r>, B<a>, B<na>, B<b>, B<nb>) operates on the B<na>
word array B<a>, the B<nb> word array B<b> and the B<na>+B<nb> word
array B<r>. It computes B<a>*B<b> and places the result in B<r>.
bn_mul_low_normal(B<r>, B<a>, B<b>, B<n>) operates on the B<n> word
arrays B<r>, B<a> and B<b>. It computes the B<n> low words of
B<a>*B<b> and places the result in B<r>.
bn_mul_recursive(B<r>, B<a>, B<b>, B<n2>, B<dna>, B<dnb>, B<t>) operates
on the word arrays B<a> and B<b> of length B<n2>+B<dna> and B<n2>+B<dnb>
(B<dna> and B<dnb> are currently allowed to be 0 or negative) and the 2*B<n2>
word arrays B<r> and B<t>. B<n2> must be a power of 2. It computes
B<a>*B<b> and places the result in B<r>.
bn_mul_part_recursive(B<r>, B<a>, B<b>, B<n>, B<tna>, B<tnb>, B<tmp>)
operates on the word arrays B<a> and B<b> of length B<n>+B<tna> and
B<n>+B<tnb> and the 4*B<n> word arrays B<r> and B<tmp>.
bn_mul_low_recursive(B<r>, B<a>, B<b>, B<n2>, B<tmp>) operates on the
B<n2> word arrays B<r> and B<tmp> and the B<n2>/2 word arrays B<a>
and B<b>.
BN_mul() calls bn_mul_normal(), or an optimized implementation if the
factors have the same size: bn_mul_comba8() is used if they are 8
words long, bn_mul_recursive() if they are larger than
B<BN_MULL_SIZE_NORMAL> and the size is an exact multiple of the word
size, and bn_mul_part_recursive() for others that are larger than
B<BN_MULL_SIZE_NORMAL>.
bn_sqr_normal(B<r>, B<a>, B<n>, B<tmp>) operates on the B<n> word array
B<a> and the 2*B<n> word arrays B<tmp> and B<r>.
The implementations use the following macros which, depending on the
architecture, may use "long long" C operations or inline assembler.
They are defined in C<bn_lcl.h>.
mul(B<r>, B<a>, B<w>, B<c>) computes B<w>*B<a>+B<c> and places the
low word of the result in B<r> and the high word in B<c>.
mul_add(B<r>, B<a>, B<w>, B<c>) computes B<w>*B<a>+B<r>+B<c> and
places the low word of the result in B<r> and the high word in B<c>.
sqr(B<r0>, B<r1>, B<a>) computes B<a>*B<a> and places the low word
of the result in B<r0> and the high word in B<r1>.
=head2 Size changes
bn_expand() ensures that B<b> has enough space for a B<bits> bit
number. bn_wexpand() ensures that B<b> has enough space for an
B<n> word number. If the number has to be expanded, both macros
call bn_expand2(), which allocates a new B<d> array and copies the
data. They return B<NULL> on error, B<b> otherwise.
The bn_fix_top() macro reduces B<a-E<gt>top> to point to the most
significant non-zero word plus one when B<a> has shrunk.
=head2 Debugging
bn_check_top() verifies that C<((a)-E<gt>top E<gt>= 0 && (a)-E<gt>top
E<lt>= (a)-E<gt>dmax)>. A violation will cause the program to abort.
bn_print() prints B<a> to stderr. bn_dump() prints B<n> words at B<d>
(in reverse order, i.e. most significant word first) to stderr.
bn_set_max() makes B<a> a static number with a B<dmax> of its current size.
This is used by bn_set_low() and bn_set_high() to make B<r> a read-only
B<BIGNUM> that contains the B<n> low or high words of B<a>.
If B<BN_DEBUG> is not defined, bn_check_top(), bn_print(), bn_dump()
and bn_set_max() are defined as empty macros.
=head1 SEE ALSO
L<bn(3)>
=head1 COPYRIGHT
Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
Licensed under the OpenSSL license (the "License"). You may not use
this file except in compliance with the License. You can obtain a copy
in the file LICENSE in the source distribution or at
L<https://www.openssl.org/source/license.html>.
=cut

View file

@ -0,0 +1,328 @@
#! /usr/bin/env perl
# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor compiler
# instructed to '-tune host' code with in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.
$output=pop;
open STDOUT,">$output";
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl bn_mul_mont
.align 5
.ent bn_mul_mont
bn_mul_mont:
lda sp,-48(sp)
stq ra,0(sp)
stq s3,8(sp)
stq s4,16(sp)
stq s5,24(sp)
stq fp,32(sp)
mov sp,fp
.mask 0x0400f000,-48
.frame fp,48,ra
.prologue 0
.align 4
.set reorder
sextl $num,$num
mov 0,v0
cmplt $num,4,AT
bne AT,.Lexit
ldq $hi0,0($ap) # ap[0]
s8addq $num,16,AT
ldq $aj,8($ap)
subq sp,AT,sp
ldq $bi,0($bp) # bp[0]
lda AT,-4096(zero) # mov -4096,AT
ldq $n0,0($n0)
and sp,AT,sp
mulq $hi0,$bi,$lo0
ldq $hi1,0($np) # np[0]
umulh $hi0,$bi,$hi0
ldq $nj,8($np)
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov 2,$j
umulh $aj,$bi,$ahi
mov sp,$tp
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
s8addq $j,$np,$nj
.align 4
.L1st:
.set noreorder
ldq $aj,0($aj)
addl $j,1,$j
ldq $nj,0($nj)
lda $tp,8($tp)
addq $alo,$hi0,$lo0
mulq $aj,$bi,$alo
cmpult $lo0,$hi0,AT
addq $nlo,$hi1,$lo1
mulq $nj,$m1,$nlo
addq $ahi,AT,$hi0
cmpult $lo1,$hi1,v0
cmplt $j,$num,$tj
umulh $aj,$bi,$ahi
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
s8addq $j,$np,$nj
stq $lo1,-8($tp)
nop
unop
bne $tj,.L1st
.set reorder
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
stq $lo1,0($tp)
addq $hi1,$hi0,$hi1
cmpult $hi1,$hi0,AT
stq $hi1,8($tp)
stq AT,16($tp)
mov 1,$i
.align 4
.Louter:
s8addq $i,$bp,$bi
ldq $hi0,0($ap)
ldq $aj,8($ap)
ldq $bi,0($bi)
ldq $hi1,0($np)
ldq $nj,8($np)
ldq $tj,0(sp)
mulq $hi0,$bi,$lo0
umulh $hi0,$bi,$hi0
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
mov 2,$j
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov sp,$tp
umulh $aj,$bi,$ahi
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
.align 4
.Linner:
.set noreorder
ldq $tj,8($tp) #L0
nop #U1
ldq $aj,0($aj) #L1
s8addq $j,$np,$nj #U0
ldq $nj,0($nj) #L0
nop #U1
addq $alo,$hi0,$lo0 #L1
lda $tp,8($tp)
mulq $aj,$bi,$alo #U1
cmpult $lo0,$hi0,AT #L0
addq $nlo,$hi1,$lo1 #L1
addl $j,1,$j
mulq $nj,$m1,$nlo #U1
addq $ahi,AT,$hi0 #L0
addq $lo0,$tj,$lo0 #L1
cmpult $lo1,$hi1,v0 #U0
umulh $aj,$bi,$ahi #U1
cmpult $lo0,$tj,AT #L0
addq $lo1,$lo0,$lo1 #L1
addq $nhi,v0,$hi1 #U0
umulh $nj,$m1,$nhi #U1
s8addq $j,$ap,$aj #L0
cmpult $lo1,$lo0,v0 #L1
cmplt $j,$num,$tj #U0 # borrow $tj
addq $hi0,AT,$hi0 #L0
addq $hi1,v0,$hi1 #U1
stq $lo1,-8($tp) #L1
bne $tj,.Linner #U0
.set reorder
ldq $tj,8($tp)
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
ldq $tj,16($tp)
addq $lo1,$lo0,$j
cmpult $j,$lo0,v0
addq $hi1,v0,$hi1
addq $hi1,$hi0,$lo1
stq $j,0($tp)
cmpult $lo1,$hi0,$hi1
addq $lo1,$tj,$lo1
cmpult $lo1,$tj,AT
addl $i,1,$i
addq $hi1,AT,$hi1
stq $lo1,8($tp)
cmplt $i,$num,$tj # borrow $tj
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$tj # &tp[num]
mov $rp,$bp # put rp aside
mov sp,$tp
mov sp,$ap
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,0($tp)
ldq $lo1,0($np)
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
or $hi0,AT,$hi0
stq $lo0,0($rp)
cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
mov $bp,$rp # restore rp
.align 4
.Lcopy: ldq $aj,0($tp) # conditional copy
ldq $nj,0($rp)
lda $tp,8($tp)
lda $rp,8($rp)
cmoveq $hi0,$nj,$aj
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.Lexit:
.set noreorder
mov fp,sp
/*ldq ra,0(sp)*/
ldq s3,8(sp)
ldq s4,16(sp)
ldq s5,24(sp)
ldq fp,32(sp)
lda sp,48(sp)
ret (ra)
.end bn_mul_mont
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
print $code;
close STDOUT;

View file

@ -0,0 +1,332 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU and NEON code
# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$code=<<___;
#include "arm_arch.h"
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
#ifdef __thumb2__
itt ne
#endif
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
#ifdef __thumb2__
itt ne
#endif
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
stmdb sp!,{r10,lr}
ldr r12,.LOPENSSL_armcap
adr r10,.LOPENSSL_armcap
ldr r12,[r12,r10]
#ifdef __APPLE__
ldr r12,[r12]
#endif
tst r12,#ARMV7_NEON
itt ne
ldrne r10,[sp],#8
bne .LNEON
stmdb sp!,{r4-r9}
#else
stmdb sp!,{r4-r10,lr}
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
sub r7,sp,#36
mov r8,sp
and r7,r7,#-32
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
mov sp,r7 @ allocate tab[8]
str r8,[r7,#32]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
ldr sp,[sp,#32] @ destroy tab[8]
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
___
}
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
vmov $a, r2, r1
vmov $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vst1.32 {$r}, [r0]
ret @ bx lr
#endif
___
}
$code.=<<___;
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush

View file

@ -0,0 +1,757 @@
#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
# September 2015
#
# Align Cortex-A9 performance with November 2013 improvements, i.e.
# NEON code is now ~20-105% faster than integer-only one on this
# processor. But this optimization further improved performance even
# on other processors: NEON code path is ~45-180% faster than original
# integer-only on Cortex-A8, ~10-210% on Cortex-A15, ~70-450% on
# Snapdragon S4.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include "arm_arch.h"
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lbn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
.Lbn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,.Lbn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
#ifdef __APPLE__
ldr r0,[r0]
#endif
tst r0,#ARMV7_NEON @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov $num,ip @ load num
#ifdef __thumb2__
ittt lt
#endif
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
mov $tj,sp
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,$tj @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
#ifdef __thumb2__
itt ne
#endif
movne $tj,sp
bne .Louter
ldr $rp,[$_rp] @ pull rp
mov $aj,sp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,$aj @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
.Lcopy: ldr $tj,[$tp] @ conditional copy
ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
#ifdef __thumb2__
it cc
#endif
movcc $aj,$tj
str $aj,[$rp],#4
teq $tp,$num @ preserve carry
bne .Lcopy
mov sp,$num
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
mov ip,sp
cmp $num,#8
bhi .LNEON_8n
@ special case for $num==8, everything is in register bank...
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
sub $toutptr,sp,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
vzip.16 $Bi,$zero
vmull.u32 @ACC[0],$Bi,${A0}[0]
vmull.u32 @ACC[1],$Bi,${A0}[1]
vmull.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmull.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
vmul.u32 $Ni,$Ni,$M0
vmull.u32 @ACC[4],$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 @ACC[5],$Bi,${A2}[1]
vmull.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmov $Temp,@ACC[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmov @ACC[0],@ACC[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmov @ACC[1],@ACC[2]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vmov @ACC[2],@ACC[3]
vmov @ACC[3],@ACC[4]
vshr.u64 $temp,$temp,#16
vmov @ACC[4],@ACC[5]
vmov @ACC[5],@ACC[6]
vadd.u64 $temp,$temp,$Temp#hi
vmov @ACC[6],@ACC[7]
veor @ACC[7],@ACC[7]
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
mov $toutptr,sp
vshr.u64 $temp,@ACC[0]#lo,#16
mov $inner,$num
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
add $tinptr,sp,#96
vshr.u64 $temp,@ACC[0]#hi,#16
vzip.16 @ACC[0]#lo,@ACC[0]#hi
b .LNEON_tail_entry
.align 4
.LNEON_8n:
veor @ACC[0],@ACC[0],@ACC[0]
sub $toutptr,sp,#128
veor @ACC[1],@ACC[1],@ACC[1]
sub $toutptr,$toutptr,$num,lsl#4
veor @ACC[2],@ACC[2],@ACC[2]
and $toutptr,$toutptr,#-64
veor @ACC[3],@ACC[3],@ACC[3]
mov sp,$toutptr @ alloca
veor @ACC[4],@ACC[4],@ACC[4]
add $toutptr,$toutptr,#256
veor @ACC[5],@ACC[5],@ACC[5]
sub $inner,$num,#8
veor @ACC[6],@ACC[6],@ACC[6]
veor @ACC[7],@ACC[7],@ACC[7]
.LNEON_8n_init:
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
subs $inner,$inner,#8
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]-@ACC[7]},[$toutptr,:256]!
bne .LNEON_8n_init
add $tinptr,sp,#256
vld1.32 {$A0-$A3},[$aptr]!
add $bnptr,sp,#8
vld1.32 {${M0}[0]},[$n0,:32]
mov $outer,$num
b .LNEON_8n_outer
.align 4
.LNEON_8n_outer:
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
add $toutptr,sp,#128
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
vld1.32 {${Bi}[0]},[$bptr,:32]! @ *b++
vmlal.u32 @ACC[0],$Ni,${N0}[0]
veor $temp,$temp,$temp
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vzip.16 $Bi,$temp
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64]! @ put aside smashed m[8*i+$i]
___
push(@ACC,shift(@ACC)); $i++;
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]!
vmlal.u32 @ACC[1],$Bi,${A0}[1]
veor $zero,$zero,$zero
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vshl.i64 $Ni,@ACC[0]#hi,#16
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vadd.u64 $Ni,$Ni,@ACC[0]#lo
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmul.u32 $Ni,$Ni,$M0
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vst1.32 {$Bi},[$bnptr,:64]! @ put aside smashed b[8*i+$i]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vshr.u64 @ACC[0]#lo,@ACC[0]#lo,#16
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
vst1.32 {$Ni},[$bnptr,:64] @ put aside smashed m[8*i+$i]
add $bnptr,sp,#8 @ rewind
___
push(@ACC,shift(@ACC));
$code.=<<___;
sub $inner,$num,#8
b .LNEON_8n_inner
.align 4
.LNEON_8n_inner:
subs $inner,$inner,#8
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+0]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
vld1.32 {$N0-$N3},[$nptr]!
vmlal.u32 @ACC[3],$Bi,${A1}[1]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vld1.32 {$Bi},[$bnptr,:64]! @ pull smashed b[8*i+$i]
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vmlal.u32 @ACC[2],$Ni,${N1}[0]
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vmlal.u32 @ACC[7],$Ni,${N3}[1]
vst1.64 {@ACC[0]},[$toutptr,:128]!
___
push(@ACC,shift(@ACC));
$code.=<<___;
vmlal.u32 @ACC[0],$Bi,${A0}[0]
vld1.64 {@ACC[7]},[$tinptr,:128]
vmlal.u32 @ACC[1],$Bi,${A0}[1]
vld1.32 {$Ni},[$bnptr,:64]! @ pull smashed m[8*i+$i]
vmlal.u32 @ACC[2],$Bi,${A1}[0]
it ne
addne $tinptr,$tinptr,#16 @ don't advance in last iteration
vmlal.u32 @ACC[3],$Bi,${A1}[1]
vmlal.u32 @ACC[4],$Bi,${A2}[0]
vmlal.u32 @ACC[5],$Bi,${A2}[1]
vmlal.u32 @ACC[6],$Bi,${A3}[0]
vmlal.u32 @ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
it eq
subeq $aptr,$aptr,$num,lsl#2 @ rewind
vmlal.u32 @ACC[0],$Ni,${N0}[0]
vld1.32 {$Bi},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 @ACC[1],$Ni,${N0}[1]
vld1.32 {$A0-$A3},[$aptr]!
vmlal.u32 @ACC[2],$Ni,${N1}[0]
add $bnptr,sp,#8 @ rewind
vmlal.u32 @ACC[3],$Ni,${N1}[1]
vmlal.u32 @ACC[4],$Ni,${N2}[0]
vmlal.u32 @ACC[5],$Ni,${N2}[1]
vmlal.u32 @ACC[6],$Ni,${N3}[0]
vst1.64 {@ACC[0]},[$toutptr,:128]!
vmlal.u32 @ACC[7],$Ni,${N3}[1]
bne .LNEON_8n_inner
___
push(@ACC,shift(@ACC));
$code.=<<___;
add $tinptr,sp,#128
vst1.64 {@ACC[0]-@ACC[1]},[$toutptr,:256]!
veor q2,q2,q2 @ $N0-$N1
vst1.64 {@ACC[2]-@ACC[3]},[$toutptr,:256]!
veor q3,q3,q3 @ $N2-$N3
vst1.64 {@ACC[4]-@ACC[5]},[$toutptr,:256]!
vst1.64 {@ACC[6]},[$toutptr,:128]
subs $outer,$outer,#8
vld1.64 {@ACC[0]-@ACC[1]},[$tinptr,:256]!
vld1.64 {@ACC[2]-@ACC[3]},[$tinptr,:256]!
vld1.64 {@ACC[4]-@ACC[5]},[$tinptr,:256]!
vld1.64 {@ACC[6]-@ACC[7]},[$tinptr,:256]!
itt ne
subne $nptr,$nptr,$num,lsl#2 @ rewind
bne .LNEON_8n_outer
add $toutptr,sp,#128
vst1.64 {q2-q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 $temp,@ACC[0]#lo,#16
vst1.64 {q2-q3},[sp,:256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vst1.64 {q2-q3}, [sp,:256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vst1.64 {q2-q3}, [sp,:256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
mov $inner,$num
b .LNEON_tail_entry
.align 4
.LNEON_tail:
vadd.u64 @ACC[0]#lo,@ACC[0]#lo,$temp
vshr.u64 $temp,@ACC[0]#lo,#16
vld1.64 {@ACC[2]-@ACC[3]}, [$tinptr, :256]!
vadd.u64 @ACC[0]#hi,@ACC[0]#hi,$temp
vld1.64 {@ACC[4]-@ACC[5]}, [$tinptr, :256]!
vshr.u64 $temp,@ACC[0]#hi,#16
vld1.64 {@ACC[6]-@ACC[7]}, [$tinptr, :256]!
vzip.16 @ACC[0]#lo,@ACC[0]#hi
.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
vadd.u64 @ACC[1]#lo,@ACC[1]#lo,$temp
vst1.32 {@ACC[0]#lo[0]}, [$toutptr, :32]!
vshr.u64 $temp,@ACC[1]#lo,#16
vadd.u64 @ACC[1]#hi,@ACC[1]#hi,$temp
vshr.u64 $temp,@ACC[1]#hi,#16
vzip.16 @ACC[1]#lo,@ACC[1]#hi
___
push(@ACC,shift(@ACC));
}
push(@ACC,shift(@ACC));
$code.=<<___;
vld1.64 {@ACC[0]-@ACC[1]}, [$tinptr, :256]!
subs $inner,$inner,#8
vst1.32 {@ACC[7]#lo[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
mov r11,sp
veor q0,q0,q0
sub r11,$bptr,r11 @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
it cc
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
it cc
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge or
s/\bret\b/bx lr/g or
s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,785 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0]);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");
&asm_finish();
close STDOUT;
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_part_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP(0,$a,"",0)); # *a
&mov($tmp2,&DWP(0,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP(0,$r,"",0),$tmp1); # *r
&add($a, 4);
&add($b, 4);
&add($r, 4);
&dec($num) if ($i != 6);
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
&cmp(&wparam(4),0);
&je(&label("pw_end"));
&mov($num,&wparam(4)); # get dl
&cmp($num,0);
&je(&label("pw_end"));
&jge(&label("pw_pos"));
&comment("pw_neg");
&mov($tmp2,0);
&sub($tmp2,$num);
&mov($num,$tmp2);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_neg_finish"));
&set_label("pw_neg_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl<0 Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("pw_end")) if ($i != 6);
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
&set_label("pw_pos_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl>0 Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_tail_nc".$i));
&dec($num) if ($i != 6);
&jz(&label("pw_end")) if ($i != 6);
}
&mov($c,1);
&jmp(&label("pw_end"));
&set_label("pw_nc_loop",0);
for ($i=0; $i<8; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_tail_nc".$i,0);
&dec($num) if ($i != 6);
&jz(&label("pw_nc_end")) if ($i != 6);
}
&set_label("pw_nc_end",0);
&mov($c,0);
&set_label("pw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}

View file

@ -0,0 +1,382 @@
;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
;;
;; Licensed under the OpenSSL license (the "License"). You may not use
;; this file except in compliance with the License. You can obtain a copy
;; in the file LICENSE in the source distribution or at
;; https://www.openssl.org/source/license.html
;;
;;====================================================================
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
;; project.
;;
;; Rights for redistribution and usage in source and binary forms are
;; granted according to the OpenSSL license. Warranty of any kind is
;; disclaimed.
;;====================================================================
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
;;====================================================================
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.asg bn_mul_add_words,_bn_mul_add_words
.asg bn_mul_words,_bn_mul_words
.asg bn_sqr_words,_bn_sqr_words
.asg bn_add_words,_bn_add_words
.asg bn_sub_words,_bn_sub_words
.asg bn_div_words,_bn_div_words
.asg bn_sqr_comba8,_bn_sqr_comba8
.asg bn_mul_comba8,_bn_mul_comba8
.asg bn_sqr_comba4,_bn_sqr_comba4
.asg bn_mul_comba4,_bn_mul_comba4
.endif
.asg B3,RA
.asg A4,ARG0
.asg B4,ARG1
.asg A6,ARG2
.asg B6,ARG3
.asg A8,ARG4
.asg B8,ARG5
.asg A4,RET
.asg A15,FP
.asg B14,DP
.asg B15,SP
.global _bn_mul_add_words
_bn_mul_add_words:
.asmfunc
MV ARG2,B0
[!B0] BNOP RA
||[!B0] MVK 0,RET
[B0] MVC B0,ILC
[B0] ZERO A19 ; high part of accumulator
|| [B0] MV ARG0,A2
|| [B0] MV ARG3,A3
NOP 3
SPLOOP 2 ; 2*n+10
;;====================================================================
LDW *ARG1++,B7 ; ap[i]
NOP 3
LDW *ARG0++,A7 ; rp[i]
MPY32U B7,A3,A17:A16
NOP 3 ; [2,0] in epilogue
ADDU A16,A7,A21:A20
ADDU A19,A21:A20,A19:A18
|| MV.S A17,A23
SPKERNEL 2,1 ; leave slot for "return value"
|| STW A18,*A2++ ; rp[i]
|| ADD A19,A23,A19
;;====================================================================
BNOP RA,4
MV A19,RET ; return value
.endasmfunc
.global _bn_mul_words
_bn_mul_words:
.asmfunc
MV ARG2,B0
[!B0] BNOP RA
||[!B0] MVK 0,RET
[B0] MVC B0,ILC
[B0] ZERO A19 ; high part of accumulator
NOP 3
SPLOOP 2 ; 2*n+10
;;====================================================================
LDW *ARG1++,A7 ; ap[i]
NOP 4
MPY32U A7,ARG3,A17:A16
NOP 4 ; [2,0] in epiloque
ADDU A19,A16,A19:A18
|| MV.S A17,A21
SPKERNEL 2,1 ; leave slot for "return value"
|| STW A18,*ARG0++ ; rp[i]
|| ADD.L A19,A21,A19
;;====================================================================
BNOP RA,4
MV A19,RET ; return value
.endasmfunc
.global _bn_sqr_words
_bn_sqr_words:
.asmfunc
MV ARG2,B0
[!B0] BNOP RA
||[!B0] MVK 0,RET
[B0] MVC B0,ILC
[B0] MV ARG0,B2
|| [B0] ADD 4,ARG0,ARG0
NOP 3
SPLOOP 2 ; 2*n+10
;;====================================================================
LDW *ARG1++,B7 ; ap[i]
NOP 4
MPY32U B7,B7,B1:B0
NOP 3 ; [2,0] in epilogue
STW B0,*B2++(8) ; rp[2*i]
MV B1,A1
SPKERNEL 2,0 ; fully overlap BNOP RA,5
|| STW A1,*ARG0++(8) ; rp[2*i+1]
;;====================================================================
BNOP RA,5
.endasmfunc
.global _bn_add_words
_bn_add_words:
.asmfunc
MV ARG3,B0
[!B0] BNOP RA
||[!B0] MVK 0,RET
[B0] MVC B0,ILC
[B0] ZERO A1 ; carry flag
|| [B0] MV ARG0,A3
NOP 3
SPLOOP 2 ; 2*n+6
;;====================================================================
LDW *ARG2++,A7 ; bp[i]
|| LDW *ARG1++,B7 ; ap[i]
NOP 4
ADDU A7,B7,A9:A8
ADDU A1,A9:A8,A1:A0
SPKERNEL 0,0 ; fully overlap BNOP RA,5
|| STW A0,*A3++ ; write result
|| MV A1,RET ; keep carry flag in RET
;;====================================================================
BNOP RA,5
.endasmfunc
.global _bn_sub_words
_bn_sub_words:
.asmfunc
MV ARG3,B0
[!B0] BNOP RA
||[!B0] MVK 0,RET
[B0] MVC B0,ILC
[B0] ZERO A2 ; borrow flag
|| [B0] MV ARG0,A3
NOP 3
SPLOOP 2 ; 2*n+6
;;====================================================================
LDW *ARG2++,A7 ; bp[i]
|| LDW *ARG1++,B7 ; ap[i]
NOP 4
SUBU B7,A7,A1:A0
[A2] SUB A1:A0,1,A1:A0
SPKERNEL 0,1 ; leave slot for "return borrow flag"
|| STW A0,*A3++ ; write result
|| AND 1,A1,A2 ; pass on borrow flag
;;====================================================================
BNOP RA,4
AND 1,A1,RET ; return borrow flag
.endasmfunc
.global _bn_div_words
_bn_div_words:
.asmfunc
LMBD 1,A6,A0 ; leading zero bits in dv
LMBD 1,A4,A1 ; leading zero bits in hi
|| MVK 32,B0
CMPLTU A1,A0,A2
|| ADD A0,B0,B0
[ A2] BNOP RA
||[ A2] MVK -1,A4 ; return overflow
||[!A2] MV A4,A3 ; reassign hi
[!A2] MV B4,A4 ; reassign lo, will be quotient
||[!A2] MVC B0,ILC
[!A2] SHL A6,A0,A6 ; normalize dv
|| MVK 1,A1
[!A2] CMPLTU A3,A6,A1 ; hi<dv?
||[!A2] SHL A4,1,A5:A4 ; lo<<1
[!A1] SUB A3,A6,A3 ; hi-=dv
||[!A1] OR 1,A4,A4
[!A2] SHRU A3,31,A1 ; upper bit
||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
SPLOOP 3
[!A1] CMPLTU A3,A6,A1 ; hi<dv?
||[ A1] ZERO A1
|| SHL A4,1,A5:A4 ; lo<<1
[!A1] SUB A3,A6,A3 ; hi-=dv
||[!A1] OR 1,A4,A4 ; quotient
SHRU A3,31,A1 ; upper bit
|| ADDAH A5,A3,A3 ; hi<<1|lo>>31
SPKERNEL
BNOP RA,5
.endasmfunc
;;====================================================================
;; Not really Comba algorithm, just straightforward NxM... Dedicated
;; fully unrolled real Comba implementations are asymptotically 2x
;; faster, but naturally larger undertaking. Purpose of this exercise
;; was rather to learn to master nested SPLOOPs...
;;====================================================================
.global _bn_sqr_comba8
.global _bn_mul_comba8
_bn_sqr_comba8:
MV ARG1,ARG2
_bn_mul_comba8:
.asmfunc
MVK 8,B0 ; N, RILC
|| MVK 8,A0 ; M, outer loop counter
|| MV ARG1,A5 ; copy ap
|| MV ARG0,B4 ; copy rp
|| ZERO B19 ; high part of accumulator
MVC B0,RILC
|| SUB B0,2,B1 ; N-2, initial ILC
|| SUB B0,1,B2 ; const B2=N-1
|| LDW *A5++,B6 ; ap[0]
|| MV A0,A3 ; const A3=M
sploopNxM?: ; for best performance arrange M<=N
[A0] SPLOOPD 2 ; 2*n+10
|| MVC B1,ILC
|| ADDAW B4,B0,B5
|| ZERO B7
|| LDW *A5++,A9 ; pre-fetch ap[1]
|| ZERO A1
|| SUB A0,1,A0
;;====================================================================
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
;; This is because of Advisory 15 from TI publication SPRZ247I.
LDW *ARG2++,A7 ; bp[i]
NOP 3
[A1] LDW *B5++,B7 ; rp[i]
MPY32U A7,B6,B17:B16
NOP 3
ADDU B16,B7,B21:B20
ADDU B19,B21:B20,B19:B18
|| MV.S B17,B23
SPKERNEL
|| STW B18,*B4++ ; rp[i]
|| ADD.S B19,B23,B19
;;====================================================================
outer?: ; m*2*(n+1)+10
SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
SPMASKR
|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
MVD A9,B6 ; move through .M unit(*)
[A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
SUBAW B5,B2,B5 ; rewind rp to rp[1]
MVK 1,A1
[A0] BNOP.S1 outer?,4
|| [A0] SUB.L A0,1,A0
STW B19,*B4--[B2] ; rewind rp tp rp[1]
|| ZERO.S B19 ; high part of accumulator
;; end of outer?
BNOP RA,5 ; return
.endasmfunc
;; (*) It should be noted that B6 is used as input to MPY32U in
;; chronologically next cycle in *preceding* SPLOOP iteration.
;; Normally such arrangement would require DINT, but at this
;; point SPLOOP is draining and interrupts are disabled
;; implicitly.
.global _bn_sqr_comba4
.global _bn_mul_comba4
_bn_sqr_comba4:
MV ARG1,ARG2
_bn_mul_comba4:
.asmfunc
.if 0
BNOP sploopNxM?,3
;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
;; because of low-counter effect, when prologue phase finishes
;; before SPKERNEL instruction is reached. As result it's 25%
;; slower than expected...
MVK 4,B0 ; N, RILC
|| MVK 4,A0 ; M, outer loop counter
|| MV ARG1,A5 ; copy ap
|| MV ARG0,B4 ; copy rp
|| ZERO B19 ; high part of accumulator
MVC B0,RILC
|| SUB B0,2,B1 ; first ILC
|| SUB B0,1,B2 ; const B2=N-1
|| LDW *A5++,B6 ; ap[0]
|| MV A0,A3 ; const A3=M
.else
;; This alternative is an exercise in fully unrolled Comba
;; algorithm implementation that operates at n*(n+1)+12, or
;; as little as 32 cycles...
LDW *ARG1[0],B16 ; a[0]
|| LDW *ARG2[0],A16 ; b[0]
LDW *ARG1[1],B17 ; a[1]
|| LDW *ARG2[1],A17 ; b[1]
LDW *ARG1[2],B18 ; a[2]
|| LDW *ARG2[2],A18 ; b[2]
LDW *ARG1[3],B19 ; a[3]
|| LDW *ARG2[3],A19 ; b[3]
NOP
MPY32U A16,B16,A1:A0 ; a[0]*b[0]
MPY32U A17,B16,A23:A22 ; a[0]*b[1]
MPY32U A16,B17,A25:A24 ; a[1]*b[0]
MPY32U A16,B18,A27:A26 ; a[2]*b[0]
STW A0,*ARG0[0]
|| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
MPY32U A18,B16,A31:A30 ; a[0]*b[2]
|| ADDU A22,A1,A1:A0
MV A23,B0
|| MPY32U A19,B16,A21:A20 ; a[3]*b[0]
|| ADDU A24,A1:A0,A1:A0
ADDU A25,B0,B1:B0
|| STW A0,*ARG0[1]
|| MPY32U A18,B17,A23:A22 ; a[2]*b[1]
|| ADDU A26,A1,A9:A8
ADDU A27,B1,B9:B8
|| MPY32U A17,B18,A25:A24 ; a[1]*b[2]
|| ADDU A28,A9:A8,A9:A8
ADDU A29,B9:B8,B9:B8
|| MPY32U A16,B19,A27:A26 ; a[0]*b[3]
|| ADDU A30,A9:A8,A9:A8
ADDU A31,B9:B8,B9:B8
|| ADDU B0,A9:A8,A9:A8
STW A8,*ARG0[2]
|| ADDU A20,A9,A1:A0
ADDU A21,B9,B1:B0
|| MPY32U A19,B17,A21:A20 ; a[3]*b[1]
|| ADDU A22,A1:A0,A1:A0
ADDU A23,B1:B0,B1:B0
|| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
|| ADDU A24,A1:A0,A1:A0
ADDU A25,B1:B0,B1:B0
|| MPY32U A17,B19,A25:A24 ; a[1]*b[3]
|| ADDU A26,A1:A0,A1:A0
ADDU A27,B1:B0,B1:B0
|| ADDU B8,A1:A0,A1:A0
STW A0,*ARG0[3]
|| MPY32U A19,B18,A27:A26 ; a[3]*b[2]
|| ADDU A20,A1,A9:A8
ADDU A21,B1,B9:B8
|| MPY32U A18,B19,A29:A28 ; a[2]*b[3]
|| ADDU A22,A9:A8,A9:A8
ADDU A23,B9:B8,B9:B8
|| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
|| ADDU A24,A9:A8,A9:A8
ADDU A25,B9:B8,B9:B8
|| ADDU B0,A9:A8,A9:A8
STW A8,*ARG0[4]
|| ADDU A26,A9,A1:A0
ADDU A27,B9,B1:B0
|| ADDU A28,A1:A0,A1:A0
ADDU A29,B1:B0,B1:B0
|| BNOP RA
|| ADDU B8,A1:A0,A1:A0
STW A0,*ARG0[5]
|| ADDU A30,A1,A9:A8
ADD A31,B1,B8
ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
ADD B8,A9,A9
|| STW A8,*ARG0[6]
STW A9,*ARG0[7]
.endif
.endasmfunc

View file

@ -0,0 +1,160 @@
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8"); # argument vector
($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
($A,$B)=($Alo,$B_1);
$xFF="B1";
sub mul_1x1_upper {
my ($A,$B)=@_;
$code.=<<___;
EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
|| AND $B,$xFF,$B_0
|| SHRU $B,24,$B_3
SHRU $A,16, $Ahi ; smash $A to two halfwords
|| EXTU $A,16,16,$Alo
XORMPY $Alo,$B_2,$Alox2 ; 16x8 bits multiplication
|| XORMPY $Ahi,$B_2,$Ahix2
|| EXTU $B,16,24,$B_1
XORMPY $Alo,$B_0,$Alox0
|| XORMPY $Ahi,$B_0,$Ahix0
XORMPY $Alo,$B_3,$Alox3
|| XORMPY $Ahi,$B_3,$Ahix3
XORMPY $Alo,$B_1,$Alox1
|| XORMPY $Ahi,$B_1,$Ahix1
___
}
sub mul_1x1_merged {
my ($OUTlo,$OUThi,$A,$B)=@_;
$code.=<<___;
EXTU $B,8,24,$B_2 ; smash $B to 4 bytes
|| AND $B,$xFF,$B_0
|| SHRU $B,24,$B_3
SHRU $A,16, $Ahi ; smash $A to two halfwords
|| EXTU $A,16,16,$Alo
XOR $Ahix0,$Alox2,$Ahix0
|| MV $Ahix2,$OUThi
|| XORMPY $Alo,$B_2,$Alox2
XORMPY $Ahi,$B_2,$Ahix2
|| EXTU $B,16,24,$B_1
|| XORMPY $Alo,$B_0,A1 ; $Alox0
XOR $Ahix1,$Alox3,$Ahix1
|| SHL $Ahix0,16,$OUTlo
|| SHRU $Ahix0,16,$Ahix0
XOR $Alox0,$OUTlo,$OUTlo
|| XOR $Ahix0,$OUThi,$OUThi
|| XORMPY $Ahi,$B_0,$Ahix0
|| XORMPY $Alo,$B_3,$Alox3
|| SHL $Alox1,8,$Alox1
|| SHL $Ahix3,8,$Ahix3
XOR $Alox1,$OUTlo,$OUTlo
|| XOR $Ahix3,$OUThi,$OUThi
|| XORMPY $Ahi,$B_3,$Ahix3
|| SHL $Ahix1,24,$Alox1
|| SHRU $Ahix1,8, $Ahix1
XOR $Alox1,$OUTlo,$OUTlo
|| XOR $Ahix1,$OUThi,$OUThi
|| XORMPY $Alo,$B_1,$Alox1
|| XORMPY $Ahi,$B_1,$Ahix1
|| MV A1,$Alox0
___
}
sub mul_1x1_lower {
my ($OUTlo,$OUThi)=@_;
$code.=<<___;
;NOP
XOR $Ahix0,$Alox2,$Ahix0
|| MV $Ahix2,$OUThi
NOP
XOR $Ahix1,$Alox3,$Ahix1
|| SHL $Ahix0,16,$OUTlo
|| SHRU $Ahix0,16,$Ahix0
XOR $Alox0,$OUTlo,$OUTlo
|| XOR $Ahix0,$OUThi,$OUThi
|| SHL $Alox1,8,$Alox1
|| SHL $Ahix3,8,$Ahix3
XOR $Alox1,$OUTlo,$OUTlo
|| XOR $Ahix3,$OUThi,$OUThi
|| SHL $Ahix1,24,$Alox1
|| SHRU $Ahix1,8, $Ahix1
XOR $Alox1,$OUTlo,$OUTlo
|| XOR $Ahix1,$OUThi,$OUThi
___
}
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.asg bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
.endif
.global _bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
.asmfunc
MVK 0xFF,$xFF
___
&mul_1x1_upper($a0,$b0); # a0·b0
$code.=<<___;
|| MV $b1,$B
MV $a1,$A
___
&mul_1x1_merged("A28","B28",$A,$B); # a0·b0/a1·b1
$code.=<<___;
|| XOR $b0,$b1,$B
XOR $a0,$a1,$A
___
&mul_1x1_merged("A31","B31",$A,$B); # a1·b1/(a0+a1)·(b0+b1)
$code.=<<___;
XOR A28,A31,A29
|| XOR B28,B31,B29 ; a0·b0+a1·b1
___
&mul_1x1_lower("A30","B30"); # (a0+a1)·(b0+b1)
$code.=<<___;
|| BNOP B3
XOR A29,A30,A30
|| XOR B29,B30,B30 ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
XOR B28,A30,A30
|| STW A28,*${rp}[0]
XOR B30,A31,A31
|| STW A30,*${rp}[1]
STW A31,*${rp}[2]
STW B31,*${rp}[3]
.endasmfunc
___
print $code;
close STDOUT;

View file

@ -0,0 +1,298 @@
#! /usr/bin/env perl
# Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0]);
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
close STDOUT;
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}

View file

@ -0,0 +1,860 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
# stalls after ldf8, xma and getf.sig outside inner loop and
# improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
# once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
# acute interest, because upcoming Tukwila's individual cores are
# reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
# sign verify sign/s verify/s
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
#
# As it can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.
$output=pop;
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
$code=<<___;
.explicit
.text
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
bn_mul_mont:
.prologue
.body
{ .mmi; cmp4.le p6,p7=2,r37;;
(p6) cmp4.lt.unc p8,p9=8,r37
mov ret0=r0 };;
{ .bbb;
(p9) br.cond.dptk.many bn_mul_mont_8
(p8) br.cond.dpnt.many bn_mul_mont_general
(p7) br.ret.spnt.many b0 };;
.endp bn_mul_mont#
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
tptr=r16; // &tp[0]
tp_1=r17; // &tp[-1]
num=r18; len=r19; lc=r20;
topbit=r21; // carry bit from tmp[num]
n0=f6;
m0=f7;
bi=f8;
.align 64
.local bn_mul_mont_general#
.proc bn_mul_mont_general#
bn_mul_mont_general:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
$ADDP aptr=0,in1
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; .vframe prevsp
mov prevsp=sp
$ADDP bptr=0,in2
.save pr,prevpr
mov prevpr=pr };;
.body
.rotf alo[6],nlo[4],ahi[8],nhi[6]
.rotr a[3],n[3],t[2]
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 alo[4]=[aptr],16 // ap[0]
$ADDP r30=8,in1 };;
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
ldf8 alo[2]=[aptr],16 // ap[2]
$ADDP in4=0,in4 };;
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
ldf8 n0=[in4] // n0
$ADDP rptr=0,in0 }
{ .mmi; $ADDP nptr=0,in3
mov r31=16
zxt4 num=in5 };;
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
shladd len=num,3,r0
shladd r31=num,3,r31 };;
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
add lc=-5,num
sub r31=sp,r31 };;
{ .mfb; and sp=-16,r31 // alloca
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
nop.b 0 }
{ .mfb; nop.m 0
xmpy.lu alo[4]=alo[4],bi
brp.loop.imp .L1st_ctop,.L1st_cend-16
};;
{ .mfi; nop.m 0
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
add tp_1=8,sp }
{ .mfi; nop.m 0
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20001f<<16
// ------^----- (p40) at first (p23)
// ----------^^ p[16:20]=1
};;
{ .mfi; nop.m 0
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
mov ar.lc=lc }
{ .mfi; nop.m 0
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
.align 32
.L1st_ctop:
.pred.rel "mutex",p40,p42
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23) }
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p23) st8 [tp_1]=n[2],8
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mmb; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
br.ctop.sptk .L1st_ctop };;
.L1st_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
add num=-1,num };; // num--
{ .mmi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
sub aptr=aptr,len };; // rewind
{ .mmi; .pred.rel "mutex",p40,p42
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
sub nptr=nptr,len };;
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
.Louter:
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 ahi[3]=[tptr] // tp[0]
add r30=8,aptr };;
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
ldf8 alo[3]=[r30],16 // ap[1]
add r31=8,nptr };;
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
brp.loop.imp .Linner_ctop,.Linner_cend-16
}
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
xma.lu alo[4]=alo[4],bi,ahi[3]
clrrrb.pr };;
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
nop.i 0 }
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20101f<<16
// ------^----- (p40) at first (p23)
// --------^--- (p30) at first (p22)
// ----------^^ p[16:20]=1
};;
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
mov ar.lc=lc }
{ .mfi;
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align 32
.Linner_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p30,p32
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23)
{ .mfi; (p16) nop.m 0
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p16) nop.f 0
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p21) ld8 t[0]=[tptr],8
(p16) nop.f 0
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p30) add a[1]=a[1],t[1] } // (p22)
{ .mfi; (p16) nop.m 0
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p32) add a[1]=a[1],t[1],1 };; // (p22)
{ .mmi; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
{ .mmb; (p23) st8 [tp_1]=n[2],8
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
br.ctop.sptk .Linner_ctop };;
.Linner_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
nop.i 0 };;
{ .mmi; .pred.rel "mutex",p31,p33
(p31) add a[0]=a[0],topbit
(p33) add a[0]=a[0],topbit,1
mov topbit=r0 };;
{ .mfi; .pred.rel "mutex",p31,p33
(p31) cmp.ltu p32,p30=a[0],topbit
(p33) cmp.leu p32,p30=a[0],topbit
}
{ .mfi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
};;
{ .mmi; .pred.rel "mutex",p44,p46
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
(p32) add topbit=r0,r0,1 }
{ .mmi; st8 [tp_1]=n[0],8
cmp4.ne p6,p0=1,num
sub aptr=aptr,len };; // rewind
{ .mmi; sub nptr=nptr,len
(p41) add topbit=r0,r0,1
add tptr=16,sp }
{ .mmb; add tp_1=8,sp
add num=-1,num // num--
(p6) br.cond.sptk.many .Louter };;
{ .mbb; add lc=4,lc
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
clrrrb.pr };;
{ .mii; nop.m 0
mov pr.rot=0x10001<<16
// ------^---- (p33) at first (p17)
mov ar.lc=lc }
{ .mii; nop.m 0
mov ar.ec=3
nop.i 0 };;
.Lsub_ctop:
.pred.rel "mutex",p33,p35
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
(p16) nop.f 0
(p33) sub n[1]=t[1],n[1] } // (p17)
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
(p16) nop.f 0
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
(p18) nop.b 0 }
{ .mib; (p18) nop.m 0
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
br.ctop.sptk .Lsub_ctop };;
.Lsub_cend:
{ .mmb; .pred.rel "mutex",p34,p36
(p34) sub topbit=topbit,r0 // (p19)
(p36) sub topbit=topbit,r0,1
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
}
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
{ .mmi; mov aptr=rptr
mov bptr=tptr
mov pr.rot=1<<16 };;
{ .mii; cmp.eq p0,p6=topbit,r0
mov ar.lc=lc
mov ar.ec=2 };;
.Lcopy_ctop:
{ .mmi; (p16) ld8 a[0]=[aptr],8
(p16) ld8 t[0]=[bptr],8
(p6) mov a[1]=t[1] };; // (p17)
{ .mmb; (p17) st8 [rptr]=a[1],8
(p17) st8 [tptr]=r0,8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
{ .mmi; mov ret0=1 // signal "handled"
rum 1<<5 // clear um.mfh
mov ar.lc=prevlc }
{ .mib; .restore sp
mov sp=prevsp
mov pr=prevpr,0x1ffff
br.ret.sptk.many b0 };;
.endp bn_mul_mont_general#
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
t0=r15;
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
.align 64
.skip 48 // aligns loop body
.local bn_mul_mont_8#
.proc bn_mul_mont_8#
bn_mul_mont_8:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
.vframe prevsp
mov prevsp=sp
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; add r17=-6*16,sp
add sp=-7*16,sp
.save pr,prevpr
mov prevpr=pr };;
{ .mmi; .save.gf 0,0x10
stf.spill [sp]=f16,-16
.save.gf 0,0x20
stf.spill [r17]=f17,32
add r16=-5*16,prevsp};;
{ .mmi; .save.gf 0,0x40
stf.spill [r16]=f18,32
.save.gf 0,0x80
stf.spill [r17]=f19,32
$ADDP aptr=0,in1 };;
{ .mmi; .save.gf 0,0x100
stf.spill [r16]=f20,32
.save.gf 0,0x200
stf.spill [r17]=f21,32
$ADDP r29=8,in1 };;
{ .mmi; .save.gf 0,0x400
stf.spill [r16]=f22
.save.gf 0,0x800
stf.spill [r17]=f23
$ADDP rptr=0,in0 };;
.body
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
.rotr t[8]
// load input vectors padding them to 8 elements
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
ldf8 ai1=[r29],16 // ap[1]
$ADDP bptr=0,in2 }
{ .mmi; $ADDP r30=8,in2
$ADDP nptr=0,in3
$ADDP r31=8,in3 };;
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
ldf8 bj[6]=[r30],16 // bp[1]
cmp4.le p4,p5=3,in5 }
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
ldf8 ni1=[r31],16 // np[1]
cmp4.le p6,p7=4,in5 };;
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
(p5)fcvt.fxu ai2=f0
cmp4.le p8,p9=5,in5 }
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
(p7)fcvt.fxu ai3=f0
cmp4.le p10,p11=6,in5 }
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
(p5)fcvt.fxu bj[5]=f0
cmp4.le p12,p13=7,in5 }
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
(p7)fcvt.fxu bj[4]=f0
cmp4.le p14,p15=8,in5 }
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
(p5)fcvt.fxu ni2=f0
addp4 r28=-1,in5 }
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
(p7)fcvt.fxu ni3=f0
$ADDP in4=0,in4 };;
{ .mfi; ldf8 n0=[in4]
fcvt.fxu tf[1]=f0
nop.i 0 }
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
(p9)fcvt.fxu ai4=f0
mov t[0]=r0 }
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
(p11)fcvt.fxu ai5=f0
mov t[1]=r0 }
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
(p9)fcvt.fxu bj[3]=f0
mov t[2]=r0 }
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
(p11)fcvt.fxu bj[2]=f0
mov t[3]=r0 }
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
(p9)fcvt.fxu ni4=f0
mov t[4]=r0 }
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
(p11)fcvt.fxu ni5=f0
mov t[5]=r0 };;
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
(p13)fcvt.fxu ai6=f0
mov t[6]=r0 }
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
(p15)fcvt.fxu ai7=f0
mov t[7]=r0 }
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
(p13)fcvt.fxu bj[1]=f0
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=1 }
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
(p15)fcvt.fxu ni7=f0
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 0:
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
(p40) add a3=a3,n3 } // (p17) a3+=n3
{ .mfi; (p42) add a3=a3,n3,1
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 4:
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
(p41) add a4=a4,n4 } // (p17) a4+=n4
{ .mfi; (p43) add a4=a4,n4,1
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
(p16) nop.i 0 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p16) nop.m 0 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 8:
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
(p40) add a5=a5,n5 } // (p17) a5+=n5
{ .mfi; (p42) add a5=a5,n5,1
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a1=alo[1] // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mfi; (p16) nop.m 0 // 10:
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
(p40) cmp.ltu p43,p41=a5,n5 }
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
(p41) add a6=a6,n6 } // (p17) a6+=n6
{ .mfi; (p43) add a6=a6,n6,1
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a2=alo[2] // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mfi; (p16) nop.m 0 // 14:
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
(p41) cmp.ltu p42,p40=a6,n6 }
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
(p16) nop.i 0 };;
{ .mii; (p16) nop.m 0 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 16:
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
(p40) add a7=a7,n7 } // (p17) a7+=n7
{ .mfi; (p42) add a7=a7,n7,1
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a3=alo[3] // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mfi; (p16) nop.m 0 // 18:
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
(p40) cmp.ltu p43,p41=a7,n7 }
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 20:
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
(p41) add a8=a8,n8 } // (p17) a8+=n8
{ .mfi; (p43) add a8=a8,n8,1
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a4=alo[4] // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 };;
{ .mfi; (p16) nop.m 0 // 22:
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
(p41) cmp.ltu p42,p40=a8,n8 }
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 };;
{ .mfi; (p16) nop.m 0 // 24:
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
(p16) add a1=a1,n1 } // (p16) a1+=n1
{ .mfi; (p16) nop.m 0
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
(p17) mov t[0]=r0 };;
{ .mii; (p16) getf.sig a5=alo[5] // 25:
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
(p42) add t[0]=t[0],r0,1 };;
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
(p50) add t[0]=t[0],r0,1 }
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
(p16) cmp.ltu.unc p50,p48=t0,a1
(p16) nop.i 0 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 28:
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
(p40) add a2=a2,n2 } // (p16) a2+=n2
{ .mfi; (p42) add a2=a2,n2,1
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a6=alo[6] // 29:
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
(p50) add t[6]=t[6],a2,1 };;
{ .mfi; (p16) nop.m 0 // 30:
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
(p40) cmp.ltu p41,p39=a2,n2 }
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
(p16) nop.i 0 };;
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
(p16) nop.f 0
(p48) cmp.ltu p49,p47=t[6],a2 }
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
(p16) nop.f 0
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[7],n2
(p34)sub n2=t[7],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
(p34)cmp.geu p35,p33=n2,t[7];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[6],n3 }
{ .mmi; (p35)sub n3=t[6],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[6]
(p35)cmp.geu p34,p32=n3,t[6] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[5],n4
(p34)sub n4=t[5],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[4],n5
(p35)sub n5=t[4],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
(p35)cmp.geu p34,p32=n5,t[4];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[3],n6 }
{ .mmi; (p34)sub n6=t[3],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[3]
(p34)cmp.geu p35,p33=n6,t[3] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[2],n7
(p35)sub n7=t[2],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[1],n8
(p34)sub n8=t[1],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
(p34)cmp.geu p35,p33=n8,t[1];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[0],r0 }
{ .mmi; (p35)sub a8=t[0],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[0]
(p35)cmp.geu p34,p32=a8,t[0] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
{ .mmi; (p32)st8 [rptr]=n1,8
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[7],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[6],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[5],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[4],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[3],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[2],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[1],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
ldf.fill f17=[r17],64
nop.i 0 }
{ .mmi; ldf.fill f18=[r18],64
ldf.fill f19=[r19],64
mov pr=prevpr,0x1ffff };;
{ .mmi; ldf.fill f20=[r16]
ldf.fill f21=[r17]
mov ar.lc=prevlc }
{ .mmi; ldf.fill f22=[r18]
ldf.fill f23=[r19]
mov ret0=1 } // signal "handled"
{ .mib; rum 1<<5
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp bn_mul_mont_8#
.type copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___
open STDOUT,">$output" if $output;
print $code;
close STDOUT;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,433 @@
#! /usr/bin/env perl
# Copyright 2010-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.
######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="daddu"; # incidentally works even on n32
$PTR_SUB="dsubu"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$SZREG=8;
} else {
$PTR_ADD="addu";
$PTR_SUB="subu";
$REG_S="sw";
$REG_L="lw";
$SZREG=4;
}
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($flavour =~ /64|n32/i) {
$LD="ld";
$ST="sd";
$MULTU="dmultu";
$ADDU="daddu";
$SUBU="dsubu";
$BNSZ=8;
} else {
$LD="lw";
$ST="sw";
$MULTU="multu";
$ADDU="addu";
$SUBU="subu";
$BNSZ=4;
}
# int bn_mul_mont(
$rp=$a0; # BN_ULONG *rp,
$ap=$a1; # const BN_ULONG *ap,
$bp=$a2; # const BN_ULONG *bp,
$np=$a3; # const BN_ULONG *np,
$n0=$a4; # const BN_ULONG *n0,
$num=$a5; # int num);
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;
$FRAMESIZE=14;
$code=<<___;
#include "mips_arch.h"
.text
.set noat
.set noreorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
lw $n0,16($sp)
lw $num,20($sp)
___
$code.=<<___;
slt $at,$num,4
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0
.end bn_mul_mont
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $fp,$FRAMESIZE*$SZREG,$ra
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
move $fp,$sp
.set reorder
$LD $n0,0($n0)
$LD $bi,0($bp) # bp[0]
$LD $aj,0($ap) # ap[0]
$LD $nj,0($np) # np[0]
$PTR_SUB $sp,2*$BNSZ # place for two extra words
sll $num,`log($BNSZ)/log(2)`
li $at,-4096
$PTR_SUB $sp,$num
and $sp,$at
$MULTU ($aj,$bi)
$LD $ahi,$BNSZ($ap)
$LD $nhi,$BNSZ($np)
mflo ($lo0,$aj,$bi)
mfhi ($hi0,$aj,$bi)
$MULTU ($lo0,$n0)
mflo ($m1,$lo0,$n0)
$MULTU ($ahi,$bi)
mflo ($alo,$ahi,$bi)
mfhi ($ahi,$ahi,$bi)
$MULTU ($nj,$m1)
mflo ($lo1,$nj,$m1)
mfhi ($hi1,$nj,$m1)
$MULTU ($nhi,$m1)
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo ($nlo,$nhi,$m1)
mfhi ($nhi,$nhi,$m1)
move $tp,$sp
li $j,2*$BNSZ
.align 4
.L1st:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU ($aj,$bi)
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo ($alo,$aj,$bi)
mfhi ($ahi,$aj,$bi)
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$MULTU ($nj,$m1)
$ADDU $hi1,$at
addu $j,$BNSZ
$ST $lo1,($tp)
sltu $t0,$j,$num
mflo ($nlo,$nj,$m1)
mfhi ($nhi,$nj,$m1)
bnez $t0,.L1st
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo1,$nlo,$hi1
sltu $t0,$lo1,$hi1
$ADDU $hi1,$nhi,$t0
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
$ST $lo1,($tp)
$ADDU $hi1,$hi0
sltu $at,$hi1,$hi0
$ST $hi1,$BNSZ($tp)
$ST $at,2*$BNSZ($tp)
li $i,$BNSZ
.align 4
.Louter:
$PTR_ADD $bi,$bp,$i
$LD $bi,($bi)
$LD $aj,($ap)
$LD $ahi,$BNSZ($ap)
$LD $tj,($sp)
$MULTU ($aj,$bi)
$LD $nj,($np)
$LD $nhi,$BNSZ($np)
mflo ($lo0,$aj,$bi)
mfhi ($hi0,$aj,$bi)
$ADDU $lo0,$tj
$MULTU ($lo0,$n0)
sltu $at,$lo0,$tj
$ADDU $hi0,$at
mflo ($m1,$lo0,$n0)
$MULTU ($ahi,$bi)
mflo ($alo,$ahi,$bi)
mfhi ($ahi,$ahi,$bi)
$MULTU ($nj,$m1)
mflo ($lo1,$nj,$m1)
mfhi ($hi1,$nj,$m1)
$MULTU ($nhi,$m1)
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo ($nlo,$nhi,$m1)
mfhi ($nhi,$nhi,$m1)
move $tp,$sp
li $j,2*$BNSZ
$LD $tj,$BNSZ($tp)
.align 4
.Linner:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU ($aj,$bi)
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo ($alo,$aj,$bi)
mfhi ($ahi,$aj,$bi)
$ADDU $lo0,$tj
addu $j,$BNSZ
$MULTU ($nj,$m1)
sltu $at,$lo0,$tj
$ADDU $lo1,$lo0
$ADDU $hi0,$at
sltu $t0,$lo1,$lo0
$LD $tj,2*$BNSZ($tp)
$ADDU $hi1,$t0
sltu $at,$j,$num
mflo ($nlo,$nj,$m1)
mfhi ($nhi,$nj,$m1)
$ST $lo1,($tp)
bnez $at,.Linner
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo0,$tj
sltu $t0,$lo0,$tj
$ADDU $hi0,$t0
$LD $tj,2*$BNSZ($tp)
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo1,$hi1
$ADDU $hi1,$nhi,$at
$ADDU $lo1,$lo0
sltu $t0,$lo1,$lo0
$ADDU $hi1,$t0
$ST $lo1,($tp)
$ADDU $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
$ADDU $lo1,$tj
sltu $at,$lo1,$tj
$ADDU $hi1,$at
$ST $lo1,$BNSZ($tp)
$ST $hi1,2*$BNSZ($tp)
addu $i,$BNSZ
sltu $t0,$i,$num
bnez $t0,.Louter
.set noreorder
$PTR_ADD $tj,$sp,$num # &tp[num]
move $tp,$sp
move $ap,$sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: $LD $lo0,($tp)
$LD $lo1,($np)
$PTR_ADD $tp,$BNSZ
$PTR_ADD $np,$BNSZ
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu $at,$lo1,$lo0
$SUBU $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
$ST $lo0,($rp)
or $hi0,$at
sltu $at,$tp,$tj
bnez $at,.Lsub
$PTR_ADD $rp,$BNSZ
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,$sp
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
.Lcopy: $LD $nj,($tp) # conditional move
$LD $aj,($rp)
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
and $nj,$hi0
and $aj,$hi1
or $aj,$nj
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
$PTR_ADD $rp,$BNSZ
li $a0,1
li $t0,1
.set noreorder
move $sp,$fp
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,228 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, therefore
# the effort. And indeed, the module delivers 55%-90%(*) improvement
# on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
# This is for 64-bit build. In 32-bit "highgprs" case improvement is
# even higher, for example on z990 it was measured 80%-150%. ECDSA
# sign is modest 9%-12% faster. Keep in mind that these coefficients
# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
# burnt in it...
#
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
# so that improvement coefficients can vary from one specific
# setup to another.
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$rp="%r2";
$a1="%r3";
$a0="%r4";
$b1="%r5";
$b0="%r6";
$ra="%r14";
$sp="%r15";
@T=("%r0","%r1");
@i=("%r12","%r13");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
$code.=<<___;
.text
.type _mul_1x1,\@function
.align 16
_mul_1x1:
lgr $a1,$a
sllg $a2,$a,1
sllg $a4,$a,2
sllg $a8,$a,3
srag $lo,$a1,63 # broadcast 63rd bit
nihh $a1,0x1fff
srag @i[0],$a2,63 # broadcast 62nd bit
nihh $a2,0x3fff
srag @i[1],$a4,63 # broadcast 61st bit
nihh $a4,0x7fff
ngr $lo,$b
ngr @i[0],$b
ngr @i[1],$b
lghi @T[0],0
lgr $a12,$a1
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
xgr $a12,$a2
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
lgr $a48,$a4
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
xgr $a48,$a8
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
xgr $a1,$a4
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
xgr $a2,$a4
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
xgr $a12,$a4
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
xgr $a1,$a48
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
xgr $a2,$a48
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
xgr $a12,$a48
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
xgr $a1,$a4
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
xgr $a2,$a4
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
xgr $a12,$a4
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
srlg $hi,$lo,1
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
sllg $lo,$lo,63
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
srlg @T[0],@i[0],2
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
lghi $mask,`0xf<<3`
sllg $a1,@i[0],62
sllg @i[0],$b,3
srlg @T[1],@i[1],3
ngr @i[0],$mask
sllg $a2,@i[1],61
srlg @i[1],$b,4-3
xgr $hi,@T[0]
ngr @i[1],$mask
xgr $lo,$a1
xgr $hi,@T[1]
xgr $lo,$a2
xg $lo,$stdframe(@i[0],$sp)
srlg @i[0],$b,8-3
ngr @i[0],$mask
___
for($n=1;$n<14;$n++) {
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
srlg @i[1],$b,`($n+2)*4`-3
sllg @T[0],@T[1],`$n*4`
ngr @i[1],$mask
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
sllg @T[0],@T[1],`$n*4`
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
lg @T[0],$stdframe(@i[0],$sp)
sllg @T[1],@T[0],`($n+1)*4`
srlg @T[0],@T[0],`64-($n+1)*4`
xgr $lo,@T[1]
xgr $hi,@T[0]
br $ra
.size _mul_1x1,.-_mul_1x1
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@function
.align 16
bn_GF2m_mul_2x2:
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi %r1,-$stdframe-128
la %r0,0($sp)
la $sp,0(%r1,$sp) # alloca
st${g} %r0,0($sp) # back chain
___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)
stg $lo,8($rp)
___
} else {
$code.=<<___;
sllg %r3,%r3,32
sllg %r5,%r5,32
or %r3,%r4
or %r5,%r6
bras $ra,_mul_1x1
rllg $lo,$lo,32
rllg $hi,$hi,32
stmg $lo,$hi,0($rp)
___
}
$code.=<<___;
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
br $ra
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View file

@ -0,0 +1,284 @@
#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers], at least hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for longest keys. Well, on a second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on CPU ability to pipe-line instructions and have several
# of them "in-flight" at the same time. I mean while other methods,
# for example Karatsuba, aim to minimize amount of multiplications at
# the cost of other operations increase, bn_mul_mont aim to neatly
# "overlap" multiplications and the other operations [and on most
# platforms even minimize the amount of the other operations, in
# particular references to memory]. But it's possible to improve this
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$mn0="%r0";
$num="%r1";
# int bn_mul_mont(
$rp="%r2"; # BN_ULONG *rp,
$ap="%r3"; # const BN_ULONG *ap,
$bp="%r4"; # const BN_ULONG *bp,
$np="%r5"; # const BN_ULONG *np,
$n0="%r6"; # const BN_ULONG *n0,
#$num="160(%r15)" # int num);
$bi="%r2"; # zaps rp
$j="%r7";
$ahi="%r8";
$alo="%r9";
$nhi="%r10";
$nlo="%r11";
$AHI="%r12";
$NHI="%r13";
$count="%r14";
$sp="%r15";
$code.=<<___;
.text
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
tmll $num,4
bnzr %r14 # if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
___
$code.=<<___;
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
_dswap $n0
lg $bi,0($bp)
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0
lg $nlo,0($np) #
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.L1st:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI # +="tp[j]"
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.Linner:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
lg $nlo,0($j,$np)
_dswap $nlo
slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi # handle upmost carry
lghi $NHI,-1
xgr $NHI,$AHI
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $ahi,$stdframe($j,$sp) # conditional copy
lg $alo,0($j,$rp)
ngr $ahi,$AHI
ngr $alo,$NHI
ogr $alo,$ahi
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
print $_,"\n";
}
close STDOUT;

View file

@ -0,0 +1,713 @@
.ident "s390x.S, version 1.1"
// ====================================================================
// Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
// ====================================================================
.text
#define zero %r0
// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_add_words
.type bn_mul_add_words,@function
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside [to give way to]
lghi %r2,0 // return value
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r13,48(%r15)
lghi %r2,3
lghi %r12,0 // carry = 0
slgr %r1,%r3 // rp-=ap
nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
lg %r7,0(%r3) // ap[0]
lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w
brct %r4,.Loop4_madd
j .Loop4_madd_tail
.Loop4_madd:
mlgr %r8,%r5
lg %r11,16(%r3) // ap[i+2]
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
lg %r7,32(%r3)
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
mlgr %r6,%r5
lg %r9,40(%r3)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
brct %r4,.Loop4_madd
.Loop4_madd_tail:
mlgr %r8,%r5
lg %r11,16(%r3)
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
la %r2,1(%r2) // see if len%4 is zero ...
brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
lgr %r2,zero // return value
alcgr %r2,%r12 // collect even carry bit
lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
lgr %r12,%r6
la %r3,8(%r3) // i++
brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_words
.type bn_mul_words,@function
.align 4
bn_mul_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0;
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r10,48(%r15)
lghi %r10,3
lghi %r8,0 // carry = 0
nr %r10,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_mul // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
.Loop4_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lg %r9,8(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,8(%r2,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5
alcgr %r7,%r8
stg %r7,16(%r2,%r1)
lg %r9,24(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r4,.Loop4_mul
la %r10,1(%r10) // see if len%4 is zero ...
brct %r10,.Loop1_mul // without touching condition code:-)
.Lend_mul:
alcgr %r8,zero // collect carry bit
lgr %r2,%r8
lmg %r6,%r10,48(%r15)
br %r14
.Loop1_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lgr %r8,%r6
la %r2,8(%r2) // i++
brct %r10,.Loop1_mul
j .Lend_mul
.size bn_mul_words,.-bn_mul_words
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
.globl bn_sqr_words
.type bn_sqr_words,@function
.align 4
bn_sqr_words:
ltgfr %r4,%r4
bler %r14
stmg %r6,%r7,48(%r15)
srag %r1,%r4,2 // cnt=len/4
jz .Loop1_sqr
.Loop4_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
lg %r7,8(%r3)
mlgr %r6,%r7
stg %r7,16(%r2)
stg %r6,24(%r2)
lg %r7,16(%r3)
mlgr %r6,%r7
stg %r7,32(%r2)
stg %r6,40(%r2)
lg %r7,24(%r3)
mlgr %r6,%r7
stg %r7,48(%r2)
stg %r6,56(%r2)
la %r3,32(%r3)
la %r2,64(%r2)
brct %r1,.Loop4_sqr
lghi %r1,3
nr %r4,%r1 // cnt=len%4
jz .Lend_sqr
.Loop1_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
la %r3,8(%r3)
la %r2,16(%r2)
brct %r4,.Loop1_sqr
.Lend_sqr:
lmg %r6,%r7,48(%r15)
br %r14
.size bn_sqr_words,.-bn_sqr_words
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
.globl bn_div_words
.type bn_div_words,@function
.align 4
bn_div_words:
dlgr %r2,%r4
lgr %r2,%r3
br %r14
.size bn_div_words,.-bn_div_words
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_add_words
.type bn_add_words,@function
.align 4
bn_add_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jz .Loop1_add // carry is incidentally cleared if branch taken
algr %r2,%r2 // clear carry
.Loop4_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
alcg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
alcg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
alcg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_add
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_add // without touching condition code:-)
.Lexit_add:
lghi %r2,0
alcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.Loop1_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_add
j .Lexit_add
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_sub_words
.type bn_sub_words,@function
.align 4
bn_sub_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jnz .Loop4_sub // borrow is incidentally cleared if branch taken
slgr %r2,%r2 // clear borrow
.Loop1_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_sub
j .Lexit_sub
.Loop4_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
slbg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
slbg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
slbg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_sub
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_sub // without touching condition code:-)
.Lexit_sub:
lghi %r2,0
slbgr %r2,%r2
lcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.size bn_sub_words,.-bn_sub_words
#define c1 %r1
#define c2 %r5
#define c3 %r8
#define mul_add_c(ai,bi,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,bi*8(%r4); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba8
.type bn_mul_comba8,@function
.align 4
bn_mul_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(4,0,c2,c3,c1);
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
mul_add_c(0,4,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(0,5,c3,c1,c2);
mul_add_c(1,4,c3,c1,c2);
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
mul_add_c(4,1,c3,c1,c2);
mul_add_c(5,0,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(6,0,c1,c2,c3);
mul_add_c(5,1,c1,c2,c3);
mul_add_c(4,2,c1,c2,c3);
mul_add_c(3,3,c1,c2,c3);
mul_add_c(2,4,c1,c2,c3);
mul_add_c(1,5,c1,c2,c3);
mul_add_c(0,6,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
mul_add_c(0,7,c2,c3,c1);
mul_add_c(1,6,c2,c3,c1);
mul_add_c(2,5,c2,c3,c1);
mul_add_c(3,4,c2,c3,c1);
mul_add_c(4,3,c2,c3,c1);
mul_add_c(5,2,c2,c3,c1);
mul_add_c(6,1,c2,c3,c1);
mul_add_c(7,0,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
mul_add_c(7,1,c3,c1,c2);
mul_add_c(6,2,c3,c1,c2);
mul_add_c(5,3,c3,c1,c2);
mul_add_c(4,4,c3,c1,c2);
mul_add_c(3,5,c3,c1,c2);
mul_add_c(2,6,c3,c1,c2);
mul_add_c(1,7,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
mul_add_c(2,7,c1,c2,c3);
mul_add_c(3,6,c1,c2,c3);
mul_add_c(4,5,c1,c2,c3);
mul_add_c(5,4,c1,c2,c3);
mul_add_c(6,3,c1,c2,c3);
mul_add_c(7,2,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
mul_add_c(7,3,c2,c3,c1);
mul_add_c(6,4,c2,c3,c1);
mul_add_c(5,5,c2,c3,c1);
mul_add_c(4,6,c2,c3,c1);
mul_add_c(3,7,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
mul_add_c(4,7,c3,c1,c2);
mul_add_c(5,6,c3,c1,c2);
mul_add_c(6,5,c3,c1,c2);
mul_add_c(7,4,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
mul_add_c(7,5,c1,c2,c3);
mul_add_c(6,6,c1,c2,c3);
mul_add_c(5,7,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
mul_add_c(6,7,c2,c3,c1);
mul_add_c(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
mul_add_c(7,7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba8,.-bn_mul_comba8
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba4
.type bn_mul_comba4,@function
.align 4
bn_mul_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r3)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(3,3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
stmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba4,.-bn_mul_comba4
#define sqr_add_c(ai,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlgr %r6,%r7; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
#define sqr_add_c2(ai,aj,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,aj*8(%r3); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba8
.type bn_sqr_comba8,@function
.align 4
bn_sqr_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
sqr_add_c2(4,0,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(5,0,c3,c1,c2);
sqr_add_c2(4,1,c3,c1,c2);
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
sqr_add_c2(4,2,c1,c2,c3);
sqr_add_c2(5,1,c1,c2,c3);
sqr_add_c2(6,0,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
sqr_add_c2(7,0,c2,c3,c1);
sqr_add_c2(6,1,c2,c3,c1);
sqr_add_c2(5,2,c2,c3,c1);
sqr_add_c2(4,3,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
sqr_add_c(4,c3,c1,c2);
sqr_add_c2(5,3,c3,c1,c2);
sqr_add_c2(6,2,c3,c1,c2);
sqr_add_c2(7,1,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
sqr_add_c2(7,2,c1,c2,c3);
sqr_add_c2(6,3,c1,c2,c3);
sqr_add_c2(5,4,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
sqr_add_c(5,c2,c3,c1);
sqr_add_c2(6,4,c2,c3,c1);
sqr_add_c2(7,3,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
sqr_add_c2(7,4,c3,c1,c2);
sqr_add_c2(6,5,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
sqr_add_c(6,c1,c2,c3);
sqr_add_c2(7,5,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
sqr_add_c2(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
sqr_add_c(7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba8,.-bn_sqr_comba8
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba4
.type bn_sqr_comba4,@function
.align 4
bn_sqr_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba4,.-bn_sqr_comba4

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,200 @@
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# October 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: one suitable
# for all SPARCv9 processors and one for VIS3-capable ones. Former
# delivers ~25-45% more, more for longer keys, heaviest DH and DSA
# verify operations on venerable UltraSPARC II. On T4 VIS3 code is
# ~100-230% faster than gcc-generated code and ~35-90% faster than
# the pure SPARCv9 code path.
$output = pop;
open STDOUT,">$output";
$locals=16*8;
$tab="%l0";
@T=("%g2","%g3");
@i=("%g4","%g5");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
$code.=<<___;
#include <sparc_arch.h>
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl bn_GF2m_mul_2x2
.align 16
bn_GF2m_mul_2x2:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
andcc %g1, SPARCV9_VIS3, %g0
bz,pn %icc,.Lsoftware
nop
sllx %o1, 32, %o1
sllx %o3, 32, %o3
or %o2, %o1, %o1
or %o4, %o3, %o3
.word 0x95b262ab ! xmulx %o1, %o3, %o2
.word 0x99b262cb ! xmulxhi %o1, %o3, %o4
srlx %o2, 32, %o1 ! 13 cycles later
st %o2, [%o0+0]
st %o1, [%o0+4]
srlx %o4, 32, %o3
st %o4, [%o0+8]
retl
st %o3, [%o0+12]
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
sllx %i1,32,$a
mov -1,$a12
sllx %i3,32,$b
or %i2,$a,$a
srlx $a12,1,$a48 ! 0x7fff...
or %i4,$b,$b
srlx $a12,2,$a12 ! 0x3fff...
add %sp,STACK_BIAS+STACK_FRAME,$tab
sllx $a,2,$a4
mov $a,$a1
sllx $a,1,$a2
srax $a4,63,@i[1] ! broadcast 61st bit
and $a48,$a4,$a4 ! (a<<2)&0x7fff...
srlx $a48,2,$a48
srax $a2,63,@i[0] ! broadcast 62nd bit
and $a12,$a2,$a2 ! (a<<1)&0x3fff...
srax $a1,63,$lo ! broadcast 63rd bit
and $a48,$a1,$a1 ! (a<<0)&0x1fff...
sllx $a1,3,$a8
and $b,$lo,$lo
and $b,@i[0],@i[0]
and $b,@i[1],@i[1]
stx %g0,[$tab+0*8] ! tab[0]=0
xor $a1,$a2,$a12
stx $a1,[$tab+1*8] ! tab[1]=a1
stx $a2,[$tab+2*8] ! tab[2]=a2
xor $a4,$a8,$a48
stx $a12,[$tab+3*8] ! tab[3]=a1^a2
xor $a4,$a1,$a1
stx $a4,[$tab+4*8] ! tab[4]=a4
xor $a4,$a2,$a2
stx $a1,[$tab+5*8] ! tab[5]=a1^a4
xor $a4,$a12,$a12
stx $a2,[$tab+6*8] ! tab[6]=a2^a4
xor $a48,$a1,$a1
stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
xor $a48,$a2,$a2
stx $a8,[$tab+8*8] ! tab[8]=a8
xor $a48,$a12,$a12
stx $a1,[$tab+9*8] ! tab[9]=a1^a8
xor $a4,$a1,$a1
stx $a2,[$tab+10*8] ! tab[10]=a2^a8
xor $a4,$a2,$a2
stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
xor $a4,$a12,$a12
stx $a48,[$tab+12*8] ! tab[12]=a4^a8
srlx $lo,1,$hi
stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
sllx $lo,63,$lo
stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
srlx @i[0],2,@T[0]
stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
sllx @i[0],62,$a1
sllx $b,3,@i[0]
srlx @i[1],3,@T[1]
and @i[0],`0xf<<3`,@i[0]
sllx @i[1],61,$a2
ldx [$tab+@i[0]],@i[0]
srlx $b,4-3,@i[1]
xor @T[0],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
xor $a1,$lo,$lo
ldx [$tab+@i[1]],@i[1]
xor @T[1],$hi,$hi
xor @i[0],$lo,$lo
srlx $b,8-3,@i[0]
xor $a2,$lo,$lo
and @i[0],`0xf<<3`,@i[0]
___
for($n=1;$n<14;$n++) {
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
srlx $b,`($n+2)*4`-3,@i[1]
xor @T[1],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
sllx @i[0],`($n+1)*4`,@T[0]
xor @T[1],$hi,$hi
srlx @i[0],`64-($n+1)*4`,@T[1]
xor @T[0],$lo,$lo
xor @T[1],$hi,$hi
srlx $lo,32,%i1
st $lo,[%i0+0]
st %i1,[%i0+4]
srlx $hi,32,%i2
st $hi,[%i0+8]
st %i2,[%i0+12]
ret
restore
.type bn_GF2m_mul_2x2,#function
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View file

@ -0,0 +1,620 @@
#! /usr/bin/env perl
# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...
# (*) Engine accessing the driver in question is on my TODO list.
# For reference, accelerator is estimated to give 6 to 10 times
# improvement on single-threaded RSA sign. It should be noted
# that 6-10x improvement coefficient does not actually mean
# something extraordinary in terms of absolute [single-threaded]
# performance, as SPARCv9 instruction set is by all means least
# suitable for high performance crypto among other 64 bit
# platforms. 6-10x factor simply places T1 in same performance
# domain as say AMD64 and IA-64. Improvement of RSA verify don't
# appear impressive at all, but it's the sign operation which is
# far more critical/interesting.
# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still have hidden potential [see TODO list there], which is
# estimated to be larger than 20%...
$output = pop;
open STDOUT,">$output";
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$frame="STACK_FRAME";
$bias="STACK_BIAS";
$car0="%o0";
$car1="%o1";
$car2="%o2"; # 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1"; # 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";
$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";
$fname="bn_mul_mont_int";
$code=<<___;
#include "sparc_arch.h"
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
cmp %o5,4 ! 128 bits minimum
bge,pt %icc,.Lenter
sethi %hi(0xffffffff),$mask
retl
clr %o0
.align 32
.Lenter:
save %sp,-$frame,%sp
sll $num,2,$num ! num*=4
or $mask,%lo(0xffffffff),$mask
ld [$n0],$n0
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
nop
add %sp,$bias,%o7 ! real top of stack
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
be,pt SIZE_T_CC,.Lbn_sqr_mont
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.L1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp]
cmp $j,$num
mov $tmp1,$acc1
srlx $car1,32,$car1
bl %icc,.L1st
add $tp,4,$tp ! tp++
!.L1st
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
mov 4,$i ! i++
ld [$bp+4],$mul0 ! bp[1]
.Louter:
add %sp,$bias+$frame,$tp
ld [$ap],$car0 ! ap[0]
ld [$ap+4],$apj ! ap[1]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
ld [$tp],$tmp1 ! tp[0]
ld [$tp+4],$tpj ! tp[1]
mov 12,$j
mulx $car0,$mul0,$car0
mulx $apj,$mul0,$tmp0 !prologue!
add $tmp1,$car0,$car0
ld [$ap+8],$apj !prologue!
and $car0,$mask,$acc0
mulx $n0,$acc0,$mul1
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1 !prologue!
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Linner:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
add $acc0,$car0,$car0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
and $car0,$mask,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
mov $tmp1,$acc1
cmp $j,$num
bl %icc,.Linner
add $tp,4,$tp ! tp++
!.Linner
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
add $acc0,$car0,$car0
ld [$tp+8],$tpj ! tp[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $tpj,$car0,$car0
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
add $acc0,$car1,$car1
st $car1,[$tp+4] ! tp[j-1]
srlx $car0,32,$car0
add $i,4,$i ! i++
srlx $car1,32,$car1
add $car0,$car1,$car1
cmp $i,$num
add $car2,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
bl,a %icc,.Louter
ld [$bp+$i],$mul0 ! bp[i]
!.Louter
add $tp,12,$tp
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
sub %g0,$num,%o7 ! k=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1 ! tp[j]-np[j]
add $rp,%o7,$i
add %o7,4,%o7
brnz %o7,.Lsub
st %o1,[$i]
subccc $car2,0,$car2 ! handle upmost overflow bit
sub %g0,$num,%o7
.Lcopy:
ld [$tp+%o7],%o1 ! conditional copy
ld [$rp+%o7],%o0
st %g0,[$tp+%o7] ! zap tp
movcs %icc,%o1,%o0
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
mov 1,%i0
ret
restore
___
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%o5";
$code.=<<___;
.align 32
.Lbn_sqr_mont:
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
srlx $car0,32,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue!
and $car0,1,$sbit
ld [$np+8],$npj !prologue!
srlx $car0,1,$car0
add $acc0,$car1,$car1
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Lsqr_1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
mov $tmp1,$acc1
srlx $acc0,32,$sbit
add $j,4,$j ! j++
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp]
mov $tmp0,$acc0
srlx $car1,32,$car1
bl %icc,.Lsqr_1st
add $tp,4,$tp ! tp++
!.Lsqr_1st
mulx $apj,$mul0,$tmp0 ! epilogue
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
add $tmp1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
ld [$ap+4],$mul0 ! ap[1]
ld [$ap+8],$apj ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp0,$mul1
mulx $mul0,$mul0,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1
add $tmp0,$car1,$car1
and $car0,$mask,$acc0
ld [$np+8],$npj ! np[2]
srlx $car1,32,$car1
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
and $car0,1,$sbit
add $acc1,$car1,$car1
srlx $car0,1,$car0
mov 12,$j
st $car1,[%sp+$bias+$frame] ! tp[0]=
srlx $car1,32,$car1
add %sp,$bias+$frame+4,$tp
.Lsqr_2nd:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$sbit,$sbit
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc1,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_2nd
add $tp,4,$tp ! tp++
!.Lsqr_2nd
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$sbit,$sbit
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+8],$mul0 ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
mov 8,$i
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
mov 4,$j
.Lsqr_outer:
.Lsqr_inner1:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner1
add $tp,4,$tp
!.Lsqr_inner1
add $j,4,$j
ld [$ap+$j],$apj ! ap[j]
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
ld [$np+$j],$npj ! np[j]
srlx $car1,32,$tmp0
and $car1,$mask,$car1
add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $j,4,$j
cmp $j,$num
be,pn %icc,.Lsqr_no_inner2
add $tp,4,$tp
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
add $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner2
add $tp,4,$tp ! tp++
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
add $i,4,$i ! i++
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+$i],$mul0 ! ap[j]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
add $i,4,$tmp0
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
cmp $tmp0,$num ! i<num-1
bl %icc,.Lsqr_outer
mov 4,$j
.Lsqr_last:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_last
add $tp,4,$tp
!.Lsqr_last
mulx $npj,$mul1,$acc1
add $tpj,$acc0,$acc0
srlx $acc0,32,$tmp0
and $acc0,$mask,$acc0
add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ba .Ltail
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View file

@ -0,0 +1,887 @@
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.
# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves few percents for shorter keys and worsens few
# percents for longer keys. This is because USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)
# The implementation implies following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.
# TODO:
# - modulo-schedule inner loop for better performance (on in-order
# execution core such as UltraSPARC this shall result in further
# noticeable(!) improvement);
# - dedicated squaring procedure[?];
######################################################################
# November 2006
#
# Modulo-scheduled inner loops allow to interleave floating point and
# integer instructions and minimize Read-After-Write penalties. This
# results in *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.
$output = pop;
open STDOUT,">$output";
$fname="bn_mul_mont_fpu";
$frame="STACK_FRAME";
$bias="STACK_BIAS";
$locals=64;
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$tp="%l0"; # t[num]
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2"; # to these four vectors as double-precision FP values.
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7"; # 16-bit mask, 0xffff
$n0="%g4"; # reassigned(!) to "64-bit" register
$carry="%i4"; # %i4 reused(!) for a carry bit
# FP register naming chart
#
# ..HILO
# dcba
# --------
# LOa
# LOb
# LOc
# LOd
# HIa
# HIb
# HIc
# HId
# ..a
# ..b
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
$dota="%f24"; $dotb="%f26";
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
#include "sparc_arch.h"
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
save %sp,-$frame-$locals,%sp
cmp $num,4
bl,a,pn %icc,.Lret
clr %i0
andcc $num,1,%g0 ! $num has to be even...
bnz,a,pn %icc,.Lret
clr %i0 ! signal "unsupported input value"
srl $num,1,$num
sethi %hi(0xffff),$mask
ld [%i4+0],$n0 ! $n0 reassigned, remember?
or $mask,%lo(0xffff),$mask
ld [%i4+4],%o0
sllx %o0,32,%o0
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
sll $num,3,$num ! num*=8
add %sp,$bias,%o0 ! real top of stack
sll $num,2,%o1
add %o1,$num,%o1 ! %o1=num*5
sub %o0,%o1,%o0
and %o0,-2048,%o0 ! optimize TLB utilization
sub %o0,$bias,%sp ! alloca(5*num*8)
rd %asi,%o7 ! save %asi
add %sp,$bias+$frame+$locals,$tp
add $tp,$num,$ap_l
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
add $ap_l,$num,$ap_h
add $ap_h,$num,$np_l
add $np_l,$num,$np_h
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
add $rp,$num,$rp ! readjust input pointers to point
add $ap,$num,$ap ! at the ends too...
add $bp,$num,$bp
add $np,$num,$np
stx %o7,[%sp+$bias+$frame+48] ! save %asi
sub %g0,$num,$i ! i=-num
sub %g0,$num,$j ! j=-num
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[0]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
add $np,$j,%o5
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
stx %o0,[%sp+$bias+$frame+0]
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o3+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
fxtod $alo,$alo
ldda [%o4+0]%asi,$bb
fxtod $ahi,$ahi
ldda [%o4+6]%asi,$bc
fxtod $nlo,$nlo
ldda [%o4+4]%asi,$bd
fxtod $nhi,$nhi
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fxtod $na,$na
std $ahi,[$ap_h+$j]
fxtod $nb,$nb
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fxtod $nc,$nc
std $nhi,[$np_h+$j]
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
add $j,8,$j
std $nlob,[%sp+$bias+$frame+8]
add $ap,$j,%o4
std $nloc,[%sp+$bias+$frame+16]
add $np,$j,%o5
std $nlod,[%sp+$bias+$frame+24]
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
!and %o0,$mask,%o0
!and %o1,$mask,%o1
!and %o2,$mask,%o2
!sllx %o1,16,%o1
!sllx %o2,32,%o2
!sllx %o3,48,%o7
!or %o1,%o0,%o0
!or %o2,%o0,%o0
!or %o7,%o0,%o0 ! 64-bit result
srlx %o3,16,%g1 ! 34-bit carry
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $dota,$nloa,$nloa
faddd $dotb,$nlob,$nlob
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.L1stskip
std $nlod,[%sp+$bias+$frame+24]
.align 32 ! incidentally already aligned !
.L1st:
add $ap,$j,%o4
add $np,$j,%o5
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
and %o1,$mask,%o1
and %o2,$mask,%o2
fmuld $ahi,$bb,$ahib
sllx %o1,16,%o1
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
sllx %o2,32,%o2
fmuld $ahi,$bc,$ahic
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
or %o2,%o0,%o0
fmuld $ahi,$bd,$ahid
or %o7,%o0,%o0 ! 64-bit result
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
addcc %g1,%o0,%o0
faddd $dota,$nloa,$nloa
srlx %o3,16,%g1 ! 34-bit carry
faddd $dotb,$nlob,$nlob
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bnz,pt %icc,.L1st
add $tp,8,$tp
.L1stskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+32],%o4
addcc %g1,%o0,%o0
ldx [%sp+$bias+$frame+40],%o5
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
mov %g1,$carry
stx %o4,[$tp] ! tp[num-1]=
ba .Louter
add $i,8,$i
.align 32
.Louter:
sub %g0,$num,$j ! j=-num
add %sp,$bias+$frame+$locals,$tp
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[i]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
ldx [$tp],%o2 ! tp[0]
mulx %o1,%o0,%o0
addcc %o2,%o0,%o0
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
stx %o0,[%sp+$bias+$frame+0]
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
ldda [%o4+0]%asi,$bb
ldda [%o4+6]%asi,$bc
ldda [%o4+4]%asi,$bd
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
ldd [$ap_l+$j],$alo ! load a[j] in double format
fxtod $na,$na
ldd [$ap_h+$j],$ahi
fxtod $nb,$nb
ldd [$np_l+$j],$nlo ! load n[j] in double format
fxtod $nc,$nc
ldd [$np_h+$j],$nhi
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
add $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
! why?
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
ldx [$tp],%o7
faddd $nloc,$nhia,$nloc
addcc %o7,%o0,%o0
! end-of-why?
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.Linnerskip
std $nlod,[%sp+$bias+$frame+24]
ba .Linner
nop
.align 32
.Linner:
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
faddd $nloc,$nhia,$nloc
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
addcc %o7,%o0,%o0
fdtox $nloc,$nloc
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
addcc $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
bnz,pt %icc,.Linner
add $tp,8,$tp
.Linnerskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
ldx [%sp+$bias+$frame+32],%o4
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+40],%o5
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
addcc %o7,%o0,%o0
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
addcc $carry,%o4,%o4
stx %o4,[$tp] ! tp[num-1]
mov %g1,$carry
bcs,a %xcc,.+8
add $carry,1,$carry
addcc $i,8,$i
bnz %icc,.Louter
nop
add $tp,8,$tp ! adjust tp to point at the end
orn %g0,%g0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 32
.Lsub:
ldx [$tp+%o7],%o0
add $np,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
srlx %o0,32,%o1
subccc %o0,%o2,%o2
add $rp,%o7,%g1
subccc %o1,%o3,%o3
st %o2,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lcopy
nop
.align 32
.Lcopy:
ldx [$tp+%o7],%o0
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.Lzap:
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]
stx %g0,[$np_h+%o7]
add %o7,8,%o7
brnz,pt %o7,.Lzap
nop
ldx [%sp+$bias+$frame+48],%o7
wr %g0,%o7,%asi ! restore %asi
mov 1,%i0
.Lret:
ret
restore
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Below substitution makes it possible to compile without demanding
# VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this, because VIS capability is detected at run-time now
# and this routine is not called on CPU not capable to execute it. Do
# note that fzeros is not the only VIS dependency! Another dependency
# is implicit and is just _a_ numerical value loaded to %asi register,
# which assembler can't recognize as VIS specific...
$code =~ s/fzeros\s+%f([0-9]+)/
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
/gem;
print $code;
# flush
close STDOUT;

View file

@ -0,0 +1,251 @@
#! /usr/bin/env perl
# Copyright 2006-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Wrapper around 'rep montmul', VIA-specific instruction accessing
# PadLock Montgomery Multiplier. The wrapper is designed as drop-in
# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
#
# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
# different software configurations on 1.5GHz VIA Esther processor.
# Lines marked with "software integer" denote performance of hand-
# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
# refers to hand-coded SSE2 Montgomery multiplication procedure found
# OpenSSL 0.9.9. "Hardware VIA SDK" refers to padlock_pmm routine from
# Padlock SDK 2.0.1 available for download from VIA, which naturally
# utilizes the magic 'repz montmul' instruction. And finally "hardware
# this" refers to *this* implementation which also uses 'repz montmul'
#
# sign verify sign/s verify/s
# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
#
# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
#
# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
#
# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
#
# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
#
# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
#
# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
#
# To give you some other reference point here is output for 2.4GHz P4
# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
# SSE2" in above terms.
#
# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
#
# Conclusions:
# - VIA SDK leaves a *lot* of room for improvement (which this
# implementation successfully fills:-);
# - 'rep montmul' gives up to >3x performance improvement depending on
# key length;
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0]);
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
$func="bn_mul_mont_padlock";
$pad=16*1; # amount of reserved bytes on top of every vector
# stack layout
$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
$A=&DWP(4,"esp");
$B=&DWP(8,"esp");
$T=&DWP(12,"esp");
$M=&DWP(16,"esp");
$scratch=&DWP(20,"esp");
$rp=&DWP(24,"esp"); # these are mine
$sp=&DWP(28,"esp");
# &DWP(32,"esp") # 32 byte scratch area
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that SDK suggests to unconditionally allocate 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example 1024 bit private RSA key operations suffer >30%
# penalty. I allocate only as much as actually required...
&function_begin($func);
&xor ("eax","eax");
&mov ("ecx",&wparam(5)); # num
# meet VIA's limitations for num [note that the specification
# expresses them in bits, while we work with amount of 32-bit words]
&test ("ecx",3);
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
&cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
&cld ();
&mov ("edi",&wparam(0)); # rp
&mov ("eax",&wparam(1)); # ap
&mov ("ebx",&wparam(2)); # bp
&mov ("edx",&wparam(3)); # np
&mov ("esi",&wparam(4)); # n0
&mov ("esi",&DWP(0,"esi")); # *n0
&lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
&lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
&neg ("ebp");
&add ("ebp","esp");
&and ("ebp",-64); # align to cache-line
&xchg ("ebp","esp"); # alloca
&mov ($rp,"edi"); # save rp
&mov ($sp,"ebp"); # save esp
&mov ($mZeroPrime,"esi");
&lea ("esi",&DWP(64,"esp")); # tp
&mov ($T,"esi");
&lea ("edi",&DWP(32,"esp")); # scratch area
&mov ($scratch,"edi");
&mov ("esi","eax");
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
&xor ("eax","eax");
&mov ("ecx","ebp");
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded ap copy...
&mov ("ecx","ebp");
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded bp copy...
&mov ("ecx","ebp");
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
&neg ("ebp");
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&xor ("edx","edx"); # i=0 and clear CF
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("sub")); # doesn't affect CF!
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
&mov ("ecx","edx"); # num
&mov ("edx",0); # i=0
&set_label("copy",8);
&mov ("ebx",&DWP(0,"esi","edx",4));
&mov ("eax",&DWP(0,"edi","edx",4));
&mov (&DWP(0,"esi","edx",4),"ecx"); # zap tp
&cmovc ("eax","ebx");
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
&mov ("ebp",$sp);
&xor ("eax","eax");
&mov ("ecx",64/4);
&mov ("edi","esp"); # zap frame including scratch area
&data_byte(0xf3,0xab); # rep stosl, bzero
# zap copies of ap, bp and np
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
&inc ("eax"); # signal "done"
&popf ();
&set_label("leave");
&function_end($func);
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;

View file

@ -0,0 +1,384 @@
#! /usr/bin/env perl
# Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
$output = pop;
open STDOUT,">$output";
$frame = "STACK_FRAME";
$bias = "STACK_BIAS";
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
___
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
# int bn_mul_mont(
$rp="%o0"; # BN_ULONG *rp,
$ap="%o1"; # const BN_ULONG *ap,
$bp="%o2"; # const BN_ULONG *bp,
$np="%o3"; # const BN_ULONG *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is even
# and >=6
$code.=<<___;
.globl bn_mul_mont_vis3
.align 32
bn_mul_mont_vis3:
add %sp, $bias, %g4 ! real top of stack
sll $num, 2, $num ! size in bytes
add $num, 63, %g5
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
add %g5, %g5, %g1
add %g5, %g1, %g1 ! 3*buffer size
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, $frame, %g1 ! new top of stack
sub %g1, %g4, %g1
save %sp, %g1, %sp
___
# +-------------------------------+<----- %sp
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 tmp[0] |
# +-------------------------------+
# . .
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 ap[1..0] | converted ap[]
# +-------------------------------+
# | __int64 np[1..0] | converted np[]
# +-------------------------------+
# | __int64 ap[3..2] |
# . .
# . .
# +-------------------------------+
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, $bias+$frame, $tp
ld [$n0p+4], $t1
add $tp, %g5, $anp
ld [$bp+0], $t2 ! m0=bp[0]
sllx $t1, 32, $n0
ld [$bp+4], $t3
or $t0, $n0, $n0
add $bp, 8, $bp
ld [$ap+0], $t0 ! ap[0]
sllx $t3, 32, $m0
ld [$ap+4], $t1
or $t2, $m0, $m0
ld [$ap+8], $t2 ! ap[1]
sllx $t1, 32, $aj
ld [$ap+12], $t3
or $t0, $aj, $aj
add $ap, 16, $ap
stx $aj, [$anp] ! converted ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ld [$np+0], $t0 ! np[0]
sllx $t3, 32, $aj
ld [$np+4], $t1
or $t2, $aj, $aj
ld [$np+8], $t2 ! np[1]
sllx $t1, 32, $nj
ld [$np+12], $t3
or $t0, $nj, $nj
add $np, 16, $np
stx $nj, [$anp+8] ! converted np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
stx $aj, [$anp+16] ! converted ap[1]
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
sllx $t3, 32, $nj
or $t2, $nj, $nj
stx $nj, [$anp+24] ! converted np[1]
add $anp, 32, $anp
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .L1st
sub $num, 24, $cnt ! cnt=num-3
.align 16
.L1st:
ld [$ap+0], $t0 ! ap[j]
addcc $alo, $hi0, $lo0
ld [$ap+4], $t1
addxc $aj, %g0, $hi0
sllx $t1, 32, $aj
add $ap, 8, $ap
or $t0, $aj, $aj
stx $aj, [$anp] ! converted ap[j]
ld [$np+0], $t2 ! np[j]
addcc $nlo, $hi1, $lo1
ld [$np+4], $t3
addxc $nj, %g0, $hi1 ! nhi=nj
sllx $t3, 32, $nj
add $np, 8, $np
mulx $aj, $m0, $alo ! ap[j]*bp[0]
or $t2, $nj, $nj
umulxhi $aj, $m0, $aj ! ahi=aj
stx $nj, [$anp+8] ! converted np[j]
add $anp, 16, $anp ! anp++
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st
sub $cnt, 8, $cnt ! j--
!.L1st
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stx $hi1, [$tp]
add $tp, 8, $tp
ba .Louter
sub $num, 16, $i ! i=num-2
.align 16
.Louter:
ld [$bp+0], $t2 ! m0=bp[i]
ld [$bp+4], $t3
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
add $bp, 8, $bp
sllx $t3, 32, $m0
ldx [$anp+0], $aj ! ap[0]
or $t2, $m0, $m0
ldx [$anp+8], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$anp+16], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$anp+24], $nj ! np[1]
add $anp, 32, $anp
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .Linner
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$anp+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$anp+8], $nj ! np[j]
add $anp, 16, $anp
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner
sub $cnt, 8, $cnt
!.Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp
brnz,pt $i, .Louter
sub $i, 8, $i
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
ba .Lsub
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
.align 16
.Lsub:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$anp+8], $nj
add $anp, 16, $anp
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub
sub $cnt, 8, $cnt
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
sub $rp, $num, $rp
subccc $ovf, %g0, $ovf ! handle upmost overflow bit
ba .Lcopy
sub $num, 8, $cnt
.align 16
.Lcopy: ! conditional copy
ld [$tp+0], $t0
ld [$tp+4], $t1
ld [$rp+0], $t2
ld [$rp+4], $t3
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
movcs %icc, $t0, $t2
movcs %icc, $t1, $t3
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
brnz $cnt, .Lcopy
sub $cnt, 8, $cnt
mov 1, %o0
ret
restore
.type bn_mul_mont_vis3, #function
.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"umulxhi" => 0x016 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

View file

@ -0,0 +1,325 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later
# and PCLMULQDQ suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII 16%-30%
# P4 12%-12%
# Opteron 18%-40%
# Core2 19%-44%
# Atom 38%-64%
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
# is more and more dominated by other subroutines, most notably by
# BN_GF2m_mod[_mul]_arr...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&and ($a1,0x3fffffff);
&lea ($a4,&DWP(0,$a2,$a2));
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&movd ($A,$a);
&movd ($B,$b);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&pxor ($B31,$B31);
&pxor ($B30,$B30);
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&pcmpgtd($B31,$A); # broadcast 31st bit
&paddd ($A,$A); # $A<<=1
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&pand ($B31,$B);
&pcmpgtd($B30,$A); # broadcast 30th bit
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&psllq ($B31,31);
&pand ($B30,$B);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&mov (@i[0],0x7);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($a4,@i[0]);
&and (@i[0],$b);
&shr ($b,3);
&mov (@i[1],$a4);
&psllq ($B30,30);
&and (@i[1],$b);
&shr ($b,3);
&movd ($R,&DWP(0,"esp",@i[0],4));
&mov (@i[0],$a4);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],$a4);
&psllq (@T[1],3*$n);
&and (@i[1],$b);
&shr ($b,3);
&pxor ($R,@T[1]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&pxor ($R,$B30);
&psllq (@T[1],3*$n++);
&pxor ($R,@T[1]);
&movd (@T[0],&DWP(0,"esp",@i[0],4));
&pxor ($R,$B31);
&psllq (@T[0],3*$n);
&add ("esp",32+4);
&pxor ($R,@T[0]);
&ret ();
&function_end_B("_mul_1x1_mmx");
}
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");
&function_begin_B("_mul_1x1_ialu");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&lea ($a4,&DWP(0,"",$a,4));
&and ($a1,0x3fffffff);
&lea (@i[1],&DWP(0,$lo,$lo));
&sar ($lo,31); # broadcast 31st bit
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&sar (@i[1],31); # broadcast 30th bit
&and ($lo,$b);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&and (@i[1],$b);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($hi,$lo);
&shl ($lo,31);
&mov (@T[0],@i[1]);
&shr ($hi,1);
&mov (@i[0],0x7);
&shl (@i[1],30);
&and (@i[0],$b);
&shr (@T[0],2);
&xor ($lo,@i[1]);
&shr ($b,3);
&mov (@i[1],0x7); # 5-byte instruction!?
&and (@i[1],$b);
&shr ($b,3);
&xor ($hi,@T[0]);
&xor ($lo,&DWP(0,"esp",@i[0],4));
&mov (@i[0],0x7);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],0x7);
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&and (@i[1],$b);
&shr (@T[0],32-3*$n);
&xor ($lo,@T[1]);
&shr ($b,3);
&xor ($hi,@T[0]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&mov (@i[1],&DWP(0,"esp",@i[0],4));
&shr (@T[0],32-3*$n); $n++;
&mov (@i[0],@i[1]);
&xor ($lo,@T[1]);
&shl (@i[1],3*$n);
&xor ($hi,@T[0]);
&shr (@i[0],32-3*$n);
&xor ($lo,@i[1]);
&xor ($hi,@i[0]);
&add ("esp",32+4);
&ret ();
&function_end_B("_mul_1x1_ialu");
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
&picmeup("edx","OPENSSL_ia32cap_P");
&mov ("eax",&DWP(0,"edx"));
&mov ("edx",&DWP(4,"edx"));
&test ("eax",1<<23); # check MMX bit
&jz (&label("ialu"));
if ($sse2) {
&test ("eax",1<<24); # check FXSR bit
&jz (&label("mmx"));
&test ("edx",1<<1); # check PCLMULQDQ bit
&jz (&label("mmx"));
&movups ("xmm0",&QWP(8,"esp"));
&shufps ("xmm0","xmm0",0b10110001);
&pclmulqdq ("xmm0","xmm0",1);
&mov ("eax",&DWP(4,"esp"));
&movups (&QWP(0,"eax"),"xmm0");
&ret ();
&set_label("mmx",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
&pop ("edi");
&psrlq ($A,32);
&pop ("esi");
&pxor ($R,"mm6");
&pop ("ebx");
&pxor ($A,"mm7");
&movq (&QWP(0,$a),$R);
&pop ("ebp");
&movq (&QWP(8,$a),$A);
&emms ();
&ret ();
&set_label("ialu",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&stack_push(4+1);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
&mov (@r[0],&DWP(0,"esp"));
&mov (@r[1],&DWP(4,"esp"));
&mov (@r[2],&DWP(8,"esp"));
&mov (@r[3],&DWP(12,"esp"));
&xor ($lo,$hi);
&xor ($hi,@r[1]);
&xor ($lo,@r[0]);
&mov (&DWP(0,"ebp"),@r[0]);
&xor ($hi,@r[2]);
&mov (&DWP(12,"ebp"),@r[3]);
&xor ($lo,@r[3]);
&stack_pop(4+1);
&xor ($hi,@r[3]);
&pop ("edi");
&xor ($lo,$hi);
&pop ("esi");
&mov (&DWP(8,"ebp"),$hi);
&pop ("ebx");
&mov (&DWP(4,"ebp"),$lo);
&pop ("ebp");
&ret ();
&function_end_B("bn_GF2m_mul_2x2");
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;

View file

@ -0,0 +1,631 @@
#! /usr/bin/env perl
# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0]);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
# minimize cache contention by arranging 2K window between stack
# pointer and ap argument [np is also position sensitive vector,
# but it's assumed to be near ap, as it's allocated at ~same
# time].
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("ebp",-64); # align to cache line
# An OS-agnostic version of __chkstk.
#
# Some OSes (Windows) insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
# all key lengths on modern Intel cores, it's still more
# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
# means compared to the original integer-only assembler.
# 512-bit RSA sign is better by ~40%, but that's about all
# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&mov ("edx",-1);
&xor ("edx","eax");
&jmp (&label("copy"));
&set_label("copy",16); # conditional copy
&mov ($tp,&DWP($frame,"esp",$num,4));
&mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&and ($tp,"eax");
&and ($np,"edx");
&or ($np,$tp);
&mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;

View file

@ -0,0 +1,643 @@
/*
* Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
/*-
* x86_64 BIGNUM accelerator version 0.1, December 2002.
*
* Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
* project.
*
* Rights for redistribution and usage in source and binary forms are
* granted according to the OpenSSL license. Warranty of any kind is
* disclaimed.
*
* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features own ABI which I'm not familiar with. This is
* why I decided to let the compiler take care of subroutine
* prologue/epilogue as well as register allocation. For reference.
* Win64 implements different ABI for AMD64, different from Linux.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For the reference. IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
# undef mul
# undef mul_add
/*-
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
# define mul_add(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"m"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+m"(r),"+d"(high) \
: "r"(carry),"g"(0) \
: "cc"); \
carry=high; \
} while (0)
# define mul(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"g"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
(r)=carry, carry=high; \
} while (0)
# undef sqr
# define sqr(r0,r1,a) \
asm ("mulq %2" \
: "=a"(r0),"=d"(r1) \
: "a"(a) \
: "cc");
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return c1;
while (num & ~3) {
mul_add(rp[0], ap[0], w, c1);
mul_add(rp[1], ap[1], w, c1);
mul_add(rp[2], ap[2], w, c1);
mul_add(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul_add(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul_add(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul_add(rp[2], ap[2], w, c1);
return c1;
}
return c1;
}
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return c1;
while (num & ~3) {
mul(rp[0], ap[0], w, c1);
mul(rp[1], ap[1], w, c1);
mul(rp[2], ap[2], w, c1);
mul(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul(rp[2], ap[2], w, c1);
}
return c1;
}
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0)
return;
while (n & ~3) {
sqr(r[0], r[1], a[0]);
sqr(r[2], r[3], a[1]);
sqr(r[4], r[5], a[2]);
sqr(r[6], r[7], a[3]);
a += 4;
r += 8;
n -= 4;
}
if (n) {
sqr(r[0], r[1], a[0]);
if (--n == 0)
return;
sqr(r[2], r[3], a[1]);
if (--n == 0)
return;
sqr(r[4], r[5], a[2]);
}
}
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
BN_ULONG ret, waste;
asm("divq %4":"=a"(ret), "=d"(waste)
: "a"(l), "d"(h), "r"(d)
: "cc");
return ret;
}
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear carry */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" dec %1 \n"
" jnz 1b \n"
" sbbq %0,%0 \n"
:"=&r" (ret), "+c"(n), "+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# ifndef SIMICS
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear borrow */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" dec %1 \n"
" jnz 1b \n"
" sbbq %0,%0 \n"
:"=&r" (ret), "+c"(n), "+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
# define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
BN_ULONG t1, t2;
int c = 0;
if (n <= 0)
return (BN_ULONG)0;
for (;;) {
t1 = a[0];
t2 = b[0];
r[0] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[1];
t2 = b[1];
r[1] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[2];
t2 = b[2];
r[2] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[3];
t2 = b[3];
r[3] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
a += 4;
b += 4;
r += 4;
}
return c;
}
# endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
* c=(c2,c1,c0)
*/
/*
* Keep in mind that carrying into high part of multiplication result
* can not overflow, because it cannot be all-ones.
*/
# if 0
/* original macros are kept for reference purposes */
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi, tt; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; tt = hi+((c0<lo)?1:0); \
c1 += tt; c2 += (c1<tt)?1:0; \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG ta = (a)[i]; \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,ta); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# else
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# endif
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[4], b[0], c2, c3, c1);
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
mul_add_c(a[0], b[4], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[0], b[5], c3, c1, c2);
mul_add_c(a[1], b[4], c3, c1, c2);
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
mul_add_c(a[4], b[1], c3, c1, c2);
mul_add_c(a[5], b[0], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[6], b[0], c1, c2, c3);
mul_add_c(a[5], b[1], c1, c2, c3);
mul_add_c(a[4], b[2], c1, c2, c3);
mul_add_c(a[3], b[3], c1, c2, c3);
mul_add_c(a[2], b[4], c1, c2, c3);
mul_add_c(a[1], b[5], c1, c2, c3);
mul_add_c(a[0], b[6], c1, c2, c3);
r[6] = c1;
c1 = 0;
mul_add_c(a[0], b[7], c2, c3, c1);
mul_add_c(a[1], b[6], c2, c3, c1);
mul_add_c(a[2], b[5], c2, c3, c1);
mul_add_c(a[3], b[4], c2, c3, c1);
mul_add_c(a[4], b[3], c2, c3, c1);
mul_add_c(a[5], b[2], c2, c3, c1);
mul_add_c(a[6], b[1], c2, c3, c1);
mul_add_c(a[7], b[0], c2, c3, c1);
r[7] = c2;
c2 = 0;
mul_add_c(a[7], b[1], c3, c1, c2);
mul_add_c(a[6], b[2], c3, c1, c2);
mul_add_c(a[5], b[3], c3, c1, c2);
mul_add_c(a[4], b[4], c3, c1, c2);
mul_add_c(a[3], b[5], c3, c1, c2);
mul_add_c(a[2], b[6], c3, c1, c2);
mul_add_c(a[1], b[7], c3, c1, c2);
r[8] = c3;
c3 = 0;
mul_add_c(a[2], b[7], c1, c2, c3);
mul_add_c(a[3], b[6], c1, c2, c3);
mul_add_c(a[4], b[5], c1, c2, c3);
mul_add_c(a[5], b[4], c1, c2, c3);
mul_add_c(a[6], b[3], c1, c2, c3);
mul_add_c(a[7], b[2], c1, c2, c3);
r[9] = c1;
c1 = 0;
mul_add_c(a[7], b[3], c2, c3, c1);
mul_add_c(a[6], b[4], c2, c3, c1);
mul_add_c(a[5], b[5], c2, c3, c1);
mul_add_c(a[4], b[6], c2, c3, c1);
mul_add_c(a[3], b[7], c2, c3, c1);
r[10] = c2;
c2 = 0;
mul_add_c(a[4], b[7], c3, c1, c2);
mul_add_c(a[5], b[6], c3, c1, c2);
mul_add_c(a[6], b[5], c3, c1, c2);
mul_add_c(a[7], b[4], c3, c1, c2);
r[11] = c3;
c3 = 0;
mul_add_c(a[7], b[5], c1, c2, c3);
mul_add_c(a[6], b[6], c1, c2, c3);
mul_add_c(a[5], b[7], c1, c2, c3);
r[12] = c1;
c1 = 0;
mul_add_c(a[6], b[7], c2, c3, c1);
mul_add_c(a[7], b[6], c2, c3, c1);
r[13] = c2;
c2 = 0;
mul_add_c(a[7], b[7], c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[3], b[3], c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
sqr_add_c2(a, 4, 0, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 5, 0, c3, c1, c2);
sqr_add_c2(a, 4, 1, c3, c1, c2);
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
sqr_add_c2(a, 4, 2, c1, c2, c3);
sqr_add_c2(a, 5, 1, c1, c2, c3);
sqr_add_c2(a, 6, 0, c1, c2, c3);
r[6] = c1;
c1 = 0;
sqr_add_c2(a, 7, 0, c2, c3, c1);
sqr_add_c2(a, 6, 1, c2, c3, c1);
sqr_add_c2(a, 5, 2, c2, c3, c1);
sqr_add_c2(a, 4, 3, c2, c3, c1);
r[7] = c2;
c2 = 0;
sqr_add_c(a, 4, c3, c1, c2);
sqr_add_c2(a, 5, 3, c3, c1, c2);
sqr_add_c2(a, 6, 2, c3, c1, c2);
sqr_add_c2(a, 7, 1, c3, c1, c2);
r[8] = c3;
c3 = 0;
sqr_add_c2(a, 7, 2, c1, c2, c3);
sqr_add_c2(a, 6, 3, c1, c2, c3);
sqr_add_c2(a, 5, 4, c1, c2, c3);
r[9] = c1;
c1 = 0;
sqr_add_c(a, 5, c2, c3, c1);
sqr_add_c2(a, 6, 4, c2, c3, c1);
sqr_add_c2(a, 7, 3, c2, c3, c1);
r[10] = c2;
c2 = 0;
sqr_add_c2(a, 7, 4, c3, c1, c2);
sqr_add_c2(a, 6, 5, c3, c1, c2);
r[11] = c3;
c3 = 0;
sqr_add_c(a, 6, c1, c2, c3);
sqr_add_c2(a, 7, 5, c1, c2, c3);
r[12] = c1;
c1 = 0;
sqr_add_c2(a, 7, 6, c2, c3, c1);
r[13] = c2;
c2 = 0;
sqr_add_c(a, 7, c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
#endif

View file

@ -0,0 +1,424 @@
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: code suitable
# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and µ-arch to another.
# Vanilla code path is at most 20% faster than compiler-generated code
# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
# all CPU time is burnt in it...
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
($lo,$hi)=("%rax","%rdx"); $a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");
$code.=<<___;
.text
.type _mul_1x1,\@abi-omnipotent
.align 16
_mul_1x1:
.cfi_startproc
sub \$128+8,%rsp
.cfi_adjust_cfa_offset 128+8
mov \$-1,$a1
lea ($a,$a),$i0
shr \$3,$a1
lea (,$a,4),$i1
and $a,$a1 # a1=a&0x1fffffffffffffff
lea (,$a,8),$a8
sar \$63,$a # broadcast 63rd bit
lea ($a1,$a1),$a2
sar \$63,$i0 # broadcast 62nd bit
lea (,$a1,4),$a4
and $b,$a
sar \$63,$i1 # broadcast 61st bit
mov $a,$hi # $a is $lo
shl \$63,$lo
and $b,$i0
shr \$1,$hi
mov $i0,$t1
shl \$62,$i0
and $b,$i1
shr \$2,$t1
xor $i0,$lo
mov $i1,$t0
shl \$61,$i1
xor $t1,$hi
shr \$3,$t0
xor $i1,$lo
xor $t0,$hi
mov $a1,$a12
movq \$0,0(%rsp) # tab[0]=0
xor $a2,$a12 # a1^a2
mov $a1,8(%rsp) # tab[1]=a1
mov $a4,$a48
mov $a2,16(%rsp) # tab[2]=a2
xor $a8,$a48 # a4^a8
mov $a12,24(%rsp) # tab[3]=a1^a2
xor $a4,$a1
mov $a4,32(%rsp) # tab[4]=a4
xor $a4,$a2
mov $a1,40(%rsp) # tab[5]=a1^a4
xor $a4,$a12
mov $a2,48(%rsp) # tab[6]=a2^a4
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
xor $a48,$a2 # a2^a4^a4^a8=a1^a8
mov $a8,64(%rsp) # tab[8]=a8
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
mov $a1,72(%rsp) # tab[9]=a1^a8
xor $a4,$a1 # a1^a8^a4
mov $a2,80(%rsp) # tab[10]=a2^a8
xor $a4,$a2 # a2^a8^a4
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
xor $a4,$a12 # a1^a2^a8^a4
mov $a48,96(%rsp) # tab[12]=a4^a8
mov $mask,$i0
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
and $b,$i0
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
shr \$4,$b
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
mov $mask,$i1
and $b,$i1
shr \$4,$b
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
mov $mask,$i0
and $b,$i0
shr \$4,$b
___
for ($n=1;$n<8;$n++) {
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $mask,$i1
mov $t1,$t0
shl \$`8*$n-4`,$t1
and $b,$i1
movq (%rsp,$i0,8),$Tx
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
pslldq \$$n,$Tx
mov $mask,$i0
shr \$4,$b
xor $t0,$hi
and $b,$i0
shr \$4,$b
pxor $Tx,$R
___
}
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $t1,$t0
shl \$`8*$n-4`,$t1
movq $R,$i0
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
psrldq \$8,$R
xor $t0,$hi
movq $R,$i1
xor $i0,$lo
xor $i1,$hi
add \$128+8,%rsp
.cfi_adjust_cfa_offset -128-8
ret
.Lend_mul_1x1:
.cfi_endproc
.size _mul_1x1,.-_mul_1x1
___
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@abi-omnipotent
.align 16
bn_GF2m_mul_2x2:
.cfi_startproc
mov %rsp,%rax
mov OPENSSL_ia32cap_P(%rip),%r10
bt \$33,%r10
jnc .Lvanilla_mul_2x2
movq $a1,%xmm0
movq $b1,%xmm1
movq $a0,%xmm2
___
$code.=<<___ if ($win64);
movq 40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
movq $b0,%xmm3
___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0($rp)
movdqu %xmm0,16($rp)
ret
.align 16
.Lvanilla_mul_2x2:
lea -8*17(%rsp),%rsp
.cfi_adjust_cfa_offset 8*17
___
$code.=<<___ if ($win64);
mov `8*17+40`(%rsp),$b0
mov %rdi,8*15(%rsp)
mov %rsi,8*16(%rsp)
___
$code.=<<___;
mov %r14,8*10(%rsp)
.cfi_rel_offset %r14,8*10
mov %r13,8*11(%rsp)
.cfi_rel_offset %r13,8*11
mov %r12,8*12(%rsp)
.cfi_rel_offset %r12,8*12
mov %rbp,8*13(%rsp)
.cfi_rel_offset %rbp,8*13
mov %rbx,8*14(%rsp)
.cfi_rel_offset %rbx,8*14
.Lbody_mul_2x2:
mov $rp,32(%rsp) # save the arguments
mov $a1,40(%rsp)
mov $a0,48(%rsp)
mov $b1,56(%rsp)
mov $b0,64(%rsp)
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
mov 40(%rsp),$a
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
mov 0(%rsp),@r[0]
mov 8(%rsp),@r[1]
mov 16(%rsp),@r[2]
mov 24(%rsp),@r[3]
mov 32(%rsp),%rbp
xor $hi,$lo
xor @r[1],$hi
xor @r[0],$lo
mov @r[0],0(%rbp)
xor @r[2],$hi
mov @r[3],24(%rbp)
xor @r[3],$lo
xor @r[3],$hi
xor $hi,$lo
mov $hi,16(%rbp)
mov $lo,8(%rbp)
mov 8*10(%rsp),%r14
.cfi_restore %r14
mov 8*11(%rsp),%r13
.cfi_restore %r13
mov 8*12(%rsp),%r12
.cfi_restore %r12
mov 8*13(%rsp),%rbp
.cfi_restore %rbp
mov 8*14(%rsp),%rbx
.cfi_restore %rbx
___
$code.=<<___ if ($win64);
mov 8*15(%rsp),%rdi
mov 8*16(%rsp),%rsi
___
$code.=<<___;
lea 8*17(%rsp),%rsp
.cfi_adjust_cfa_offset -8*17
.Lepilogue_mul_2x2:
ret
.Lend_mul_2x2:
.cfi_endproc
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lbody_mul_2x2(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
lea .Lepilogue_mul_2x2(%rip),%r10
cmp %r10,%rbx # context->Rip>="epilogue" label
jae .Lin_prologue
mov 8*10(%rax),%r14 # mimic epilogue
mov 8*11(%rax),%r13
mov 8*12(%rax),%r12
mov 8*13(%rax),%rbp
mov 8*14(%rax),%rbx
mov 8*15(%rax),%rdi
mov 8*16(%rax),%rsi
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
lea 8*17(%rax),%rax
.Lin_prologue:
mov %rax,152($context) # restore context->Rsp
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva _mul_1x1
.rva .Lend_mul_1x1
.rva .LSEH_info_1x1
.rva .Lvanilla_mul_2x2
.rva .Lend_mul_2x2
.rva .LSEH_info_2x2
.section .xdata
.align 8
.LSEH_info_1x1:
.byte 0x01,0x07,0x02,0x00
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
.LSEH_info_2x2:
.byte 9,0,0,0
.rva se_handler
___
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,171 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
/* signed add of b to a. */
int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
int ret, r_neg, cmp_res;
bn_check_top(a);
bn_check_top(b);
if (a->neg == b->neg) {
r_neg = a->neg;
ret = BN_uadd(r, a, b);
} else {
cmp_res = BN_ucmp(a, b);
if (cmp_res > 0) {
r_neg = a->neg;
ret = BN_usub(r, a, b);
} else if (cmp_res < 0) {
r_neg = b->neg;
ret = BN_usub(r, b, a);
} else {
r_neg = 0;
BN_zero(r);
ret = 1;
}
}
r->neg = r_neg;
bn_check_top(r);
return ret;
}
/* signed sub of b from a. */
int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
int ret, r_neg, cmp_res;
bn_check_top(a);
bn_check_top(b);
if (a->neg != b->neg) {
r_neg = a->neg;
ret = BN_uadd(r, a, b);
} else {
cmp_res = BN_ucmp(a, b);
if (cmp_res > 0) {
r_neg = a->neg;
ret = BN_usub(r, a, b);
} else if (cmp_res < 0) {
r_neg = !b->neg;
ret = BN_usub(r, b, a);
} else {
r_neg = 0;
BN_zero(r);
ret = 1;
}
}
r->neg = r_neg;
bn_check_top(r);
return ret;
}
/* unsigned add of b to a, r can be equal to a or b. */
int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
int max, min, dif;
const BN_ULONG *ap, *bp;
BN_ULONG *rp, carry, t1, t2;
bn_check_top(a);
bn_check_top(b);
if (a->top < b->top) {
const BIGNUM *tmp;
tmp = a;
a = b;
b = tmp;
}
max = a->top;
min = b->top;
dif = max - min;
if (bn_wexpand(r, max + 1) == NULL)
return 0;
r->top = max;
ap = a->d;
bp = b->d;
rp = r->d;
carry = bn_add_words(rp, ap, bp, min);
rp += min;
ap += min;
while (dif) {
dif--;
t1 = *(ap++);
t2 = (t1 + carry) & BN_MASK2;
*(rp++) = t2;
carry &= (t2 == 0);
}
*rp = carry;
r->top += carry;
r->neg = 0;
bn_check_top(r);
return 1;
}
/* unsigned subtraction of b from a, a must be larger than b. */
int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
{
int max, min, dif;
BN_ULONG t1, t2, borrow, *rp;
const BN_ULONG *ap, *bp;
bn_check_top(a);
bn_check_top(b);
max = a->top;
min = b->top;
dif = max - min;
if (dif < 0) { /* hmm... should not be happening */
BNerr(BN_F_BN_USUB, BN_R_ARG2_LT_ARG3);
return 0;
}
if (bn_wexpand(r, max) == NULL)
return 0;
ap = a->d;
bp = b->d;
rp = r->d;
borrow = bn_sub_words(rp, ap, bp, min);
ap += min;
rp += min;
while (dif) {
dif--;
t1 = *(ap++);
t2 = (t1 - borrow) & BN_MASK2;
*(rp++) = t2;
borrow &= (t1 == 0);
}
while (max && *--rp == 0)
max--;
r->top = max;
r->neg = 0;
bn_pollute(r);
return 1;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,312 @@
/*
* Copyright 1998-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/opensslconf.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
#define BN_BLINDING_COUNTER 32
struct bn_blinding_st {
BIGNUM *A;
BIGNUM *Ai;
BIGNUM *e;
BIGNUM *mod; /* just a reference */
CRYPTO_THREAD_ID tid;
int counter;
unsigned long flags;
BN_MONT_CTX *m_ctx;
int (*bn_mod_exp) (BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
CRYPTO_RWLOCK *lock;
};
BN_BLINDING *BN_BLINDING_new(const BIGNUM *A, const BIGNUM *Ai, BIGNUM *mod)
{
BN_BLINDING *ret = NULL;
bn_check_top(mod);
if ((ret = OPENSSL_zalloc(sizeof(*ret))) == NULL) {
BNerr(BN_F_BN_BLINDING_NEW, ERR_R_MALLOC_FAILURE);
return NULL;
}
ret->lock = CRYPTO_THREAD_lock_new();
if (ret->lock == NULL) {
BNerr(BN_F_BN_BLINDING_NEW, ERR_R_MALLOC_FAILURE);
OPENSSL_free(ret);
return NULL;
}
BN_BLINDING_set_current_thread(ret);
if (A != NULL) {
if ((ret->A = BN_dup(A)) == NULL)
goto err;
}
if (Ai != NULL) {
if ((ret->Ai = BN_dup(Ai)) == NULL)
goto err;
}
/* save a copy of mod in the BN_BLINDING structure */
if ((ret->mod = BN_dup(mod)) == NULL)
goto err;
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(ret->mod, BN_FLG_CONSTTIME);
/*
* Set the counter to the special value -1 to indicate that this is
* never-used fresh blinding that does not need updating before first
* use.
*/
ret->counter = -1;
return ret;
err:
BN_BLINDING_free(ret);
return NULL;
}
void BN_BLINDING_free(BN_BLINDING *r)
{
if (r == NULL)
return;
BN_free(r->A);
BN_free(r->Ai);
BN_free(r->e);
BN_free(r->mod);
CRYPTO_THREAD_lock_free(r->lock);
OPENSSL_free(r);
}
int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
{
int ret = 0;
if ((b->A == NULL) || (b->Ai == NULL)) {
BNerr(BN_F_BN_BLINDING_UPDATE, BN_R_NOT_INITIALIZED);
goto err;
}
if (b->counter == -1)
b->counter = 0;
if (++b->counter == BN_BLINDING_COUNTER && b->e != NULL &&
!(b->flags & BN_BLINDING_NO_RECREATE)) {
/* re-create blinding parameters */
if (!BN_BLINDING_create_param(b, NULL, NULL, ctx, NULL, NULL))
goto err;
} else if (!(b->flags & BN_BLINDING_NO_UPDATE)) {
if (b->m_ctx != NULL) {
if (!bn_mul_mont_fixed_top(b->Ai, b->Ai, b->Ai, b->m_ctx, ctx)
|| !bn_mul_mont_fixed_top(b->A, b->A, b->A, b->m_ctx, ctx))
goto err;
} else {
if (!BN_mod_mul(b->Ai, b->Ai, b->Ai, b->mod, ctx)
|| !BN_mod_mul(b->A, b->A, b->A, b->mod, ctx))
goto err;
}
}
ret = 1;
err:
if (b->counter == BN_BLINDING_COUNTER)
b->counter = 0;
return ret;
}
int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
{
return BN_BLINDING_convert_ex(n, NULL, b, ctx);
}
int BN_BLINDING_convert_ex(BIGNUM *n, BIGNUM *r, BN_BLINDING *b, BN_CTX *ctx)
{
int ret = 1;
bn_check_top(n);
if ((b->A == NULL) || (b->Ai == NULL)) {
BNerr(BN_F_BN_BLINDING_CONVERT_EX, BN_R_NOT_INITIALIZED);
return 0;
}
if (b->counter == -1)
/* Fresh blinding, doesn't need updating. */
b->counter = 0;
else if (!BN_BLINDING_update(b, ctx))
return 0;
if (r != NULL && (BN_copy(r, b->Ai) == NULL))
return 0;
if (b->m_ctx != NULL)
ret = BN_mod_mul_montgomery(n, n, b->A, b->m_ctx, ctx);
else
ret = BN_mod_mul(n, n, b->A, b->mod, ctx);
return ret;
}
int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
{
return BN_BLINDING_invert_ex(n, NULL, b, ctx);
}
int BN_BLINDING_invert_ex(BIGNUM *n, const BIGNUM *r, BN_BLINDING *b,
BN_CTX *ctx)
{
int ret;
bn_check_top(n);
if (r == NULL && (r = b->Ai) == NULL) {
BNerr(BN_F_BN_BLINDING_INVERT_EX, BN_R_NOT_INITIALIZED);
return 0;
}
if (b->m_ctx != NULL) {
/* ensure that BN_mod_mul_montgomery takes pre-defined path */
if (n->dmax >= r->top) {
size_t i, rtop = r->top, ntop = n->top;
BN_ULONG mask;
for (i = 0; i < rtop; i++) {
mask = (BN_ULONG)0 - ((i - ntop) >> (8 * sizeof(i) - 1));
n->d[i] &= mask;
}
mask = (BN_ULONG)0 - ((rtop - ntop) >> (8 * sizeof(ntop) - 1));
/* always true, if (rtop >= ntop) n->top = r->top; */
n->top = (int)(rtop & ~mask) | (ntop & mask);
n->flags |= (BN_FLG_FIXED_TOP & ~mask);
}
ret = BN_mod_mul_montgomery(n, n, r, b->m_ctx, ctx);
} else {
ret = BN_mod_mul(n, n, r, b->mod, ctx);
}
bn_check_top(n);
return ret;
}
int BN_BLINDING_is_current_thread(BN_BLINDING *b)
{
return CRYPTO_THREAD_compare_id(CRYPTO_THREAD_get_current_id(), b->tid);
}
void BN_BLINDING_set_current_thread(BN_BLINDING *b)
{
b->tid = CRYPTO_THREAD_get_current_id();
}
int BN_BLINDING_lock(BN_BLINDING *b)
{
return CRYPTO_THREAD_write_lock(b->lock);
}
int BN_BLINDING_unlock(BN_BLINDING *b)
{
return CRYPTO_THREAD_unlock(b->lock);
}
unsigned long BN_BLINDING_get_flags(const BN_BLINDING *b)
{
return b->flags;
}
void BN_BLINDING_set_flags(BN_BLINDING *b, unsigned long flags)
{
b->flags = flags;
}
BN_BLINDING *BN_BLINDING_create_param(BN_BLINDING *b,
const BIGNUM *e, BIGNUM *m, BN_CTX *ctx,
int (*bn_mod_exp) (BIGNUM *r,
const BIGNUM *a,
const BIGNUM *p,
const BIGNUM *m,
BN_CTX *ctx,
BN_MONT_CTX *m_ctx),
BN_MONT_CTX *m_ctx)
{
int retry_counter = 32;
BN_BLINDING *ret = NULL;
if (b == NULL)
ret = BN_BLINDING_new(NULL, NULL, m);
else
ret = b;
if (ret == NULL)
goto err;
if (ret->A == NULL && (ret->A = BN_new()) == NULL)
goto err;
if (ret->Ai == NULL && (ret->Ai = BN_new()) == NULL)
goto err;
if (e != NULL) {
BN_free(ret->e);
ret->e = BN_dup(e);
}
if (ret->e == NULL)
goto err;
if (bn_mod_exp != NULL)
ret->bn_mod_exp = bn_mod_exp;
if (m_ctx != NULL)
ret->m_ctx = m_ctx;
do {
int rv;
if (!BN_priv_rand_range(ret->A, ret->mod))
goto err;
if (int_bn_mod_inverse(ret->Ai, ret->A, ret->mod, ctx, &rv))
break;
/*
* this should almost never happen for good RSA keys
*/
if (!rv)
goto err;
if (retry_counter-- == 0) {
BNerr(BN_F_BN_BLINDING_CREATE_PARAM, BN_R_TOO_MANY_ITERATIONS);
goto err;
}
} while (1);
if (ret->bn_mod_exp != NULL && ret->m_ctx != NULL) {
if (!ret->bn_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx, ret->m_ctx))
goto err;
} else {
if (!BN_mod_exp(ret->A, ret->A, ret->e, ret->mod, ctx))
goto err;
}
if (ret->m_ctx != NULL) {
if (!bn_to_mont_fixed_top(ret->Ai, ret->Ai, ret->m_ctx, ctx)
|| !bn_to_mont_fixed_top(ret->A, ret->A, ret->m_ctx, ctx))
goto err;
}
return ret;
err:
if (b == NULL) {
BN_BLINDING_free(ret);
ret = NULL;
}
return ret;
}

View file

@ -0,0 +1,553 @@
/*
* Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/bn.h>
/*-
* "First Oakley Default Group" from RFC2409, section 6.1.
*
* The prime is: 2^768 - 2 ^704 - 1 + 2^64 * { [2^638 pi] + 149686 }
*
* RFC2409 specifies a generator of 2.
* RFC2412 specifies a generator of of 22.
*/
BIGNUM *BN_get_rfc2409_prime_768(BIGNUM *bn)
{
static const unsigned char RFC2409_PRIME_768[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x3A, 0x36, 0x20,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC2409_PRIME_768, sizeof(RFC2409_PRIME_768), bn);
}
/*-
* "Second Oakley Default Group" from RFC2409, section 6.2.
*
* The prime is: 2^1024 - 2^960 - 1 + 2^64 * { [2^894 pi] + 129093 }.
*
* RFC2409 specifies a generator of 2.
* RFC2412 specifies a generator of 22.
*/
BIGNUM *BN_get_rfc2409_prime_1024(BIGNUM *bn)
{
static const unsigned char RFC2409_PRIME_1024[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE6, 0x53, 0x81,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC2409_PRIME_1024, sizeof(RFC2409_PRIME_1024), bn);
}
/*-
* "1536-bit MODP Group" from RFC3526, Section 2.
*
* The prime is: 2^1536 - 2^1472 - 1 + 2^64 * { [2^1406 pi] + 741804 }
*
* RFC3526 specifies a generator of 2.
* RFC2312 specifies a generator of 22.
*/
BIGNUM *BN_get_rfc3526_prime_1536(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_1536[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x23, 0x73, 0x27,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_1536, sizeof(RFC3526_PRIME_1536), bn);
}
/*-
* "2048-bit MODP Group" from RFC3526, Section 3.
*
* The prime is: 2^2048 - 2^1984 - 1 + 2^64 * { [2^1918 pi] + 124476 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *BN_get_rfc3526_prime_2048(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_2048[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAC, 0xAA, 0x68,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_2048, sizeof(RFC3526_PRIME_2048), bn);
}
/*-
* "3072-bit MODP Group" from RFC3526, Section 4.
*
* The prime is: 2^3072 - 2^3008 - 1 + 2^64 * { [2^2942 pi] + 1690314 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *BN_get_rfc3526_prime_3072(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_3072[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x3A, 0xD2, 0xCA,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_3072, sizeof(RFC3526_PRIME_3072), bn);
}
/*-
* "4096-bit MODP Group" from RFC3526, Section 5.
*
* The prime is: 2^4096 - 2^4032 - 1 + 2^64 * { [2^3966 pi] + 240904 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *BN_get_rfc3526_prime_4096(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_4096[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x06, 0x31, 0x99,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_4096, sizeof(RFC3526_PRIME_4096), bn);
}
/*-
* "6144-bit MODP Group" from RFC3526, Section 6.
*
* The prime is: 2^6144 - 2^6080 - 1 + 2^64 * { [2^6014 pi] + 929484 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *BN_get_rfc3526_prime_6144(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_6144[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26,
0xC1, 0xD4, 0xDC, 0xB2, 0x60, 0x26, 0x46, 0xDE,
0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E,
0xE5, 0xDB, 0x38, 0x2F, 0x41, 0x30, 0x01, 0xAE,
0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18,
0xDA, 0x3E, 0xDB, 0xEB, 0xCF, 0x9B, 0x14, 0xED,
0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B,
0x33, 0x20, 0x51, 0x51, 0x2B, 0xD7, 0xAF, 0x42,
0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC,
0xF0, 0x32, 0xEA, 0x15, 0xD1, 0x72, 0x1D, 0x03,
0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82,
0xB5, 0xA8, 0x40, 0x31, 0x90, 0x0B, 0x1C, 0x9E,
0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE,
0x0F, 0x1D, 0x45, 0xB7, 0xFF, 0x58, 0x5A, 0xC5,
0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8,
0x14, 0xCC, 0x5E, 0xD2, 0x0F, 0x80, 0x37, 0xE0,
0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76,
0xF5, 0x50, 0xAA, 0x3D, 0x8A, 0x1F, 0xBF, 0xF0,
0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32,
0x38, 0x7F, 0xE8, 0xD7, 0x6E, 0x3C, 0x04, 0x68,
0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6,
0xE6, 0x94, 0xF9, 0x1E, 0x6D, 0xCC, 0x40, 0x24,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_6144, sizeof(RFC3526_PRIME_6144), bn);
}
/*-
* "8192-bit MODP Group" from RFC3526, Section 7.
*
* The prime is: 2^8192 - 2^8128 - 1 + 2^64 * { [2^8062 pi] + 4743158 }
*
* RFC3526 specifies a generator of 2.
*/
BIGNUM *BN_get_rfc3526_prime_8192(BIGNUM *bn)
{
static const unsigned char RFC3526_PRIME_8192[] = {
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xC9, 0x0F, 0xDA, 0xA2, 0x21, 0x68, 0xC2, 0x34,
0xC4, 0xC6, 0x62, 0x8B, 0x80, 0xDC, 0x1C, 0xD1,
0x29, 0x02, 0x4E, 0x08, 0x8A, 0x67, 0xCC, 0x74,
0x02, 0x0B, 0xBE, 0xA6, 0x3B, 0x13, 0x9B, 0x22,
0x51, 0x4A, 0x08, 0x79, 0x8E, 0x34, 0x04, 0xDD,
0xEF, 0x95, 0x19, 0xB3, 0xCD, 0x3A, 0x43, 0x1B,
0x30, 0x2B, 0x0A, 0x6D, 0xF2, 0x5F, 0x14, 0x37,
0x4F, 0xE1, 0x35, 0x6D, 0x6D, 0x51, 0xC2, 0x45,
0xE4, 0x85, 0xB5, 0x76, 0x62, 0x5E, 0x7E, 0xC6,
0xF4, 0x4C, 0x42, 0xE9, 0xA6, 0x37, 0xED, 0x6B,
0x0B, 0xFF, 0x5C, 0xB6, 0xF4, 0x06, 0xB7, 0xED,
0xEE, 0x38, 0x6B, 0xFB, 0x5A, 0x89, 0x9F, 0xA5,
0xAE, 0x9F, 0x24, 0x11, 0x7C, 0x4B, 0x1F, 0xE6,
0x49, 0x28, 0x66, 0x51, 0xEC, 0xE4, 0x5B, 0x3D,
0xC2, 0x00, 0x7C, 0xB8, 0xA1, 0x63, 0xBF, 0x05,
0x98, 0xDA, 0x48, 0x36, 0x1C, 0x55, 0xD3, 0x9A,
0x69, 0x16, 0x3F, 0xA8, 0xFD, 0x24, 0xCF, 0x5F,
0x83, 0x65, 0x5D, 0x23, 0xDC, 0xA3, 0xAD, 0x96,
0x1C, 0x62, 0xF3, 0x56, 0x20, 0x85, 0x52, 0xBB,
0x9E, 0xD5, 0x29, 0x07, 0x70, 0x96, 0x96, 0x6D,
0x67, 0x0C, 0x35, 0x4E, 0x4A, 0xBC, 0x98, 0x04,
0xF1, 0x74, 0x6C, 0x08, 0xCA, 0x18, 0x21, 0x7C,
0x32, 0x90, 0x5E, 0x46, 0x2E, 0x36, 0xCE, 0x3B,
0xE3, 0x9E, 0x77, 0x2C, 0x18, 0x0E, 0x86, 0x03,
0x9B, 0x27, 0x83, 0xA2, 0xEC, 0x07, 0xA2, 0x8F,
0xB5, 0xC5, 0x5D, 0xF0, 0x6F, 0x4C, 0x52, 0xC9,
0xDE, 0x2B, 0xCB, 0xF6, 0x95, 0x58, 0x17, 0x18,
0x39, 0x95, 0x49, 0x7C, 0xEA, 0x95, 0x6A, 0xE5,
0x15, 0xD2, 0x26, 0x18, 0x98, 0xFA, 0x05, 0x10,
0x15, 0x72, 0x8E, 0x5A, 0x8A, 0xAA, 0xC4, 0x2D,
0xAD, 0x33, 0x17, 0x0D, 0x04, 0x50, 0x7A, 0x33,
0xA8, 0x55, 0x21, 0xAB, 0xDF, 0x1C, 0xBA, 0x64,
0xEC, 0xFB, 0x85, 0x04, 0x58, 0xDB, 0xEF, 0x0A,
0x8A, 0xEA, 0x71, 0x57, 0x5D, 0x06, 0x0C, 0x7D,
0xB3, 0x97, 0x0F, 0x85, 0xA6, 0xE1, 0xE4, 0xC7,
0xAB, 0xF5, 0xAE, 0x8C, 0xDB, 0x09, 0x33, 0xD7,
0x1E, 0x8C, 0x94, 0xE0, 0x4A, 0x25, 0x61, 0x9D,
0xCE, 0xE3, 0xD2, 0x26, 0x1A, 0xD2, 0xEE, 0x6B,
0xF1, 0x2F, 0xFA, 0x06, 0xD9, 0x8A, 0x08, 0x64,
0xD8, 0x76, 0x02, 0x73, 0x3E, 0xC8, 0x6A, 0x64,
0x52, 0x1F, 0x2B, 0x18, 0x17, 0x7B, 0x20, 0x0C,
0xBB, 0xE1, 0x17, 0x57, 0x7A, 0x61, 0x5D, 0x6C,
0x77, 0x09, 0x88, 0xC0, 0xBA, 0xD9, 0x46, 0xE2,
0x08, 0xE2, 0x4F, 0xA0, 0x74, 0xE5, 0xAB, 0x31,
0x43, 0xDB, 0x5B, 0xFC, 0xE0, 0xFD, 0x10, 0x8E,
0x4B, 0x82, 0xD1, 0x20, 0xA9, 0x21, 0x08, 0x01,
0x1A, 0x72, 0x3C, 0x12, 0xA7, 0x87, 0xE6, 0xD7,
0x88, 0x71, 0x9A, 0x10, 0xBD, 0xBA, 0x5B, 0x26,
0x99, 0xC3, 0x27, 0x18, 0x6A, 0xF4, 0xE2, 0x3C,
0x1A, 0x94, 0x68, 0x34, 0xB6, 0x15, 0x0B, 0xDA,
0x25, 0x83, 0xE9, 0xCA, 0x2A, 0xD4, 0x4C, 0xE8,
0xDB, 0xBB, 0xC2, 0xDB, 0x04, 0xDE, 0x8E, 0xF9,
0x2E, 0x8E, 0xFC, 0x14, 0x1F, 0xBE, 0xCA, 0xA6,
0x28, 0x7C, 0x59, 0x47, 0x4E, 0x6B, 0xC0, 0x5D,
0x99, 0xB2, 0x96, 0x4F, 0xA0, 0x90, 0xC3, 0xA2,
0x23, 0x3B, 0xA1, 0x86, 0x51, 0x5B, 0xE7, 0xED,
0x1F, 0x61, 0x29, 0x70, 0xCE, 0xE2, 0xD7, 0xAF,
0xB8, 0x1B, 0xDD, 0x76, 0x21, 0x70, 0x48, 0x1C,
0xD0, 0x06, 0x91, 0x27, 0xD5, 0xB0, 0x5A, 0xA9,
0x93, 0xB4, 0xEA, 0x98, 0x8D, 0x8F, 0xDD, 0xC1,
0x86, 0xFF, 0xB7, 0xDC, 0x90, 0xA6, 0xC0, 0x8F,
0x4D, 0xF4, 0x35, 0xC9, 0x34, 0x02, 0x84, 0x92,
0x36, 0xC3, 0xFA, 0xB4, 0xD2, 0x7C, 0x70, 0x26,
0xC1, 0xD4, 0xDC, 0xB2, 0x60, 0x26, 0x46, 0xDE,
0xC9, 0x75, 0x1E, 0x76, 0x3D, 0xBA, 0x37, 0xBD,
0xF8, 0xFF, 0x94, 0x06, 0xAD, 0x9E, 0x53, 0x0E,
0xE5, 0xDB, 0x38, 0x2F, 0x41, 0x30, 0x01, 0xAE,
0xB0, 0x6A, 0x53, 0xED, 0x90, 0x27, 0xD8, 0x31,
0x17, 0x97, 0x27, 0xB0, 0x86, 0x5A, 0x89, 0x18,
0xDA, 0x3E, 0xDB, 0xEB, 0xCF, 0x9B, 0x14, 0xED,
0x44, 0xCE, 0x6C, 0xBA, 0xCE, 0xD4, 0xBB, 0x1B,
0xDB, 0x7F, 0x14, 0x47, 0xE6, 0xCC, 0x25, 0x4B,
0x33, 0x20, 0x51, 0x51, 0x2B, 0xD7, 0xAF, 0x42,
0x6F, 0xB8, 0xF4, 0x01, 0x37, 0x8C, 0xD2, 0xBF,
0x59, 0x83, 0xCA, 0x01, 0xC6, 0x4B, 0x92, 0xEC,
0xF0, 0x32, 0xEA, 0x15, 0xD1, 0x72, 0x1D, 0x03,
0xF4, 0x82, 0xD7, 0xCE, 0x6E, 0x74, 0xFE, 0xF6,
0xD5, 0x5E, 0x70, 0x2F, 0x46, 0x98, 0x0C, 0x82,
0xB5, 0xA8, 0x40, 0x31, 0x90, 0x0B, 0x1C, 0x9E,
0x59, 0xE7, 0xC9, 0x7F, 0xBE, 0xC7, 0xE8, 0xF3,
0x23, 0xA9, 0x7A, 0x7E, 0x36, 0xCC, 0x88, 0xBE,
0x0F, 0x1D, 0x45, 0xB7, 0xFF, 0x58, 0x5A, 0xC5,
0x4B, 0xD4, 0x07, 0xB2, 0x2B, 0x41, 0x54, 0xAA,
0xCC, 0x8F, 0x6D, 0x7E, 0xBF, 0x48, 0xE1, 0xD8,
0x14, 0xCC, 0x5E, 0xD2, 0x0F, 0x80, 0x37, 0xE0,
0xA7, 0x97, 0x15, 0xEE, 0xF2, 0x9B, 0xE3, 0x28,
0x06, 0xA1, 0xD5, 0x8B, 0xB7, 0xC5, 0xDA, 0x76,
0xF5, 0x50, 0xAA, 0x3D, 0x8A, 0x1F, 0xBF, 0xF0,
0xEB, 0x19, 0xCC, 0xB1, 0xA3, 0x13, 0xD5, 0x5C,
0xDA, 0x56, 0xC9, 0xEC, 0x2E, 0xF2, 0x96, 0x32,
0x38, 0x7F, 0xE8, 0xD7, 0x6E, 0x3C, 0x04, 0x68,
0x04, 0x3E, 0x8F, 0x66, 0x3F, 0x48, 0x60, 0xEE,
0x12, 0xBF, 0x2D, 0x5B, 0x0B, 0x74, 0x74, 0xD6,
0xE6, 0x94, 0xF9, 0x1E, 0x6D, 0xBE, 0x11, 0x59,
0x74, 0xA3, 0x92, 0x6F, 0x12, 0xFE, 0xE5, 0xE4,
0x38, 0x77, 0x7C, 0xB6, 0xA9, 0x32, 0xDF, 0x8C,
0xD8, 0xBE, 0xC4, 0xD0, 0x73, 0xB9, 0x31, 0xBA,
0x3B, 0xC8, 0x32, 0xB6, 0x8D, 0x9D, 0xD3, 0x00,
0x74, 0x1F, 0xA7, 0xBF, 0x8A, 0xFC, 0x47, 0xED,
0x25, 0x76, 0xF6, 0x93, 0x6B, 0xA4, 0x24, 0x66,
0x3A, 0xAB, 0x63, 0x9C, 0x5A, 0xE4, 0xF5, 0x68,
0x34, 0x23, 0xB4, 0x74, 0x2B, 0xF1, 0xC9, 0x78,
0x23, 0x8F, 0x16, 0xCB, 0xE3, 0x9D, 0x65, 0x2D,
0xE3, 0xFD, 0xB8, 0xBE, 0xFC, 0x84, 0x8A, 0xD9,
0x22, 0x22, 0x2E, 0x04, 0xA4, 0x03, 0x7C, 0x07,
0x13, 0xEB, 0x57, 0xA8, 0x1A, 0x23, 0xF0, 0xC7,
0x34, 0x73, 0xFC, 0x64, 0x6C, 0xEA, 0x30, 0x6B,
0x4B, 0xCB, 0xC8, 0x86, 0x2F, 0x83, 0x85, 0xDD,
0xFA, 0x9D, 0x4B, 0x7F, 0xA2, 0xC0, 0x87, 0xE8,
0x79, 0x68, 0x33, 0x03, 0xED, 0x5B, 0xDD, 0x3A,
0x06, 0x2B, 0x3C, 0xF5, 0xB3, 0xA2, 0x78, 0xA6,
0x6D, 0x2A, 0x13, 0xF8, 0x3F, 0x44, 0xF8, 0x2D,
0xDF, 0x31, 0x0E, 0xE0, 0x74, 0xAB, 0x6A, 0x36,
0x45, 0x97, 0xE8, 0x99, 0xA0, 0x25, 0x5D, 0xC1,
0x64, 0xF3, 0x1C, 0xC5, 0x08, 0x46, 0x85, 0x1D,
0xF9, 0xAB, 0x48, 0x19, 0x5D, 0xED, 0x7E, 0xA1,
0xB1, 0xD5, 0x10, 0xBD, 0x7E, 0xE7, 0x4D, 0x73,
0xFA, 0xF3, 0x6B, 0xC3, 0x1E, 0xCF, 0xA2, 0x68,
0x35, 0x90, 0x46, 0xF4, 0xEB, 0x87, 0x9F, 0x92,
0x40, 0x09, 0x43, 0x8B, 0x48, 0x1C, 0x6C, 0xD7,
0x88, 0x9A, 0x00, 0x2E, 0xD5, 0xEE, 0x38, 0x2B,
0xC9, 0x19, 0x0D, 0xA6, 0xFC, 0x02, 0x6E, 0x47,
0x95, 0x58, 0xE4, 0x47, 0x56, 0x77, 0xE9, 0xAA,
0x9E, 0x30, 0x50, 0xE2, 0x76, 0x56, 0x94, 0xDF,
0xC8, 0x1F, 0x56, 0xE8, 0x80, 0xB9, 0x6E, 0x71,
0x60, 0xC9, 0x80, 0xDD, 0x98, 0xED, 0xD3, 0xDF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};
return BN_bin2bn(RFC3526_PRIME_8192, sizeof(RFC3526_PRIME_8192), bn);
}

View file

@ -0,0 +1,361 @@
/*
* Copyright 2000-2019 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
/*-
* TODO list
*
* 1. Check a bunch of "(words+1)" type hacks in various bignum functions and
* check they can be safely removed.
* - Check +1 and other ugliness in BN_from_montgomery()
*
* 2. Consider allowing a BN_new_ex() that, at least, lets you specify an
* appropriate 'block' size that will be honoured by bn_expand_internal() to
* prevent piddly little reallocations. OTOH, profiling bignum expansions in
* BN_CTX doesn't show this to be a big issue.
*/
/* How many bignums are in each "pool item"; */
#define BN_CTX_POOL_SIZE 16
/* The stack frame info is resizing, set a first-time expansion size; */
#define BN_CTX_START_FRAMES 32
/***********/
/* BN_POOL */
/***********/
/* A bundle of bignums that can be linked with other bundles */
typedef struct bignum_pool_item {
/* The bignum values */
BIGNUM vals[BN_CTX_POOL_SIZE];
/* Linked-list admin */
struct bignum_pool_item *prev, *next;
} BN_POOL_ITEM;
/* A linked-list of bignums grouped in bundles */
typedef struct bignum_pool {
/* Linked-list admin */
BN_POOL_ITEM *head, *current, *tail;
/* Stack depth and allocation size */
unsigned used, size;
} BN_POOL;
static void BN_POOL_init(BN_POOL *);
static void BN_POOL_finish(BN_POOL *);
static BIGNUM *BN_POOL_get(BN_POOL *, int);
static void BN_POOL_release(BN_POOL *, unsigned int);
/************/
/* BN_STACK */
/************/
/* A wrapper to manage the "stack frames" */
typedef struct bignum_ctx_stack {
/* Array of indexes into the bignum stack */
unsigned int *indexes;
/* Number of stack frames, and the size of the allocated array */
unsigned int depth, size;
} BN_STACK;
static void BN_STACK_init(BN_STACK *);
static void BN_STACK_finish(BN_STACK *);
static int BN_STACK_push(BN_STACK *, unsigned int);
static unsigned int BN_STACK_pop(BN_STACK *);
/**********/
/* BN_CTX */
/**********/
/* The opaque BN_CTX type */
struct bignum_ctx {
/* The bignum bundles */
BN_POOL pool;
/* The "stack frames", if you will */
BN_STACK stack;
/* The number of bignums currently assigned */
unsigned int used;
/* Depth of stack overflow */
int err_stack;
/* Block "gets" until an "end" (compatibility behaviour) */
int too_many;
/* Flags. */
int flags;
};
/* Enable this to find BN_CTX bugs */
#ifdef BN_CTX_DEBUG
static const char *ctxdbg_cur = NULL;
static void ctxdbg(BN_CTX *ctx)
{
unsigned int bnidx = 0, fpidx = 0;
BN_POOL_ITEM *item = ctx->pool.head;
BN_STACK *stack = &ctx->stack;
fprintf(stderr, "(%16p): ", ctx);
while (bnidx < ctx->used) {
fprintf(stderr, "%03x ", item->vals[bnidx++ % BN_CTX_POOL_SIZE].dmax);
if (!(bnidx % BN_CTX_POOL_SIZE))
item = item->next;
}
fprintf(stderr, "\n");
bnidx = 0;
fprintf(stderr, " : ");
while (fpidx < stack->depth) {
while (bnidx++ < stack->indexes[fpidx])
fprintf(stderr, " ");
fprintf(stderr, "^^^ ");
bnidx++;
fpidx++;
}
fprintf(stderr, "\n");
}
# define CTXDBG_ENTRY(str, ctx) do { \
ctxdbg_cur = (str); \
fprintf(stderr,"Starting %s\n", ctxdbg_cur); \
ctxdbg(ctx); \
} while(0)
# define CTXDBG_EXIT(ctx) do { \
fprintf(stderr,"Ending %s\n", ctxdbg_cur); \
ctxdbg(ctx); \
} while(0)
# define CTXDBG_RET(ctx,ret)
#else
# define CTXDBG_ENTRY(str, ctx)
# define CTXDBG_EXIT(ctx)
# define CTXDBG_RET(ctx,ret)
#endif
BN_CTX *BN_CTX_new(void)
{
BN_CTX *ret;
if ((ret = OPENSSL_zalloc(sizeof(*ret))) == NULL) {
BNerr(BN_F_BN_CTX_NEW, ERR_R_MALLOC_FAILURE);
return NULL;
}
/* Initialise the structure */
BN_POOL_init(&ret->pool);
BN_STACK_init(&ret->stack);
return ret;
}
BN_CTX *BN_CTX_secure_new(void)
{
BN_CTX *ret = BN_CTX_new();
if (ret != NULL)
ret->flags = BN_FLG_SECURE;
return ret;
}
void BN_CTX_free(BN_CTX *ctx)
{
if (ctx == NULL)
return;
#ifdef BN_CTX_DEBUG
{
BN_POOL_ITEM *pool = ctx->pool.head;
fprintf(stderr, "BN_CTX_free, stack-size=%d, pool-bignums=%d\n",
ctx->stack.size, ctx->pool.size);
fprintf(stderr, "dmaxs: ");
while (pool) {
unsigned loop = 0;
while (loop < BN_CTX_POOL_SIZE)
fprintf(stderr, "%02x ", pool->vals[loop++].dmax);
pool = pool->next;
}
fprintf(stderr, "\n");
}
#endif
BN_STACK_finish(&ctx->stack);
BN_POOL_finish(&ctx->pool);
OPENSSL_free(ctx);
}
void BN_CTX_start(BN_CTX *ctx)
{
CTXDBG_ENTRY("BN_CTX_start", ctx);
/* If we're already overflowing ... */
if (ctx->err_stack || ctx->too_many)
ctx->err_stack++;
/* (Try to) get a new frame pointer */
else if (!BN_STACK_push(&ctx->stack, ctx->used)) {
BNerr(BN_F_BN_CTX_START, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
ctx->err_stack++;
}
CTXDBG_EXIT(ctx);
}
void BN_CTX_end(BN_CTX *ctx)
{
CTXDBG_ENTRY("BN_CTX_end", ctx);
if (ctx->err_stack)
ctx->err_stack--;
else {
unsigned int fp = BN_STACK_pop(&ctx->stack);
/* Does this stack frame have anything to release? */
if (fp < ctx->used)
BN_POOL_release(&ctx->pool, ctx->used - fp);
ctx->used = fp;
/* Unjam "too_many" in case "get" had failed */
ctx->too_many = 0;
}
CTXDBG_EXIT(ctx);
}
BIGNUM *BN_CTX_get(BN_CTX *ctx)
{
BIGNUM *ret;
CTXDBG_ENTRY("BN_CTX_get", ctx);
if (ctx->err_stack || ctx->too_many)
return NULL;
if ((ret = BN_POOL_get(&ctx->pool, ctx->flags)) == NULL) {
/*
* Setting too_many prevents repeated "get" attempts from cluttering
* the error stack.
*/
ctx->too_many = 1;
BNerr(BN_F_BN_CTX_GET, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
return NULL;
}
/* OK, make sure the returned bignum is "zero" */
BN_zero(ret);
/* clear BN_FLG_CONSTTIME if leaked from previous frames */
ret->flags &= (~BN_FLG_CONSTTIME);
ctx->used++;
CTXDBG_RET(ctx, ret);
return ret;
}
/************/
/* BN_STACK */
/************/
static void BN_STACK_init(BN_STACK *st)
{
st->indexes = NULL;
st->depth = st->size = 0;
}
static void BN_STACK_finish(BN_STACK *st)
{
OPENSSL_free(st->indexes);
st->indexes = NULL;
}
static int BN_STACK_push(BN_STACK *st, unsigned int idx)
{
if (st->depth == st->size) {
/* Need to expand */
unsigned int newsize =
st->size ? (st->size * 3 / 2) : BN_CTX_START_FRAMES;
unsigned int *newitems;
if ((newitems = OPENSSL_malloc(sizeof(*newitems) * newsize)) == NULL) {
BNerr(BN_F_BN_STACK_PUSH, ERR_R_MALLOC_FAILURE);
return 0;
}
if (st->depth)
memcpy(newitems, st->indexes, sizeof(*newitems) * st->depth);
OPENSSL_free(st->indexes);
st->indexes = newitems;
st->size = newsize;
}
st->indexes[(st->depth)++] = idx;
return 1;
}
static unsigned int BN_STACK_pop(BN_STACK *st)
{
return st->indexes[--(st->depth)];
}
/***********/
/* BN_POOL */
/***********/
static void BN_POOL_init(BN_POOL *p)
{
p->head = p->current = p->tail = NULL;
p->used = p->size = 0;
}
static void BN_POOL_finish(BN_POOL *p)
{
unsigned int loop;
BIGNUM *bn;
while (p->head) {
for (loop = 0, bn = p->head->vals; loop++ < BN_CTX_POOL_SIZE; bn++)
if (bn->d)
BN_clear_free(bn);
p->current = p->head->next;
OPENSSL_free(p->head);
p->head = p->current;
}
}
static BIGNUM *BN_POOL_get(BN_POOL *p, int flag)
{
BIGNUM *bn;
unsigned int loop;
/* Full; allocate a new pool item and link it in. */
if (p->used == p->size) {
BN_POOL_ITEM *item;
if ((item = OPENSSL_malloc(sizeof(*item))) == NULL) {
BNerr(BN_F_BN_POOL_GET, ERR_R_MALLOC_FAILURE);
return NULL;
}
for (loop = 0, bn = item->vals; loop++ < BN_CTX_POOL_SIZE; bn++) {
bn_init(bn);
if ((flag & BN_FLG_SECURE) != 0)
BN_set_flags(bn, BN_FLG_SECURE);
}
item->prev = p->tail;
item->next = NULL;
if (p->head == NULL)
p->head = p->current = p->tail = item;
else {
p->tail->next = item;
p->tail = item;
p->current = item;
}
p->size += BN_CTX_POOL_SIZE;
p->used++;
/* Return the first bignum from the new pool */
return item->vals;
}
if (!p->used)
p->current = p->head;
else if ((p->used % BN_CTX_POOL_SIZE) == 0)
p->current = p->current->next;
return p->current->vals + ((p->used++) % BN_CTX_POOL_SIZE);
}
static void BN_POOL_release(BN_POOL *p, unsigned int num)
{
unsigned int offset = (p->used - 1) % BN_CTX_POOL_SIZE;
p->used -= num;
while (num--) {
bn_check_top(p->current->vals + offset);
if (offset == 0) {
offset = BN_CTX_POOL_SIZE - 1;
p->current = p->current->prev;
} else
offset--;
}
}

View file

@ -0,0 +1,68 @@
/*
* Copyright 2002-2019 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/*
* Support for deprecated functions goes here - static linkage will only
* slurp this code if applications are using them directly.
*/
#include <openssl/opensslconf.h>
#if OPENSSL_API_COMPAT >= 0x00908000L
NON_EMPTY_TRANSLATION_UNIT
#else
# include <stdio.h>
# include <time.h>
# include "internal/cryptlib.h"
# include "bn_lcl.h"
BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
const BIGNUM *add, const BIGNUM *rem,
void (*callback) (int, int, void *), void *cb_arg)
{
BN_GENCB cb;
BIGNUM *rnd = NULL;
BN_GENCB_set_old(&cb, callback, cb_arg);
if (ret == NULL) {
if ((rnd = BN_new()) == NULL)
goto err;
} else
rnd = ret;
if (!BN_generate_prime_ex(rnd, bits, safe, add, rem, &cb))
goto err;
/* we have a prime :-) */
return rnd;
err:
BN_free(rnd);
return NULL;
}
int BN_is_prime(const BIGNUM *a, int checks,
void (*callback) (int, int, void *), BN_CTX *ctx_passed,
void *cb_arg)
{
BN_GENCB cb;
BN_GENCB_set_old(&cb, callback, cb_arg);
return BN_is_prime_ex(a, checks, ctx_passed, &cb);
}
int BN_is_prime_fasttest(const BIGNUM *a, int checks,
void (*callback) (int, int, void *),
BN_CTX *ctx_passed, void *cb_arg,
int do_trial_division)
{
BN_GENCB cb;
BN_GENCB_set_old(&cb, callback, cb_arg);
return BN_is_prime_fasttest_ex(a, checks, ctx_passed,
do_trial_division, &cb);
}
#endif

View file

@ -0,0 +1,512 @@
/*
* Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "bn_lcl.h"
#include "internal/nelem.h"
#ifndef OPENSSL_NO_DH
#include <openssl/dh.h>
#include "internal/bn_dh.h"
/* DH parameters from RFC5114 */
# if BN_BITS2 == 64
static const BN_ULONG dh1024_160_p[] = {
0xDF1FB2BC2E4A4371ULL, 0xE68CFDA76D4DA708ULL, 0x45BF37DF365C1A65ULL,
0xA151AF5F0DC8B4BDULL, 0xFAA31A4FF55BCCC0ULL, 0x4EFFD6FAE5644738ULL,
0x98488E9C219A7372ULL, 0xACCBDD7D90C4BD70ULL, 0x24975C3CD49B83BFULL,
0x13ECB4AEA9061123ULL, 0x9838EF1E2EE652C0ULL, 0x6073E28675A23D18ULL,
0x9A6A9DCA52D23B61ULL, 0x52C99FBCFB06A3C6ULL, 0xDE92DE5EAE5D54ECULL,
0xB10B8F96A080E01DULL
};
static const BN_ULONG dh1024_160_g[] = {
0x855E6EEB22B3B2E5ULL, 0x858F4DCEF97C2A24ULL, 0x2D779D5918D08BC8ULL,
0xD662A4D18E73AFA3ULL, 0x1DBF0A0169B6A28AULL, 0xA6A24C087A091F53ULL,
0x909D0D2263F80A76ULL, 0xD7FBD7D3B9A92EE1ULL, 0x5E91547F9E2749F4ULL,
0x160217B4B01B886AULL, 0x777E690F5504F213ULL, 0x266FEA1E5C41564BULL,
0xD6406CFF14266D31ULL, 0xF8104DD258AC507FULL, 0x6765A442EFB99905ULL,
0xA4D1CBD5C3FD3412ULL
};
static const BN_ULONG dh1024_160_q[] = {
0x64B7CB9D49462353ULL, 0x81A8DF278ABA4E7DULL, 0x00000000F518AA87ULL
};
static const BN_ULONG dh2048_224_p[] = {
0x0AC4DFFE0C10E64FULL, 0xCF9DE5384E71B81CULL, 0x7EF363E2FFA31F71ULL,
0xE3FB73C16B8E75B9ULL, 0xC9B53DCF4BA80A29ULL, 0x23F10B0E16E79763ULL,
0xC52172E413042E9BULL, 0xBE60E69CC928B2B9ULL, 0x80CD86A1B9E587E8ULL,
0x315D75E198C641A4ULL, 0xCDF93ACC44328387ULL, 0x15987D9ADC0A486DULL,
0x7310F7121FD5A074ULL, 0x278273C7DE31EFDCULL, 0x1602E714415D9330ULL,
0x81286130BC8985DBULL, 0xB3BF8A3170918836ULL, 0x6A00E0A0B9C49708ULL,
0xC6BA0B2C8BBC27BEULL, 0xC9F98D11ED34DBF6ULL, 0x7AD5B7D0B6C12207ULL,
0xD91E8FEF55B7394BULL, 0x9037C9EDEFDA4DF8ULL, 0x6D3F8152AD6AC212ULL,
0x1DE6B85A1274A0A6ULL, 0xEB3D688A309C180EULL, 0xAF9A3C407BA1DF15ULL,
0xE6FA141DF95A56DBULL, 0xB54B1597B61D0A75ULL, 0xA20D64E5683B9FD1ULL,
0xD660FAA79559C51FULL, 0xAD107E1E9123A9D0ULL
};
static const BN_ULONG dh2048_224_g[] = {
0x84B890D3191F2BFAULL, 0x81BC087F2A7065B3ULL, 0x19C418E1F6EC0179ULL,
0x7B5A0F1C71CFFF4CULL, 0xEDFE72FE9B6AA4BDULL, 0x81E1BCFE94B30269ULL,
0x566AFBB48D6C0191ULL, 0xB539CCE3409D13CDULL, 0x6AA21E7F5F2FF381ULL,
0xD9E263E4770589EFULL, 0x10E183EDD19963DDULL, 0xB70A8137150B8EEBULL,
0x051AE3D428C8F8ACULL, 0xBB77A86F0C1AB15BULL, 0x6E3025E316A330EFULL,
0x19529A45D6F83456ULL, 0xF180EB34118E98D1ULL, 0xB5F6C6B250717CBEULL,
0x09939D54DA7460CDULL, 0xE247150422EA1ED4ULL, 0xB8A762D0521BC98AULL,
0xF4D027275AC1348BULL, 0xC17669101999024AULL, 0xBE5E9001A8D66AD7ULL,
0xC57DB17C620A8652ULL, 0xAB739D7700C29F52ULL, 0xDD921F01A70C4AFAULL,
0xA6824A4E10B9A6F0ULL, 0x74866A08CFE4FFE3ULL, 0x6CDEBE7B89998CAFULL,
0x9DF30B5C8FFDAC50ULL, 0xAC4032EF4F2D9AE3ULL
};
static const BN_ULONG dh2048_224_q[] = {
0xBF389A99B36371EBULL, 0x1F80535A4738CEBCULL, 0xC58D93FE99717710ULL,
0x00000000801C0D34ULL
};
static const BN_ULONG dh2048_256_p[] = {
0xDB094AE91E1A1597ULL, 0x693877FAD7EF09CAULL, 0x6116D2276E11715FULL,
0xA4B54330C198AF12ULL, 0x75F26375D7014103ULL, 0xC3A3960A54E710C3ULL,
0xDED4010ABD0BE621ULL, 0xC0B857F689962856ULL, 0xB3CA3F7971506026ULL,
0x1CCACB83E6B486F6ULL, 0x67E144E514056425ULL, 0xF6A167B5A41825D9ULL,
0x3AD8347796524D8EULL, 0xF13C6D9A51BFA4ABULL, 0x2D52526735488A0EULL,
0xB63ACAE1CAA6B790ULL, 0x4FDB70C581B23F76ULL, 0xBC39A0BF12307F5CULL,
0xB941F54EB1E59BB8ULL, 0x6C5BFC11D45F9088ULL, 0x22E0B1EF4275BF7BULL,
0x91F9E6725B4758C0ULL, 0x5A8A9D306BCF67EDULL, 0x209E0C6497517ABDULL,
0x3BF4296D830E9A7CULL, 0x16C3D91134096FAAULL, 0xFAF7DF4561B2AA30ULL,
0xE00DF8F1D61957D4ULL, 0x5D2CEED4435E3B00ULL, 0x8CEEF608660DD0F2ULL,
0xFFBBD19C65195999ULL, 0x87A8E61DB4B6663CULL
};
static const BN_ULONG dh2048_256_g[] = {
0x664B4C0F6CC41659ULL, 0x5E2327CFEF98C582ULL, 0xD647D148D4795451ULL,
0x2F63078490F00EF8ULL, 0x184B523D1DB246C3ULL, 0xC7891428CDC67EB6ULL,
0x7FD028370DF92B52ULL, 0xB3353BBB64E0EC37ULL, 0xECD06E1557CD0915ULL,
0xB7D2BBD2DF016199ULL, 0xC8484B1E052588B9ULL, 0xDB2A3B7313D3FE14ULL,
0xD052B985D182EA0AULL, 0xA4BD1BFFE83B9C80ULL, 0xDFC967C1FB3F2E55ULL,
0xB5045AF2767164E1ULL, 0x1D14348F6F2F9193ULL, 0x64E67982428EBC83ULL,
0x8AC376D282D6ED38ULL, 0x777DE62AAAB8A862ULL, 0xDDF463E5E9EC144BULL,
0x0196F931C77A57F2ULL, 0xA55AE31341000A65ULL, 0x901228F8C28CBB18ULL,
0xBC3773BF7E8C6F62ULL, 0xBE3A6C1B0C6B47B1ULL, 0xFF4FED4AAC0BB555ULL,
0x10DBC15077BE463FULL, 0x07F4793A1A0BA125ULL, 0x4CA7B18F21EF2054ULL,
0x2E77506660EDBD48ULL, 0x3FB32C9B73134D0BULL
};
static const BN_ULONG dh2048_256_q[] = {
0xA308B0FE64F5FBD3ULL, 0x99B1A47D1EB3750BULL, 0xB447997640129DA2ULL,
0x8CF83642A709A097ULL
};
/* Primes from RFC 7919 */
static const BN_ULONG ffdhe2048_p[] = {
0xFFFFFFFFFFFFFFFFULL, 0x886B423861285C97ULL, 0xC6F34A26C1B2EFFAULL,
0xC58EF1837D1683B2ULL, 0x3BB5FCBC2EC22005ULL, 0xC3FE3B1B4C6FAD73ULL,
0x8E4F1232EEF28183ULL, 0x9172FE9CE98583FFULL, 0xC03404CD28342F61ULL,
0x9E02FCE1CDF7E2ECULL, 0x0B07A7C8EE0A6D70ULL, 0xAE56EDE76372BB19ULL,
0x1D4F42A3DE394DF4ULL, 0xB96ADAB760D7F468ULL, 0xD108A94BB2C8E3FBULL,
0xBC0AB182B324FB61ULL, 0x30ACCA4F483A797AULL, 0x1DF158A136ADE735ULL,
0xE2A689DAF3EFE872ULL, 0x984F0C70E0E68B77ULL, 0xB557135E7F57C935ULL,
0x856365553DED1AF3ULL, 0x2433F51F5F066ED0ULL, 0xD3DF1ED5D5FD6561ULL,
0xF681B202AEC4617AULL, 0x7D2FE363630C75D8ULL, 0xCC939DCE249B3EF9ULL,
0xA9E13641146433FBULL, 0xD8B9C583CE2D3695ULL, 0xAFDC5620273D3CF1ULL,
0xADF85458A2BB4A9AULL, 0xFFFFFFFFFFFFFFFFULL
};
static const BN_ULONG ffdhe3072_p[] = {
0xFFFFFFFFFFFFFFFFULL, 0x25E41D2B66C62E37ULL, 0x3C1B20EE3FD59D7CULL,
0x0ABCD06BFA53DDEFULL, 0x1DBF9A42D5C4484EULL, 0xABC521979B0DEADAULL,
0xE86D2BC522363A0DULL, 0x5CAE82AB9C9DF69EULL, 0x64F2E21E71F54BFFULL,
0xF4FD4452E2D74DD3ULL, 0xB4130C93BC437944ULL, 0xAEFE130985139270ULL,
0x598CB0FAC186D91CULL, 0x7AD91D2691F7F7EEULL, 0x61B46FC9D6E6C907ULL,
0xBC34F4DEF99C0238ULL, 0xDE355B3B6519035BULL, 0x886B4238611FCFDCULL,
0xC6F34A26C1B2EFFAULL, 0xC58EF1837D1683B2ULL, 0x3BB5FCBC2EC22005ULL,
0xC3FE3B1B4C6FAD73ULL, 0x8E4F1232EEF28183ULL, 0x9172FE9CE98583FFULL,
0xC03404CD28342F61ULL, 0x9E02FCE1CDF7E2ECULL, 0x0B07A7C8EE0A6D70ULL,
0xAE56EDE76372BB19ULL, 0x1D4F42A3DE394DF4ULL, 0xB96ADAB760D7F468ULL,
0xD108A94BB2C8E3FBULL, 0xBC0AB182B324FB61ULL, 0x30ACCA4F483A797AULL,
0x1DF158A136ADE735ULL, 0xE2A689DAF3EFE872ULL, 0x984F0C70E0E68B77ULL,
0xB557135E7F57C935ULL, 0x856365553DED1AF3ULL, 0x2433F51F5F066ED0ULL,
0xD3DF1ED5D5FD6561ULL, 0xF681B202AEC4617AULL, 0x7D2FE363630C75D8ULL,
0xCC939DCE249B3EF9ULL, 0xA9E13641146433FBULL, 0xD8B9C583CE2D3695ULL,
0xAFDC5620273D3CF1ULL, 0xADF85458A2BB4A9AULL, 0xFFFFFFFFFFFFFFFFULL
};
static const BN_ULONG ffdhe4096_p[] = {
0xFFFFFFFFFFFFFFFFULL, 0xC68A007E5E655F6AULL, 0x4DB5A851F44182E1ULL,
0x8EC9B55A7F88A46BULL, 0x0A8291CDCEC97DCFULL, 0x2A4ECEA9F98D0ACCULL,
0x1A1DB93D7140003CULL, 0x092999A333CB8B7AULL, 0x6DC778F971AD0038ULL,
0xA907600A918130C4ULL, 0xED6A1E012D9E6832ULL, 0x7135C886EFB4318AULL,
0x87F55BA57E31CC7AULL, 0x7763CF1D55034004ULL, 0xAC7D5F42D69F6D18ULL,
0x7930E9E4E58857B6ULL, 0x6E6F52C3164DF4FBULL, 0x25E41D2B669E1EF1ULL,
0x3C1B20EE3FD59D7CULL, 0x0ABCD06BFA53DDEFULL, 0x1DBF9A42D5C4484EULL,
0xABC521979B0DEADAULL, 0xE86D2BC522363A0DULL, 0x5CAE82AB9C9DF69EULL,
0x64F2E21E71F54BFFULL, 0xF4FD4452E2D74DD3ULL, 0xB4130C93BC437944ULL,
0xAEFE130985139270ULL, 0x598CB0FAC186D91CULL, 0x7AD91D2691F7F7EEULL,
0x61B46FC9D6E6C907ULL, 0xBC34F4DEF99C0238ULL, 0xDE355B3B6519035BULL,
0x886B4238611FCFDCULL, 0xC6F34A26C1B2EFFAULL, 0xC58EF1837D1683B2ULL,
0x3BB5FCBC2EC22005ULL, 0xC3FE3B1B4C6FAD73ULL, 0x8E4F1232EEF28183ULL,
0x9172FE9CE98583FFULL, 0xC03404CD28342F61ULL, 0x9E02FCE1CDF7E2ECULL,
0x0B07A7C8EE0A6D70ULL, 0xAE56EDE76372BB19ULL, 0x1D4F42A3DE394DF4ULL,
0xB96ADAB760D7F468ULL, 0xD108A94BB2C8E3FBULL, 0xBC0AB182B324FB61ULL,
0x30ACCA4F483A797AULL, 0x1DF158A136ADE735ULL, 0xE2A689DAF3EFE872ULL,
0x984F0C70E0E68B77ULL, 0xB557135E7F57C935ULL, 0x856365553DED1AF3ULL,
0x2433F51F5F066ED0ULL, 0xD3DF1ED5D5FD6561ULL, 0xF681B202AEC4617AULL,
0x7D2FE363630C75D8ULL, 0xCC939DCE249B3EF9ULL, 0xA9E13641146433FBULL,
0xD8B9C583CE2D3695ULL, 0xAFDC5620273D3CF1ULL, 0xADF85458A2BB4A9AULL,
0xFFFFFFFFFFFFFFFFULL
};
static const BN_ULONG ffdhe6144_p[] = {
0xFFFFFFFFFFFFFFFFULL, 0xA40E329CD0E40E65ULL, 0xA41D570D7938DAD4ULL,
0x62A69526D43161C1ULL, 0x3FDD4A8E9ADB1E69ULL, 0x5B3B71F9DC6B80D6ULL,
0xEC9D1810C6272B04ULL, 0x8CCF2DD5CACEF403ULL, 0xE49F5235C95B9117ULL,
0x505DC82DB854338AULL, 0x62292C311562A846ULL, 0xD72B03746AE77F5EULL,
0xF9C9091B462D538CULL, 0x0AE8DB5847A67CBEULL, 0xB3A739C122611682ULL,
0xEEAAC0232A281BF6ULL, 0x94C6651E77CAF992ULL, 0x763E4E4B94B2BBC1ULL,
0x587E38DA0077D9B4ULL, 0x7FB29F8C183023C3ULL, 0x0ABEC1FFF9E3A26EULL,
0xA00EF092350511E3ULL, 0xB855322EDB6340D8ULL, 0xA52471F7A9A96910ULL,
0x388147FB4CFDB477ULL, 0x9B1F5C3E4E46041FULL, 0xCDAD0657FCCFEC71ULL,
0xB38E8C334C701C3AULL, 0x917BDD64B1C0FD4CULL, 0x3BB454329B7624C8ULL,
0x23BA4442CAF53EA6ULL, 0x4E677D2C38532A3AULL, 0x0BFD64B645036C7AULL,
0xC68A007E5E0DD902ULL, 0x4DB5A851F44182E1ULL, 0x8EC9B55A7F88A46BULL,
0x0A8291CDCEC97DCFULL, 0x2A4ECEA9F98D0ACCULL, 0x1A1DB93D7140003CULL,
0x092999A333CB8B7AULL, 0x6DC778F971AD0038ULL, 0xA907600A918130C4ULL,
0xED6A1E012D9E6832ULL, 0x7135C886EFB4318AULL, 0x87F55BA57E31CC7AULL,
0x7763CF1D55034004ULL, 0xAC7D5F42D69F6D18ULL, 0x7930E9E4E58857B6ULL,
0x6E6F52C3164DF4FBULL, 0x25E41D2B669E1EF1ULL, 0x3C1B20EE3FD59D7CULL,
0x0ABCD06BFA53DDEFULL, 0x1DBF9A42D5C4484EULL, 0xABC521979B0DEADAULL,
0xE86D2BC522363A0DULL, 0x5CAE82AB9C9DF69EULL, 0x64F2E21E71F54BFFULL,
0xF4FD4452E2D74DD3ULL, 0xB4130C93BC437944ULL, 0xAEFE130985139270ULL,
0x598CB0FAC186D91CULL, 0x7AD91D2691F7F7EEULL, 0x61B46FC9D6E6C907ULL,
0xBC34F4DEF99C0238ULL, 0xDE355B3B6519035BULL, 0x886B4238611FCFDCULL,
0xC6F34A26C1B2EFFAULL, 0xC58EF1837D1683B2ULL, 0x3BB5FCBC2EC22005ULL,
0xC3FE3B1B4C6FAD73ULL, 0x8E4F1232EEF28183ULL, 0x9172FE9CE98583FFULL,
0xC03404CD28342F61ULL, 0x9E02FCE1CDF7E2ECULL, 0x0B07A7C8EE0A6D70ULL,
0xAE56EDE76372BB19ULL, 0x1D4F42A3DE394DF4ULL, 0xB96ADAB760D7F468ULL,
0xD108A94BB2C8E3FBULL, 0xBC0AB182B324FB61ULL, 0x30ACCA4F483A797AULL,
0x1DF158A136ADE735ULL, 0xE2A689DAF3EFE872ULL, 0x984F0C70E0E68B77ULL,
0xB557135E7F57C935ULL, 0x856365553DED1AF3ULL, 0x2433F51F5F066ED0ULL,
0xD3DF1ED5D5FD6561ULL, 0xF681B202AEC4617AULL, 0x7D2FE363630C75D8ULL,
0xCC939DCE249B3EF9ULL, 0xA9E13641146433FBULL, 0xD8B9C583CE2D3695ULL,
0xAFDC5620273D3CF1ULL, 0xADF85458A2BB4A9AULL, 0xFFFFFFFFFFFFFFFFULL
};
static const BN_ULONG ffdhe8192_p[] = {
0xFFFFFFFFFFFFFFFFULL, 0xD68C8BB7C5C6424CULL, 0x011E2A94838FF88CULL,
0x0822E506A9F4614EULL, 0x97D11D49F7A8443DULL, 0xA6BBFDE530677F0DULL,
0x2F741EF8C1FE86FEULL, 0xFAFABE1C5D71A87EULL, 0xDED2FBABFBE58A30ULL,
0xB6855DFE72B0A66EULL, 0x1EFC8CE0BA8A4FE8ULL, 0x83F81D4A3F2FA457ULL,
0xA1FE3075A577E231ULL, 0xD5B8019488D9C0A0ULL, 0x624816CDAD9A95F9ULL,
0x99E9E31650C1217BULL, 0x51AA691E0E423CFCULL, 0x1C217E6C3826E52CULL,
0x51A8A93109703FEEULL, 0xBB7099876A460E74ULL, 0x541FC68C9C86B022ULL,
0x59160CC046FD8251ULL, 0x2846C0BA35C35F5CULL, 0x54504AC78B758282ULL,
0x29388839D2AF05E4ULL, 0xCB2C0F1CC01BD702ULL, 0x555B2F747C932665ULL,
0x86B63142A3AB8829ULL, 0x0B8CC3BDF64B10EFULL, 0x687FEB69EDD1CC5EULL,
0xFDB23FCEC9509D43ULL, 0x1E425A31D951AE64ULL, 0x36AD004CF600C838ULL,
0xA40E329CCFF46AAAULL, 0xA41D570D7938DAD4ULL, 0x62A69526D43161C1ULL,
0x3FDD4A8E9ADB1E69ULL, 0x5B3B71F9DC6B80D6ULL, 0xEC9D1810C6272B04ULL,
0x8CCF2DD5CACEF403ULL, 0xE49F5235C95B9117ULL, 0x505DC82DB854338AULL,
0x62292C311562A846ULL, 0xD72B03746AE77F5EULL, 0xF9C9091B462D538CULL,
0x0AE8DB5847A67CBEULL, 0xB3A739C122611682ULL, 0xEEAAC0232A281BF6ULL,
0x94C6651E77CAF992ULL, 0x763E4E4B94B2BBC1ULL, 0x587E38DA0077D9B4ULL,
0x7FB29F8C183023C3ULL, 0x0ABEC1FFF9E3A26EULL, 0xA00EF092350511E3ULL,
0xB855322EDB6340D8ULL, 0xA52471F7A9A96910ULL, 0x388147FB4CFDB477ULL,
0x9B1F5C3E4E46041FULL, 0xCDAD0657FCCFEC71ULL, 0xB38E8C334C701C3AULL,
0x917BDD64B1C0FD4CULL, 0x3BB454329B7624C8ULL, 0x23BA4442CAF53EA6ULL,
0x4E677D2C38532A3AULL, 0x0BFD64B645036C7AULL, 0xC68A007E5E0DD902ULL,
0x4DB5A851F44182E1ULL, 0x8EC9B55A7F88A46BULL, 0x0A8291CDCEC97DCFULL,
0x2A4ECEA9F98D0ACCULL, 0x1A1DB93D7140003CULL, 0x092999A333CB8B7AULL,
0x6DC778F971AD0038ULL, 0xA907600A918130C4ULL, 0xED6A1E012D9E6832ULL,
0x7135C886EFB4318AULL, 0x87F55BA57E31CC7AULL, 0x7763CF1D55034004ULL,
0xAC7D5F42D69F6D18ULL, 0x7930E9E4E58857B6ULL, 0x6E6F52C3164DF4FBULL,
0x25E41D2B669E1EF1ULL, 0x3C1B20EE3FD59D7CULL, 0x0ABCD06BFA53DDEFULL,
0x1DBF9A42D5C4484EULL, 0xABC521979B0DEADAULL, 0xE86D2BC522363A0DULL,
0x5CAE82AB9C9DF69EULL, 0x64F2E21E71F54BFFULL, 0xF4FD4452E2D74DD3ULL,
0xB4130C93BC437944ULL, 0xAEFE130985139270ULL, 0x598CB0FAC186D91CULL,
0x7AD91D2691F7F7EEULL, 0x61B46FC9D6E6C907ULL, 0xBC34F4DEF99C0238ULL,
0xDE355B3B6519035BULL, 0x886B4238611FCFDCULL, 0xC6F34A26C1B2EFFAULL,
0xC58EF1837D1683B2ULL, 0x3BB5FCBC2EC22005ULL, 0xC3FE3B1B4C6FAD73ULL,
0x8E4F1232EEF28183ULL, 0x9172FE9CE98583FFULL, 0xC03404CD28342F61ULL,
0x9E02FCE1CDF7E2ECULL, 0x0B07A7C8EE0A6D70ULL, 0xAE56EDE76372BB19ULL,
0x1D4F42A3DE394DF4ULL, 0xB96ADAB760D7F468ULL, 0xD108A94BB2C8E3FBULL,
0xBC0AB182B324FB61ULL, 0x30ACCA4F483A797AULL, 0x1DF158A136ADE735ULL,
0xE2A689DAF3EFE872ULL, 0x984F0C70E0E68B77ULL, 0xB557135E7F57C935ULL,
0x856365553DED1AF3ULL, 0x2433F51F5F066ED0ULL, 0xD3DF1ED5D5FD6561ULL,
0xF681B202AEC4617AULL, 0x7D2FE363630C75D8ULL, 0xCC939DCE249B3EF9ULL,
0xA9E13641146433FBULL, 0xD8B9C583CE2D3695ULL, 0xAFDC5620273D3CF1ULL,
0xADF85458A2BB4A9AULL, 0xFFFFFFFFFFFFFFFFULL
};
# elif BN_BITS2 == 32
static const BN_ULONG dh1024_160_p[] = {
0x2E4A4371, 0xDF1FB2BC, 0x6D4DA708, 0xE68CFDA7, 0x365C1A65, 0x45BF37DF,
0x0DC8B4BD, 0xA151AF5F, 0xF55BCCC0, 0xFAA31A4F, 0xE5644738, 0x4EFFD6FA,
0x219A7372, 0x98488E9C, 0x90C4BD70, 0xACCBDD7D, 0xD49B83BF, 0x24975C3C,
0xA9061123, 0x13ECB4AE, 0x2EE652C0, 0x9838EF1E, 0x75A23D18, 0x6073E286,
0x52D23B61, 0x9A6A9DCA, 0xFB06A3C6, 0x52C99FBC, 0xAE5D54EC, 0xDE92DE5E,
0xA080E01D, 0xB10B8F96
};
static const BN_ULONG dh1024_160_g[] = {
0x22B3B2E5, 0x855E6EEB, 0xF97C2A24, 0x858F4DCE, 0x18D08BC8, 0x2D779D59,
0x8E73AFA3, 0xD662A4D1, 0x69B6A28A, 0x1DBF0A01, 0x7A091F53, 0xA6A24C08,
0x63F80A76, 0x909D0D22, 0xB9A92EE1, 0xD7FBD7D3, 0x9E2749F4, 0x5E91547F,
0xB01B886A, 0x160217B4, 0x5504F213, 0x777E690F, 0x5C41564B, 0x266FEA1E,
0x14266D31, 0xD6406CFF, 0x58AC507F, 0xF8104DD2, 0xEFB99905, 0x6765A442,
0xC3FD3412, 0xA4D1CBD5
};
static const BN_ULONG dh1024_160_q[] = {
0x49462353, 0x64B7CB9D, 0x8ABA4E7D, 0x81A8DF27, 0xF518AA87
};
static const BN_ULONG dh2048_224_p[] = {
0x0C10E64F, 0x0AC4DFFE, 0x4E71B81C, 0xCF9DE538, 0xFFA31F71, 0x7EF363E2,
0x6B8E75B9, 0xE3FB73C1, 0x4BA80A29, 0xC9B53DCF, 0x16E79763, 0x23F10B0E,
0x13042E9B, 0xC52172E4, 0xC928B2B9, 0xBE60E69C, 0xB9E587E8, 0x80CD86A1,
0x98C641A4, 0x315D75E1, 0x44328387, 0xCDF93ACC, 0xDC0A486D, 0x15987D9A,
0x1FD5A074, 0x7310F712, 0xDE31EFDC, 0x278273C7, 0x415D9330, 0x1602E714,
0xBC8985DB, 0x81286130, 0x70918836, 0xB3BF8A31, 0xB9C49708, 0x6A00E0A0,
0x8BBC27BE, 0xC6BA0B2C, 0xED34DBF6, 0xC9F98D11, 0xB6C12207, 0x7AD5B7D0,
0x55B7394B, 0xD91E8FEF, 0xEFDA4DF8, 0x9037C9ED, 0xAD6AC212, 0x6D3F8152,
0x1274A0A6, 0x1DE6B85A, 0x309C180E, 0xEB3D688A, 0x7BA1DF15, 0xAF9A3C40,
0xF95A56DB, 0xE6FA141D, 0xB61D0A75, 0xB54B1597, 0x683B9FD1, 0xA20D64E5,
0x9559C51F, 0xD660FAA7, 0x9123A9D0, 0xAD107E1E
};
static const BN_ULONG dh2048_224_g[] = {
0x191F2BFA, 0x84B890D3, 0x2A7065B3, 0x81BC087F, 0xF6EC0179, 0x19C418E1,
0x71CFFF4C, 0x7B5A0F1C, 0x9B6AA4BD, 0xEDFE72FE, 0x94B30269, 0x81E1BCFE,
0x8D6C0191, 0x566AFBB4, 0x409D13CD, 0xB539CCE3, 0x5F2FF381, 0x6AA21E7F,
0x770589EF, 0xD9E263E4, 0xD19963DD, 0x10E183ED, 0x150B8EEB, 0xB70A8137,
0x28C8F8AC, 0x051AE3D4, 0x0C1AB15B, 0xBB77A86F, 0x16A330EF, 0x6E3025E3,
0xD6F83456, 0x19529A45, 0x118E98D1, 0xF180EB34, 0x50717CBE, 0xB5F6C6B2,
0xDA7460CD, 0x09939D54, 0x22EA1ED4, 0xE2471504, 0x521BC98A, 0xB8A762D0,
0x5AC1348B, 0xF4D02727, 0x1999024A, 0xC1766910, 0xA8D66AD7, 0xBE5E9001,
0x620A8652, 0xC57DB17C, 0x00C29F52, 0xAB739D77, 0xA70C4AFA, 0xDD921F01,
0x10B9A6F0, 0xA6824A4E, 0xCFE4FFE3, 0x74866A08, 0x89998CAF, 0x6CDEBE7B,
0x8FFDAC50, 0x9DF30B5C, 0x4F2D9AE3, 0xAC4032EF
};
static const BN_ULONG dh2048_224_q[] = {
0xB36371EB, 0xBF389A99, 0x4738CEBC, 0x1F80535A, 0x99717710, 0xC58D93FE,
0x801C0D34
};
static const BN_ULONG dh2048_256_p[] = {
0x1E1A1597, 0xDB094AE9, 0xD7EF09CA, 0x693877FA, 0x6E11715F, 0x6116D227,
0xC198AF12, 0xA4B54330, 0xD7014103, 0x75F26375, 0x54E710C3, 0xC3A3960A,
0xBD0BE621, 0xDED4010A, 0x89962856, 0xC0B857F6, 0x71506026, 0xB3CA3F79,
0xE6B486F6, 0x1CCACB83, 0x14056425, 0x67E144E5, 0xA41825D9, 0xF6A167B5,
0x96524D8E, 0x3AD83477, 0x51BFA4AB, 0xF13C6D9A, 0x35488A0E, 0x2D525267,
0xCAA6B790, 0xB63ACAE1, 0x81B23F76, 0x4FDB70C5, 0x12307F5C, 0xBC39A0BF,
0xB1E59BB8, 0xB941F54E, 0xD45F9088, 0x6C5BFC11, 0x4275BF7B, 0x22E0B1EF,
0x5B4758C0, 0x91F9E672, 0x6BCF67ED, 0x5A8A9D30, 0x97517ABD, 0x209E0C64,
0x830E9A7C, 0x3BF4296D, 0x34096FAA, 0x16C3D911, 0x61B2AA30, 0xFAF7DF45,
0xD61957D4, 0xE00DF8F1, 0x435E3B00, 0x5D2CEED4, 0x660DD0F2, 0x8CEEF608,
0x65195999, 0xFFBBD19C, 0xB4B6663C, 0x87A8E61D
};
static const BN_ULONG dh2048_256_g[] = {
0x6CC41659, 0x664B4C0F, 0xEF98C582, 0x5E2327CF, 0xD4795451, 0xD647D148,
0x90F00EF8, 0x2F630784, 0x1DB246C3, 0x184B523D, 0xCDC67EB6, 0xC7891428,
0x0DF92B52, 0x7FD02837, 0x64E0EC37, 0xB3353BBB, 0x57CD0915, 0xECD06E15,
0xDF016199, 0xB7D2BBD2, 0x052588B9, 0xC8484B1E, 0x13D3FE14, 0xDB2A3B73,
0xD182EA0A, 0xD052B985, 0xE83B9C80, 0xA4BD1BFF, 0xFB3F2E55, 0xDFC967C1,
0x767164E1, 0xB5045AF2, 0x6F2F9193, 0x1D14348F, 0x428EBC83, 0x64E67982,
0x82D6ED38, 0x8AC376D2, 0xAAB8A862, 0x777DE62A, 0xE9EC144B, 0xDDF463E5,
0xC77A57F2, 0x0196F931, 0x41000A65, 0xA55AE313, 0xC28CBB18, 0x901228F8,
0x7E8C6F62, 0xBC3773BF, 0x0C6B47B1, 0xBE3A6C1B, 0xAC0BB555, 0xFF4FED4A,
0x77BE463F, 0x10DBC150, 0x1A0BA125, 0x07F4793A, 0x21EF2054, 0x4CA7B18F,
0x60EDBD48, 0x2E775066, 0x73134D0B, 0x3FB32C9B
};
static const BN_ULONG dh2048_256_q[] = {
0x64F5FBD3, 0xA308B0FE, 0x1EB3750B, 0x99B1A47D, 0x40129DA2, 0xB4479976,
0xA709A097, 0x8CF83642
};
/* Primes from RFC 7919 */
static const BN_ULONG ffdhe2048_p[] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x61285C97, 0x886B4238, 0xC1B2EFFA, 0xC6F34A26,
0x7D1683B2, 0xC58EF183, 0x2EC22005, 0x3BB5FCBC, 0x4C6FAD73, 0xC3FE3B1B,
0xEEF28183, 0x8E4F1232, 0xE98583FF, 0x9172FE9C, 0x28342F61, 0xC03404CD,
0xCDF7E2EC, 0x9E02FCE1, 0xEE0A6D70, 0x0B07A7C8, 0x6372BB19, 0xAE56EDE7,
0xDE394DF4, 0x1D4F42A3, 0x60D7F468, 0xB96ADAB7, 0xB2C8E3FB, 0xD108A94B,
0xB324FB61, 0xBC0AB182, 0x483A797A, 0x30ACCA4F, 0x36ADE735, 0x1DF158A1,
0xF3EFE872, 0xE2A689DA, 0xE0E68B77, 0x984F0C70, 0x7F57C935, 0xB557135E,
0x3DED1AF3, 0x85636555, 0x5F066ED0, 0x2433F51F, 0xD5FD6561, 0xD3DF1ED5,
0xAEC4617A, 0xF681B202, 0x630C75D8, 0x7D2FE363, 0x249B3EF9, 0xCC939DCE,
0x146433FB, 0xA9E13641, 0xCE2D3695, 0xD8B9C583, 0x273D3CF1, 0xAFDC5620,
0xA2BB4A9A, 0xADF85458, 0xFFFFFFFF, 0xFFFFFFFF
};
static const BN_ULONG ffdhe3072_p[] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x66C62E37, 0x25E41D2B, 0x3FD59D7C, 0x3C1B20EE,
0xFA53DDEF, 0x0ABCD06B, 0xD5C4484E, 0x1DBF9A42, 0x9B0DEADA, 0xABC52197,
0x22363A0D, 0xE86D2BC5, 0x9C9DF69E, 0x5CAE82AB, 0x71F54BFF, 0x64F2E21E,
0xE2D74DD3, 0xF4FD4452, 0xBC437944, 0xB4130C93, 0x85139270, 0xAEFE1309,
0xC186D91C, 0x598CB0FA, 0x91F7F7EE, 0x7AD91D26, 0xD6E6C907, 0x61B46FC9,
0xF99C0238, 0xBC34F4DE, 0x6519035B, 0xDE355B3B, 0x611FCFDC, 0x886B4238,
0xC1B2EFFA, 0xC6F34A26, 0x7D1683B2, 0xC58EF183, 0x2EC22005, 0x3BB5FCBC,
0x4C6FAD73, 0xC3FE3B1B, 0xEEF28183, 0x8E4F1232, 0xE98583FF, 0x9172FE9C,
0x28342F61, 0xC03404CD, 0xCDF7E2EC, 0x9E02FCE1, 0xEE0A6D70, 0x0B07A7C8,
0x6372BB19, 0xAE56EDE7, 0xDE394DF4, 0x1D4F42A3, 0x60D7F468, 0xB96ADAB7,
0xB2C8E3FB, 0xD108A94B, 0xB324FB61, 0xBC0AB182, 0x483A797A, 0x30ACCA4F,
0x36ADE735, 0x1DF158A1, 0xF3EFE872, 0xE2A689DA, 0xE0E68B77, 0x984F0C70,
0x7F57C935, 0xB557135E, 0x3DED1AF3, 0x85636555, 0x5F066ED0, 0x2433F51F,
0xD5FD6561, 0xD3DF1ED5, 0xAEC4617A, 0xF681B202, 0x630C75D8, 0x7D2FE363,
0x249B3EF9, 0xCC939DCE, 0x146433FB, 0xA9E13641, 0xCE2D3695, 0xD8B9C583,
0x273D3CF1, 0xAFDC5620, 0xA2BB4A9A, 0xADF85458, 0xFFFFFFFF, 0xFFFFFFFF
};
static const BN_ULONG ffdhe4096_p[] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x5E655F6A, 0xC68A007E, 0xF44182E1, 0x4DB5A851,
0x7F88A46B, 0x8EC9B55A, 0xCEC97DCF, 0x0A8291CD, 0xF98D0ACC, 0x2A4ECEA9,
0x7140003C, 0x1A1DB93D, 0x33CB8B7A, 0x092999A3, 0x71AD0038, 0x6DC778F9,
0x918130C4, 0xA907600A, 0x2D9E6832, 0xED6A1E01, 0xEFB4318A, 0x7135C886,
0x7E31CC7A, 0x87F55BA5, 0x55034004, 0x7763CF1D, 0xD69F6D18, 0xAC7D5F42,
0xE58857B6, 0x7930E9E4, 0x164DF4FB, 0x6E6F52C3, 0x669E1EF1, 0x25E41D2B,
0x3FD59D7C, 0x3C1B20EE, 0xFA53DDEF, 0x0ABCD06B, 0xD5C4484E, 0x1DBF9A42,
0x9B0DEADA, 0xABC52197, 0x22363A0D, 0xE86D2BC5, 0x9C9DF69E, 0x5CAE82AB,
0x71F54BFF, 0x64F2E21E, 0xE2D74DD3, 0xF4FD4452, 0xBC437944, 0xB4130C93,
0x85139270, 0xAEFE1309, 0xC186D91C, 0x598CB0FA, 0x91F7F7EE, 0x7AD91D26,
0xD6E6C907, 0x61B46FC9, 0xF99C0238, 0xBC34F4DE, 0x6519035B, 0xDE355B3B,
0x611FCFDC, 0x886B4238, 0xC1B2EFFA, 0xC6F34A26, 0x7D1683B2, 0xC58EF183,
0x2EC22005, 0x3BB5FCBC, 0x4C6FAD73, 0xC3FE3B1B, 0xEEF28183, 0x8E4F1232,
0xE98583FF, 0x9172FE9C, 0x28342F61, 0xC03404CD, 0xCDF7E2EC, 0x9E02FCE1,
0xEE0A6D70, 0x0B07A7C8, 0x6372BB19, 0xAE56EDE7, 0xDE394DF4, 0x1D4F42A3,
0x60D7F468, 0xB96ADAB7, 0xB2C8E3FB, 0xD108A94B, 0xB324FB61, 0xBC0AB182,
0x483A797A, 0x30ACCA4F, 0x36ADE735, 0x1DF158A1, 0xF3EFE872, 0xE2A689DA,
0xE0E68B77, 0x984F0C70, 0x7F57C935, 0xB557135E, 0x3DED1AF3, 0x85636555,
0x5F066ED0, 0x2433F51F, 0xD5FD6561, 0xD3DF1ED5, 0xAEC4617A, 0xF681B202,
0x630C75D8, 0x7D2FE363, 0x249B3EF9, 0xCC939DCE, 0x146433FB, 0xA9E13641,
0xCE2D3695, 0xD8B9C583, 0x273D3CF1, 0xAFDC5620, 0xA2BB4A9A, 0xADF85458,
0xFFFFFFFF, 0xFFFFFFFF
};
static const BN_ULONG ffdhe6144_p[] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xD0E40E65, 0xA40E329C, 0x7938DAD4, 0xA41D570D,
0xD43161C1, 0x62A69526, 0x9ADB1E69, 0x3FDD4A8E, 0xDC6B80D6, 0x5B3B71F9,
0xC6272B04, 0xEC9D1810, 0xCACEF403, 0x8CCF2DD5, 0xC95B9117, 0xE49F5235,
0xB854338A, 0x505DC82D, 0x1562A846, 0x62292C31, 0x6AE77F5E, 0xD72B0374,
0x462D538C, 0xF9C9091B, 0x47A67CBE, 0x0AE8DB58, 0x22611682, 0xB3A739C1,
0x2A281BF6, 0xEEAAC023, 0x77CAF992, 0x94C6651E, 0x94B2BBC1, 0x763E4E4B,
0x0077D9B4, 0x587E38DA, 0x183023C3, 0x7FB29F8C, 0xF9E3A26E, 0x0ABEC1FF,
0x350511E3, 0xA00EF092, 0xDB6340D8, 0xB855322E, 0xA9A96910, 0xA52471F7,
0x4CFDB477, 0x388147FB, 0x4E46041F, 0x9B1F5C3E, 0xFCCFEC71, 0xCDAD0657,
0x4C701C3A, 0xB38E8C33, 0xB1C0FD4C, 0x917BDD64, 0x9B7624C8, 0x3BB45432,
0xCAF53EA6, 0x23BA4442, 0x38532A3A, 0x4E677D2C, 0x45036C7A, 0x0BFD64B6,
0x5E0DD902, 0xC68A007E, 0xF44182E1, 0x4DB5A851, 0x7F88A46B, 0x8EC9B55A,
0xCEC97DCF, 0x0A8291CD, 0xF98D0ACC, 0x2A4ECEA9, 0x7140003C, 0x1A1DB93D,
0x33CB8B7A, 0x092999A3, 0x71AD0038, 0x6DC778F9, 0x918130C4, 0xA907600A,
0x2D9E6832, 0xED6A1E01, 0xEFB4318A, 0x7135C886, 0x7E31CC7A, 0x87F55BA5,
0x55034004, 0x7763CF1D, 0xD69F6D18, 0xAC7D5F42, 0xE58857B6, 0x7930E9E4,
0x164DF4FB, 0x6E6F52C3, 0x669E1EF1, 0x25E41D2B, 0x3FD59D7C, 0x3C1B20EE,
0xFA53DDEF, 0x0ABCD06B, 0xD5C4484E, 0x1DBF9A42, 0x9B0DEADA, 0xABC52197,
0x22363A0D, 0xE86D2BC5, 0x9C9DF69E, 0x5CAE82AB, 0x71F54BFF, 0x64F2E21E,
0xE2D74DD3, 0xF4FD4452, 0xBC437944, 0xB4130C93, 0x85139270, 0xAEFE1309,
0xC186D91C, 0x598CB0FA, 0x91F7F7EE, 0x7AD91D26, 0xD6E6C907, 0x61B46FC9,
0xF99C0238, 0xBC34F4DE, 0x6519035B, 0xDE355B3B, 0x611FCFDC, 0x886B4238,
0xC1B2EFFA, 0xC6F34A26, 0x7D1683B2, 0xC58EF183, 0x2EC22005, 0x3BB5FCBC,
0x4C6FAD73, 0xC3FE3B1B, 0xEEF28183, 0x8E4F1232, 0xE98583FF, 0x9172FE9C,
0x28342F61, 0xC03404CD, 0xCDF7E2EC, 0x9E02FCE1, 0xEE0A6D70, 0x0B07A7C8,
0x6372BB19, 0xAE56EDE7, 0xDE394DF4, 0x1D4F42A3, 0x60D7F468, 0xB96ADAB7,
0xB2C8E3FB, 0xD108A94B, 0xB324FB61, 0xBC0AB182, 0x483A797A, 0x30ACCA4F,
0x36ADE735, 0x1DF158A1, 0xF3EFE872, 0xE2A689DA, 0xE0E68B77, 0x984F0C70,
0x7F57C935, 0xB557135E, 0x3DED1AF3, 0x85636555, 0x5F066ED0, 0x2433F51F,
0xD5FD6561, 0xD3DF1ED5, 0xAEC4617A, 0xF681B202, 0x630C75D8, 0x7D2FE363,
0x249B3EF9, 0xCC939DCE, 0x146433FB, 0xA9E13641, 0xCE2D3695, 0xD8B9C583,
0x273D3CF1, 0xAFDC5620, 0xA2BB4A9A, 0xADF85458, 0xFFFFFFFF, 0xFFFFFFFF
};
static const BN_ULONG ffdhe8192_p[] = {
0xFFFFFFFF, 0xFFFFFFFF, 0xC5C6424C, 0xD68C8BB7, 0x838FF88C, 0x011E2A94,
0xA9F4614E, 0x0822E506, 0xF7A8443D, 0x97D11D49, 0x30677F0D, 0xA6BBFDE5,
0xC1FE86FE, 0x2F741EF8, 0x5D71A87E, 0xFAFABE1C, 0xFBE58A30, 0xDED2FBAB,
0x72B0A66E, 0xB6855DFE, 0xBA8A4FE8, 0x1EFC8CE0, 0x3F2FA457, 0x83F81D4A,
0xA577E231, 0xA1FE3075, 0x88D9C0A0, 0xD5B80194, 0xAD9A95F9, 0x624816CD,
0x50C1217B, 0x99E9E316, 0x0E423CFC, 0x51AA691E, 0x3826E52C, 0x1C217E6C,
0x09703FEE, 0x51A8A931, 0x6A460E74, 0xBB709987, 0x9C86B022, 0x541FC68C,
0x46FD8251, 0x59160CC0, 0x35C35F5C, 0x2846C0BA, 0x8B758282, 0x54504AC7,
0xD2AF05E4, 0x29388839, 0xC01BD702, 0xCB2C0F1C, 0x7C932665, 0x555B2F74,
0xA3AB8829, 0x86B63142, 0xF64B10EF, 0x0B8CC3BD, 0xEDD1CC5E, 0x687FEB69,
0xC9509D43, 0xFDB23FCE, 0xD951AE64, 0x1E425A31, 0xF600C838, 0x36AD004C,
0xCFF46AAA, 0xA40E329C, 0x7938DAD4, 0xA41D570D, 0xD43161C1, 0x62A69526,
0x9ADB1E69, 0x3FDD4A8E, 0xDC6B80D6, 0x5B3B71F9, 0xC6272B04, 0xEC9D1810,
0xCACEF403, 0x8CCF2DD5, 0xC95B9117, 0xE49F5235, 0xB854338A, 0x505DC82D,
0x1562A846, 0x62292C31, 0x6AE77F5E, 0xD72B0374, 0x462D538C, 0xF9C9091B,
0x47A67CBE, 0x0AE8DB58, 0x22611682, 0xB3A739C1, 0x2A281BF6, 0xEEAAC023,
0x77CAF992, 0x94C6651E, 0x94B2BBC1, 0x763E4E4B, 0x0077D9B4, 0x587E38DA,
0x183023C3, 0x7FB29F8C, 0xF9E3A26E, 0x0ABEC1FF, 0x350511E3, 0xA00EF092,
0xDB6340D8, 0xB855322E, 0xA9A96910, 0xA52471F7, 0x4CFDB477, 0x388147FB,
0x4E46041F, 0x9B1F5C3E, 0xFCCFEC71, 0xCDAD0657, 0x4C701C3A, 0xB38E8C33,
0xB1C0FD4C, 0x917BDD64, 0x9B7624C8, 0x3BB45432, 0xCAF53EA6, 0x23BA4442,
0x38532A3A, 0x4E677D2C, 0x45036C7A, 0x0BFD64B6, 0x5E0DD902, 0xC68A007E,
0xF44182E1, 0x4DB5A851, 0x7F88A46B, 0x8EC9B55A, 0xCEC97DCF, 0x0A8291CD,
0xF98D0ACC, 0x2A4ECEA9, 0x7140003C, 0x1A1DB93D, 0x33CB8B7A, 0x092999A3,
0x71AD0038, 0x6DC778F9, 0x918130C4, 0xA907600A, 0x2D9E6832, 0xED6A1E01,
0xEFB4318A, 0x7135C886, 0x7E31CC7A, 0x87F55BA5, 0x55034004, 0x7763CF1D,
0xD69F6D18, 0xAC7D5F42, 0xE58857B6, 0x7930E9E4, 0x164DF4FB, 0x6E6F52C3,
0x669E1EF1, 0x25E41D2B, 0x3FD59D7C, 0x3C1B20EE, 0xFA53DDEF, 0x0ABCD06B,
0xD5C4484E, 0x1DBF9A42, 0x9B0DEADA, 0xABC52197, 0x22363A0D, 0xE86D2BC5,
0x9C9DF69E, 0x5CAE82AB, 0x71F54BFF, 0x64F2E21E, 0xE2D74DD3, 0xF4FD4452,
0xBC437944, 0xB4130C93, 0x85139270, 0xAEFE1309, 0xC186D91C, 0x598CB0FA,
0x91F7F7EE, 0x7AD91D26, 0xD6E6C907, 0x61B46FC9, 0xF99C0238, 0xBC34F4DE,
0x6519035B, 0xDE355B3B, 0x611FCFDC, 0x886B4238, 0xC1B2EFFA, 0xC6F34A26,
0x7D1683B2, 0xC58EF183, 0x2EC22005, 0x3BB5FCBC, 0x4C6FAD73, 0xC3FE3B1B,
0xEEF28183, 0x8E4F1232, 0xE98583FF, 0x9172FE9C, 0x28342F61, 0xC03404CD,
0xCDF7E2EC, 0x9E02FCE1, 0xEE0A6D70, 0x0B07A7C8, 0x6372BB19, 0xAE56EDE7,
0xDE394DF4, 0x1D4F42A3, 0x60D7F468, 0xB96ADAB7, 0xB2C8E3FB, 0xD108A94B,
0xB324FB61, 0xBC0AB182, 0x483A797A, 0x30ACCA4F, 0x36ADE735, 0x1DF158A1,
0xF3EFE872, 0xE2A689DA, 0xE0E68B77, 0x984F0C70, 0x7F57C935, 0xB557135E,
0x3DED1AF3, 0x85636555, 0x5F066ED0, 0x2433F51F, 0xD5FD6561, 0xD3DF1ED5,
0xAEC4617A, 0xF681B202, 0x630C75D8, 0x7D2FE363, 0x249B3EF9, 0xCC939DCE,
0x146433FB, 0xA9E13641, 0xCE2D3695, 0xD8B9C583, 0x273D3CF1, 0xAFDC5620,
0xA2BB4A9A, 0xADF85458, 0xFFFFFFFF, 0xFFFFFFFF
};
# else
# error "unsupported BN_BITS2"
# endif
/* Macro to make a BIGNUM from static data */
# define make_dh_bn(x) extern const BIGNUM _bignum_##x; \
const BIGNUM _bignum_##x = { (BN_ULONG *) x, \
OSSL_NELEM(x),\
OSSL_NELEM(x),\
0, BN_FLG_STATIC_DATA };
static const BN_ULONG value_2 = 2;
const BIGNUM _bignum_const_2 =
{ (BN_ULONG *)&value_2, 1, 1, 0, BN_FLG_STATIC_DATA };
make_dh_bn(dh1024_160_p)
make_dh_bn(dh1024_160_g)
make_dh_bn(dh1024_160_q)
make_dh_bn(dh2048_224_p)
make_dh_bn(dh2048_224_g)
make_dh_bn(dh2048_224_q)
make_dh_bn(dh2048_256_p)
make_dh_bn(dh2048_256_g)
make_dh_bn(dh2048_256_q)
make_dh_bn(ffdhe2048_p)
make_dh_bn(ffdhe3072_p)
make_dh_bn(ffdhe4096_p)
make_dh_bn(ffdhe6144_p)
make_dh_bn(ffdhe8192_p)
#endif

View file

@ -0,0 +1,457 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <assert.h>
#include <openssl/bn.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
/* The old slow way */
#if 0
int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
BN_CTX *ctx)
{
int i, nm, nd;
int ret = 0;
BIGNUM *D;
bn_check_top(m);
bn_check_top(d);
if (BN_is_zero(d)) {
BNerr(BN_F_BN_DIV, BN_R_DIV_BY_ZERO);
return 0;
}
if (BN_ucmp(m, d) < 0) {
if (rem != NULL) {
if (BN_copy(rem, m) == NULL)
return 0;
}
if (dv != NULL)
BN_zero(dv);
return 1;
}
BN_CTX_start(ctx);
D = BN_CTX_get(ctx);
if (dv == NULL)
dv = BN_CTX_get(ctx);
if (rem == NULL)
rem = BN_CTX_get(ctx);
if (D == NULL || dv == NULL || rem == NULL)
goto end;
nd = BN_num_bits(d);
nm = BN_num_bits(m);
if (BN_copy(D, d) == NULL)
goto end;
if (BN_copy(rem, m) == NULL)
goto end;
/*
* The next 2 are needed so we can do a dv->d[0]|=1 later since
* BN_lshift1 will only work once there is a value :-)
*/
BN_zero(dv);
if (bn_wexpand(dv, 1) == NULL)
goto end;
dv->top = 1;
if (!BN_lshift(D, D, nm - nd))
goto end;
for (i = nm - nd; i >= 0; i--) {
if (!BN_lshift1(dv, dv))
goto end;
if (BN_ucmp(rem, D) >= 0) {
dv->d[0] |= 1;
if (!BN_usub(rem, rem, D))
goto end;
}
/* CAN IMPROVE (and have now :=) */
if (!BN_rshift1(D, D))
goto end;
}
rem->neg = BN_is_zero(rem) ? 0 : m->neg;
dv->neg = m->neg ^ d->neg;
ret = 1;
end:
BN_CTX_end(ctx);
return ret;
}
#else
# if defined(BN_DIV3W)
BN_ULONG bn_div_3_words(const BN_ULONG *m, BN_ULONG d1, BN_ULONG d0);
# elif 0
/*
* This is #if-ed away, because it's a reference for assembly implementations,
* where it can and should be made constant-time. But if you want to test it,
* just replace 0 with 1.
*/
# if BN_BITS2 == 64 && defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
# undef BN_ULLONG
# define BN_ULLONG __uint128_t
# define BN_LLONG
# endif
# ifdef BN_LLONG
# define BN_DIV3W
/*
* Interface is somewhat quirky, |m| is pointer to most significant limb,
* and less significant limb is referred at |m[-1]|. This means that caller
* is responsible for ensuring that |m[-1]| is valid. Second condition that
* has to be met is that |d0|'s most significant bit has to be set. Or in
* other words divisor has to be "bit-aligned to the left." bn_div_fixed_top
* does all this. The subroutine considers four limbs, two of which are
* "overlapping," hence the name...
*/
static BN_ULONG bn_div_3_words(const BN_ULONG *m, BN_ULONG d1, BN_ULONG d0)
{
BN_ULLONG R = ((BN_ULLONG)m[0] << BN_BITS2) | m[-1];
BN_ULLONG D = ((BN_ULLONG)d0 << BN_BITS2) | d1;
BN_ULONG Q = 0, mask;
int i;
for (i = 0; i < BN_BITS2; i++) {
Q <<= 1;
if (R >= D) {
Q |= 1;
R -= D;
}
D >>= 1;
}
mask = 0 - (Q >> (BN_BITS2 - 1)); /* does it overflow? */
Q <<= 1;
Q |= (R >= D);
return (Q | mask) & BN_MASK2;
}
# endif
# endif
static int bn_left_align(BIGNUM *num)
{
BN_ULONG *d = num->d, n, m, rmask;
int top = num->top;
int rshift = BN_num_bits_word(d[top - 1]), lshift, i;
lshift = BN_BITS2 - rshift;
rshift %= BN_BITS2; /* say no to undefined behaviour */
rmask = (BN_ULONG)0 - rshift; /* rmask = 0 - (rshift != 0) */
rmask |= rmask >> 8;
for (i = 0, m = 0; i < top; i++) {
n = d[i];
d[i] = ((n << lshift) | m) & BN_MASK2;
m = (n >> rshift) & rmask;
}
return lshift;
}
# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) \
&& !defined(PEDANTIC) && !defined(BN_DIV3W)
# if defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined (__i386__)
/*-
* There were two reasons for implementing this template:
* - GNU C generates a call to a function (__udivdi3 to be exact)
* in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
* understand why...);
* - divl doesn't only calculate quotient, but also leaves
* remainder in %edx which we can definitely use here:-)
*/
# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divl %4" \
: "=a"(q), "=d"(rem) \
: "a"(n1), "d"(n0), "r"(d0) \
: "cc"); \
q; \
})
# define REMAINDER_IS_ALREADY_CALCULATED
# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
/*
* Same story here, but it's 128-bit by 64-bit division. Wow!
*/
# undef bn_div_words
# define bn_div_words(n0,n1,d0) \
({ asm volatile ( \
"divq %4" \
: "=a"(q), "=d"(rem) \
: "a"(n1), "d"(n0), "r"(d0) \
: "cc"); \
q; \
})
# define REMAINDER_IS_ALREADY_CALCULATED
# endif /* __<cpu> */
# endif /* __GNUC__ */
# endif /* OPENSSL_NO_ASM */
/*-
* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
* Thus:
* dv->neg == num->neg ^ divisor->neg (unless the result is zero)
* rm->neg == num->neg (unless the remainder is zero)
* If 'dv' or 'rm' is NULL, the respective value is not returned.
*/
int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
BN_CTX *ctx)
{
int ret;
if (BN_is_zero(divisor)) {
BNerr(BN_F_BN_DIV, BN_R_DIV_BY_ZERO);
return 0;
}
/*
* Invalid zero-padding would have particularly bad consequences so don't
* just rely on bn_check_top() here (bn_check_top() works only for
* BN_DEBUG builds)
*/
if (divisor->d[divisor->top - 1] == 0) {
BNerr(BN_F_BN_DIV, BN_R_NOT_INITIALIZED);
return 0;
}
ret = bn_div_fixed_top(dv, rm, num, divisor, ctx);
if (ret) {
if (dv != NULL)
bn_correct_top(dv);
if (rm != NULL)
bn_correct_top(rm);
}
return ret;
}
/*
* It's argued that *length* of *significant* part of divisor is public.
* Even if it's private modulus that is. Again, *length* is assumed
* public, but not *value*. Former is likely to be pre-defined by
* algorithm with bit granularity, though below subroutine is invariant
* of limb length. Thanks to this assumption we can require that |divisor|
* may not be zero-padded, yet claim this subroutine "constant-time"(*).
* This is because zero-padded dividend, |num|, is tolerated, so that
* caller can pass dividend of public length(*), but with smaller amount
* of significant limbs. This naturally means that quotient, |dv|, would
* contain correspongly less significant limbs as well, and will be zero-
* padded accordingly. Returned remainder, |rm|, will have same bit length
* as divisor, also zero-padded if needed. These actually leave sign bits
* in ambiguous state. In sense that we try to avoid negative zeros, while
* zero-padded zeros would retain sign.
*
* (*) "Constant-time-ness" has two pre-conditions:
*
* - availability of constant-time bn_div_3_words;
* - dividend is at least as "wide" as divisor, limb-wise, zero-padded
* if so requied, which shouldn't be a privacy problem, because
* divisor's length is considered public;
*/
int bn_div_fixed_top(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
const BIGNUM *divisor, BN_CTX *ctx)
{
int norm_shift, i, j, loop;
BIGNUM *tmp, *snum, *sdiv, *res;
BN_ULONG *resp, *wnum, *wnumtop;
BN_ULONG d0, d1;
int num_n, div_n;
assert(divisor->top > 0 && divisor->d[divisor->top - 1] != 0);
bn_check_top(num);
bn_check_top(divisor);
bn_check_top(dv);
bn_check_top(rm);
BN_CTX_start(ctx);
res = (dv == NULL) ? BN_CTX_get(ctx) : dv;
tmp = BN_CTX_get(ctx);
snum = BN_CTX_get(ctx);
sdiv = BN_CTX_get(ctx);
if (sdiv == NULL)
goto err;
/* First we normalise the numbers */
if (!BN_copy(sdiv, divisor))
goto err;
norm_shift = bn_left_align(sdiv);
sdiv->neg = 0;
/*
* Note that bn_lshift_fixed_top's output is always one limb longer
* than input, even when norm_shift is zero. This means that amount of
* inner loop iterations is invariant of dividend value, and that one
* doesn't need to compare dividend and divisor if they were originally
* of the same bit length.
*/
if (!(bn_lshift_fixed_top(snum, num, norm_shift)))
goto err;
div_n = sdiv->top;
num_n = snum->top;
if (num_n <= div_n) {
/* caller didn't pad dividend -> no constant-time guarantee... */
if (bn_wexpand(snum, div_n + 1) == NULL)
goto err;
memset(&(snum->d[num_n]), 0, (div_n - num_n + 1) * sizeof(BN_ULONG));
snum->top = num_n = div_n + 1;
}
loop = num_n - div_n;
/*
* Lets setup a 'window' into snum This is the part that corresponds to
* the current 'area' being divided
*/
wnum = &(snum->d[loop]);
wnumtop = &(snum->d[num_n - 1]);
/* Get the top 2 words of sdiv */
d0 = sdiv->d[div_n - 1];
d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
/* Setup quotient */
if (!bn_wexpand(res, loop))
goto err;
res->neg = (num->neg ^ divisor->neg);
res->top = loop;
res->flags |= BN_FLG_FIXED_TOP;
resp = &(res->d[loop]);
/* space for temp */
if (!bn_wexpand(tmp, (div_n + 1)))
goto err;
for (i = 0; i < loop; i++, wnumtop--) {
BN_ULONG q, l0;
/*
* the first part of the loop uses the top two words of snum and sdiv
* to calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv
*/
# if defined(BN_DIV3W)
q = bn_div_3_words(wnumtop, d1, d0);
# else
BN_ULONG n0, n1, rem = 0;
n0 = wnumtop[0];
n1 = wnumtop[-1];
if (n0 == d0)
q = BN_MASK2;
else { /* n0 < d0 */
BN_ULONG n2 = (wnumtop == wnum) ? 0 : wnumtop[-2];
# ifdef BN_LLONG
BN_ULLONG t2;
# if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
q = (BN_ULONG)(((((BN_ULLONG) n0) << BN_BITS2) | n1) / d0);
# else
q = bn_div_words(n0, n1, d0);
# endif
# ifndef REMAINDER_IS_ALREADY_CALCULATED
/*
* rem doesn't have to be BN_ULLONG. The least we
* know it's less that d0, isn't it?
*/
rem = (n1 - q * d0) & BN_MASK2;
# endif
t2 = (BN_ULLONG) d1 *q;
for (;;) {
if (t2 <= ((((BN_ULLONG) rem) << BN_BITS2) | n2))
break;
q--;
rem += d0;
if (rem < d0)
break; /* don't let rem overflow */
t2 -= d1;
}
# else /* !BN_LLONG */
BN_ULONG t2l, t2h;
q = bn_div_words(n0, n1, d0);
# ifndef REMAINDER_IS_ALREADY_CALCULATED
rem = (n1 - q * d0) & BN_MASK2;
# endif
# if defined(BN_UMULT_LOHI)
BN_UMULT_LOHI(t2l, t2h, d1, q);
# elif defined(BN_UMULT_HIGH)
t2l = d1 * q;
t2h = BN_UMULT_HIGH(d1, q);
# else
{
BN_ULONG ql, qh;
t2l = LBITS(d1);
t2h = HBITS(d1);
ql = LBITS(q);
qh = HBITS(q);
mul64(t2l, t2h, ql, qh); /* t2=(BN_ULLONG)d1*q; */
}
# endif
for (;;) {
if ((t2h < rem) || ((t2h == rem) && (t2l <= n2)))
break;
q--;
rem += d0;
if (rem < d0)
break; /* don't let rem overflow */
if (t2l < d1)
t2h--;
t2l -= d1;
}
# endif /* !BN_LLONG */
}
# endif /* !BN_DIV3W */
l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
tmp->d[div_n] = l0;
wnum--;
/*
* ignore top values of the bignums just sub the two BN_ULONG arrays
* with bn_sub_words
*/
l0 = bn_sub_words(wnum, wnum, tmp->d, div_n + 1);
q -= l0;
/*
* Note: As we have considered only the leading two BN_ULONGs in
* the calculation of q, sdiv * q might be greater than wnum (but
* then (q-1) * sdiv is less or equal than wnum)
*/
for (l0 = 0 - l0, j = 0; j < div_n; j++)
tmp->d[j] = sdiv->d[j] & l0;
l0 = bn_add_words(wnum, wnum, tmp->d, div_n);
(*wnumtop) += l0;
assert((*wnumtop) == 0);
/* store part of the result */
*--resp = q;
}
/* snum holds remainder, it's as wide as divisor */
snum->neg = num->neg;
snum->top = div_n;
snum->flags |= BN_FLG_FIXED_TOP;
if (rm != NULL)
bn_rshift_fixed_top(rm, snum, norm_shift);
BN_CTX_end(ctx);
return 1;
err:
bn_check_top(rm);
BN_CTX_end(ctx);
return 0;
}
#endif

View file

@ -0,0 +1,118 @@
/*
* Generated by util/mkerr.pl DO NOT EDIT
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <openssl/err.h>
#include <openssl/bnerr.h>
#ifndef OPENSSL_NO_ERR
static const ERR_STRING_DATA BN_str_functs[] = {
{ERR_PACK(ERR_LIB_BN, BN_F_BNRAND, 0), "bnrand"},
{ERR_PACK(ERR_LIB_BN, BN_F_BNRAND_RANGE, 0), "bnrand_range"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_BLINDING_CONVERT_EX, 0),
"BN_BLINDING_convert_ex"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_BLINDING_CREATE_PARAM, 0),
"BN_BLINDING_create_param"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_BLINDING_INVERT_EX, 0),
"BN_BLINDING_invert_ex"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_BLINDING_NEW, 0), "BN_BLINDING_new"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_BLINDING_UPDATE, 0), "BN_BLINDING_update"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_BN2DEC, 0), "BN_bn2dec"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_BN2HEX, 0), "BN_bn2hex"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_COMPUTE_WNAF, 0), "bn_compute_wNAF"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_GET, 0), "BN_CTX_get"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_NEW, 0), "BN_CTX_new"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_START, 0), "BN_CTX_start"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_DIV, 0), "BN_div"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_DIV_RECP, 0), "BN_div_recp"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_EXP, 0), "BN_exp"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_EXPAND_INTERNAL, 0), "bn_expand_internal"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GENCB_NEW, 0), "BN_GENCB_new"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GENERATE_DSA_NONCE, 0),
"BN_generate_dsa_nonce"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GENERATE_PRIME_EX, 0),
"BN_generate_prime_ex"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GF2M_MOD, 0), "BN_GF2m_mod"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GF2M_MOD_EXP, 0), "BN_GF2m_mod_exp"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GF2M_MOD_MUL, 0), "BN_GF2m_mod_mul"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GF2M_MOD_SOLVE_QUAD, 0),
"BN_GF2m_mod_solve_quad"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GF2M_MOD_SOLVE_QUAD_ARR, 0),
"BN_GF2m_mod_solve_quad_arr"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GF2M_MOD_SQR, 0), "BN_GF2m_mod_sqr"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_GF2M_MOD_SQRT, 0), "BN_GF2m_mod_sqrt"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_LSHIFT, 0), "BN_lshift"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_EXP2_MONT, 0), "BN_mod_exp2_mont"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_EXP_MONT, 0), "BN_mod_exp_mont"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_EXP_MONT_CONSTTIME, 0),
"BN_mod_exp_mont_consttime"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_EXP_MONT_WORD, 0),
"BN_mod_exp_mont_word"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_EXP_RECP, 0), "BN_mod_exp_recp"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_EXP_SIMPLE, 0), "BN_mod_exp_simple"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_INVERSE, 0), "BN_mod_inverse"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_INVERSE_NO_BRANCH, 0),
"BN_mod_inverse_no_branch"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_LSHIFT_QUICK, 0), "BN_mod_lshift_quick"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MOD_SQRT, 0), "BN_mod_sqrt"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MONT_CTX_NEW, 0), "BN_MONT_CTX_new"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_MPI2BN, 0), "BN_mpi2bn"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_NEW, 0), "BN_new"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_POOL_GET, 0), "BN_POOL_get"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_RAND, 0), "BN_rand"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_RAND_RANGE, 0), "BN_rand_range"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_RECP_CTX_NEW, 0), "BN_RECP_CTX_new"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_RSHIFT, 0), "BN_rshift"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_SET_WORDS, 0), "bn_set_words"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_STACK_PUSH, 0), "BN_STACK_push"},
{ERR_PACK(ERR_LIB_BN, BN_F_BN_USUB, 0), "BN_usub"},
{0, NULL}
};
static const ERR_STRING_DATA BN_str_reasons[] = {
{ERR_PACK(ERR_LIB_BN, 0, BN_R_ARG2_LT_ARG3), "arg2 lt arg3"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_BAD_RECIPROCAL), "bad reciprocal"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_BIGNUM_TOO_LONG), "bignum too long"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_BITS_TOO_SMALL), "bits too small"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_CALLED_WITH_EVEN_MODULUS),
"called with even modulus"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_DIV_BY_ZERO), "div by zero"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_ENCODING_ERROR), "encoding error"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA),
"expand on static bignum data"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_INPUT_NOT_REDUCED), "input not reduced"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_INVALID_LENGTH), "invalid length"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_INVALID_RANGE), "invalid range"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_INVALID_SHIFT), "invalid shift"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_NOT_A_SQUARE), "not a square"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_NOT_INITIALIZED), "not initialized"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_NO_INVERSE), "no inverse"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_NO_SOLUTION), "no solution"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_PRIVATE_KEY_TOO_LARGE),
"private key too large"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_P_IS_NOT_PRIME), "p is not prime"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_TOO_MANY_ITERATIONS), "too many iterations"},
{ERR_PACK(ERR_LIB_BN, 0, BN_R_TOO_MANY_TEMPORARY_VARIABLES),
"too many temporary variables"},
{0, NULL}
};
#endif
int ERR_load_BN_strings(void)
{
#ifndef OPENSSL_NO_ERR
if (ERR_func_error_string(BN_str_functs[0].error) == NULL) {
ERR_load_strings_const(BN_str_functs);
ERR_load_strings_const(BN_str_reasons);
}
#endif
return 1;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,201 @@
/*
* Copyright 1995-2017 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
#define TABLE_SIZE 32
int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
BN_CTX *ctx, BN_MONT_CTX *in_mont)
{
int i, j, bits, b, bits1, bits2, ret =
0, wpos1, wpos2, window1, window2, wvalue1, wvalue2;
int r_is_one = 1;
BIGNUM *d, *r;
const BIGNUM *a_mod_m;
/* Tables of variables obtained from 'ctx' */
BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
BN_MONT_CTX *mont = NULL;
bn_check_top(a1);
bn_check_top(p1);
bn_check_top(a2);
bn_check_top(p2);
bn_check_top(m);
if (!(m->d[0] & 1)) {
BNerr(BN_F_BN_MOD_EXP2_MONT, BN_R_CALLED_WITH_EVEN_MODULUS);
return 0;
}
bits1 = BN_num_bits(p1);
bits2 = BN_num_bits(p2);
if ((bits1 == 0) && (bits2 == 0)) {
ret = BN_one(rr);
return ret;
}
bits = (bits1 > bits2) ? bits1 : bits2;
BN_CTX_start(ctx);
d = BN_CTX_get(ctx);
r = BN_CTX_get(ctx);
val1[0] = BN_CTX_get(ctx);
val2[0] = BN_CTX_get(ctx);
if (val2[0] == NULL)
goto err;
if (in_mont != NULL)
mont = in_mont;
else {
if ((mont = BN_MONT_CTX_new()) == NULL)
goto err;
if (!BN_MONT_CTX_set(mont, m, ctx))
goto err;
}
window1 = BN_window_bits_for_exponent_size(bits1);
window2 = BN_window_bits_for_exponent_size(bits2);
/*
* Build table for a1: val1[i] := a1^(2*i + 1) mod m for i = 0 .. 2^(window1-1)
*/
if (a1->neg || BN_ucmp(a1, m) >= 0) {
if (!BN_mod(val1[0], a1, m, ctx))
goto err;
a_mod_m = val1[0];
} else
a_mod_m = a1;
if (BN_is_zero(a_mod_m)) {
BN_zero(rr);
ret = 1;
goto err;
}
if (!BN_to_montgomery(val1[0], a_mod_m, mont, ctx))
goto err;
if (window1 > 1) {
if (!BN_mod_mul_montgomery(d, val1[0], val1[0], mont, ctx))
goto err;
j = 1 << (window1 - 1);
for (i = 1; i < j; i++) {
if (((val1[i] = BN_CTX_get(ctx)) == NULL) ||
!BN_mod_mul_montgomery(val1[i], val1[i - 1], d, mont, ctx))
goto err;
}
}
/*
* Build table for a2: val2[i] := a2^(2*i + 1) mod m for i = 0 .. 2^(window2-1)
*/
if (a2->neg || BN_ucmp(a2, m) >= 0) {
if (!BN_mod(val2[0], a2, m, ctx))
goto err;
a_mod_m = val2[0];
} else
a_mod_m = a2;
if (BN_is_zero(a_mod_m)) {
BN_zero(rr);
ret = 1;
goto err;
}
if (!BN_to_montgomery(val2[0], a_mod_m, mont, ctx))
goto err;
if (window2 > 1) {
if (!BN_mod_mul_montgomery(d, val2[0], val2[0], mont, ctx))
goto err;
j = 1 << (window2 - 1);
for (i = 1; i < j; i++) {
if (((val2[i] = BN_CTX_get(ctx)) == NULL) ||
!BN_mod_mul_montgomery(val2[i], val2[i - 1], d, mont, ctx))
goto err;
}
}
/* Now compute the power product, using independent windows. */
r_is_one = 1;
wvalue1 = 0; /* The 'value' of the first window */
wvalue2 = 0; /* The 'value' of the second window */
wpos1 = 0; /* If wvalue1 > 0, the bottom bit of the
* first window */
wpos2 = 0; /* If wvalue2 > 0, the bottom bit of the
* second window */
if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
goto err;
for (b = bits - 1; b >= 0; b--) {
if (!r_is_one) {
if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
goto err;
}
if (!wvalue1)
if (BN_is_bit_set(p1, b)) {
/*
* consider bits b-window1+1 .. b for this window
*/
i = b - window1 + 1;
while (!BN_is_bit_set(p1, i)) /* works for i<0 */
i++;
wpos1 = i;
wvalue1 = 1;
for (i = b - 1; i >= wpos1; i--) {
wvalue1 <<= 1;
if (BN_is_bit_set(p1, i))
wvalue1++;
}
}
if (!wvalue2)
if (BN_is_bit_set(p2, b)) {
/*
* consider bits b-window2+1 .. b for this window
*/
i = b - window2 + 1;
while (!BN_is_bit_set(p2, i))
i++;
wpos2 = i;
wvalue2 = 1;
for (i = b - 1; i >= wpos2; i--) {
wvalue2 <<= 1;
if (BN_is_bit_set(p2, i))
wvalue2++;
}
}
if (wvalue1 && b == wpos1) {
/* wvalue1 is odd and < 2^window1 */
if (!BN_mod_mul_montgomery(r, r, val1[wvalue1 >> 1], mont, ctx))
goto err;
wvalue1 = 0;
r_is_one = 0;
}
if (wvalue2 && b == wpos2) {
/* wvalue2 is odd and < 2^window2 */
if (!BN_mod_mul_montgomery(r, r, val2[wvalue2 >> 1], mont, ctx))
goto err;
wvalue2 = 0;
r_is_one = 0;
}
}
if (!BN_from_montgomery(rr, r, mont, ctx))
goto err;
ret = 1;
err:
if (in_mont == NULL)
BN_MONT_CTX_free(mont);
BN_CTX_end(ctx);
bn_check_top(rr);
return ret;
}

View file

@ -0,0 +1,623 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
static BIGNUM *euclid(BIGNUM *a, BIGNUM *b);
int BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx)
{
BIGNUM *a, *b, *t;
int ret = 0;
bn_check_top(in_a);
bn_check_top(in_b);
BN_CTX_start(ctx);
a = BN_CTX_get(ctx);
b = BN_CTX_get(ctx);
if (b == NULL)
goto err;
if (BN_copy(a, in_a) == NULL)
goto err;
if (BN_copy(b, in_b) == NULL)
goto err;
a->neg = 0;
b->neg = 0;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
t = euclid(a, b);
if (t == NULL)
goto err;
if (BN_copy(r, t) == NULL)
goto err;
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(r);
return ret;
}
static BIGNUM *euclid(BIGNUM *a, BIGNUM *b)
{
BIGNUM *t;
int shifts = 0;
bn_check_top(a);
bn_check_top(b);
/* 0 <= b <= a */
while (!BN_is_zero(b)) {
/* 0 < b <= a */
if (BN_is_odd(a)) {
if (BN_is_odd(b)) {
if (!BN_sub(a, a, b))
goto err;
if (!BN_rshift1(a, a))
goto err;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
} else { /* a odd - b even */
if (!BN_rshift1(b, b))
goto err;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
}
} else { /* a is even */
if (BN_is_odd(b)) {
if (!BN_rshift1(a, a))
goto err;
if (BN_cmp(a, b) < 0) {
t = a;
a = b;
b = t;
}
} else { /* a even - b even */
if (!BN_rshift1(a, a))
goto err;
if (!BN_rshift1(b, b))
goto err;
shifts++;
}
}
/* 0 <= b <= a */
}
if (shifts) {
if (!BN_lshift(a, a, shifts))
goto err;
}
bn_check_top(a);
return a;
err:
return NULL;
}
/* solves ax == 1 (mod n) */
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n,
BN_CTX *ctx);
BIGNUM *BN_mod_inverse(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
{
BIGNUM *rv;
int noinv;
rv = int_bn_mod_inverse(in, a, n, ctx, &noinv);
if (noinv)
BNerr(BN_F_BN_MOD_INVERSE, BN_R_NO_INVERSE);
return rv;
}
BIGNUM *int_bn_mod_inverse(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx,
int *pnoinv)
{
BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
BIGNUM *ret = NULL;
int sign;
/* This is invalid input so we don't worry about constant time here */
if (BN_abs_is_word(n, 1) || BN_is_zero(n)) {
if (pnoinv != NULL)
*pnoinv = 1;
return NULL;
}
if (pnoinv != NULL)
*pnoinv = 0;
if ((BN_get_flags(a, BN_FLG_CONSTTIME) != 0)
|| (BN_get_flags(n, BN_FLG_CONSTTIME) != 0)) {
return BN_mod_inverse_no_branch(in, a, n, ctx);
}
bn_check_top(a);
bn_check_top(n);
BN_CTX_start(ctx);
A = BN_CTX_get(ctx);
B = BN_CTX_get(ctx);
X = BN_CTX_get(ctx);
D = BN_CTX_get(ctx);
M = BN_CTX_get(ctx);
Y = BN_CTX_get(ctx);
T = BN_CTX_get(ctx);
if (T == NULL)
goto err;
if (in == NULL)
R = BN_new();
else
R = in;
if (R == NULL)
goto err;
BN_one(X);
BN_zero(Y);
if (BN_copy(B, a) == NULL)
goto err;
if (BN_copy(A, n) == NULL)
goto err;
A->neg = 0;
if (B->neg || (BN_ucmp(B, A) >= 0)) {
if (!BN_nnmod(B, B, A, ctx))
goto err;
}
sign = -1;
/*-
* From B = a mod |n|, A = |n| it follows that
*
* 0 <= B < A,
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
*/
if (BN_is_odd(n) && (BN_num_bits(n) <= 2048)) {
/*
* Binary inversion algorithm; requires odd modulus. This is faster
* than the general algorithm if the modulus is sufficiently small
* (about 400 .. 500 bits on 32-bit systems, but much more on 64-bit
* systems)
*/
int shift;
while (!BN_is_zero(B)) {
/*-
* 0 < B < |n|,
* 0 < A <= |n|,
* (1) -sign*X*a == B (mod |n|),
* (2) sign*Y*a == A (mod |n|)
*/
/*
* Now divide B by the maximum possible power of two in the
* integers, and divide X by the same value mod |n|. When we're
* done, (1) still holds.
*/
shift = 0;
while (!BN_is_bit_set(B, shift)) { /* note that 0 < B */
shift++;
if (BN_is_odd(X)) {
if (!BN_uadd(X, X, n))
goto err;
}
/*
* now X is even, so we can easily divide it by two
*/
if (!BN_rshift1(X, X))
goto err;
}
if (shift > 0) {
if (!BN_rshift(B, B, shift))
goto err;
}
/*
* Same for A and Y. Afterwards, (2) still holds.
*/
shift = 0;
while (!BN_is_bit_set(A, shift)) { /* note that 0 < A */
shift++;
if (BN_is_odd(Y)) {
if (!BN_uadd(Y, Y, n))
goto err;
}
/* now Y is even */
if (!BN_rshift1(Y, Y))
goto err;
}
if (shift > 0) {
if (!BN_rshift(A, A, shift))
goto err;
}
/*-
* We still have (1) and (2).
* Both A and B are odd.
* The following computations ensure that
*
* 0 <= B < |n|,
* 0 < A < |n|,
* (1) -sign*X*a == B (mod |n|),
* (2) sign*Y*a == A (mod |n|),
*
* and that either A or B is even in the next iteration.
*/
if (BN_ucmp(B, A) >= 0) {
/* -sign*(X + Y)*a == B - A (mod |n|) */
if (!BN_uadd(X, X, Y))
goto err;
/*
* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
* actually makes the algorithm slower
*/
if (!BN_usub(B, B, A))
goto err;
} else {
/* sign*(X + Y)*a == A - B (mod |n|) */
if (!BN_uadd(Y, Y, X))
goto err;
/*
* as above, BN_mod_add_quick(Y, Y, X, n) would slow things down
*/
if (!BN_usub(A, A, B))
goto err;
}
}
} else {
/* general inversion algorithm */
while (!BN_is_zero(B)) {
BIGNUM *tmp;
/*-
* 0 < B < A,
* (*) -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|)
*/
/* (D, M) := (A/B, A%B) ... */
if (BN_num_bits(A) == BN_num_bits(B)) {
if (!BN_one(D))
goto err;
if (!BN_sub(M, A, B))
goto err;
} else if (BN_num_bits(A) == BN_num_bits(B) + 1) {
/* A/B is 1, 2, or 3 */
if (!BN_lshift1(T, B))
goto err;
if (BN_ucmp(A, T) < 0) {
/* A < 2*B, so D=1 */
if (!BN_one(D))
goto err;
if (!BN_sub(M, A, B))
goto err;
} else {
/* A >= 2*B, so D=2 or D=3 */
if (!BN_sub(M, A, T))
goto err;
if (!BN_add(D, T, B))
goto err; /* use D (:= 3*B) as temp */
if (BN_ucmp(A, D) < 0) {
/* A < 3*B, so D=2 */
if (!BN_set_word(D, 2))
goto err;
/*
* M (= A - 2*B) already has the correct value
*/
} else {
/* only D=3 remains */
if (!BN_set_word(D, 3))
goto err;
/*
* currently M = A - 2*B, but we need M = A - 3*B
*/
if (!BN_sub(M, M, B))
goto err;
}
}
} else {
if (!BN_div(D, M, A, B, ctx))
goto err;
}
/*-
* Now
* A = D*B + M;
* thus we have
* (**) sign*Y*a == D*B + M (mod |n|).
*/
tmp = A; /* keep the BIGNUM object, the value does not matter */
/* (A, B) := (B, A mod B) ... */
A = B;
B = M;
/* ... so we have 0 <= B < A again */
/*-
* Since the former M is now B and the former B is now A,
* (**) translates into
* sign*Y*a == D*A + B (mod |n|),
* i.e.
* sign*Y*a - D*A == B (mod |n|).
* Similarly, (*) translates into
* -sign*X*a == A (mod |n|).
*
* Thus,
* sign*Y*a + D*sign*X*a == B (mod |n|),
* i.e.
* sign*(Y + D*X)*a == B (mod |n|).
*
* So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
* Note that X and Y stay non-negative all the time.
*/
/*
* most of the time D is very small, so we can optimize tmp := D*X+Y
*/
if (BN_is_one(D)) {
if (!BN_add(tmp, X, Y))
goto err;
} else {
if (BN_is_word(D, 2)) {
if (!BN_lshift1(tmp, X))
goto err;
} else if (BN_is_word(D, 4)) {
if (!BN_lshift(tmp, X, 2))
goto err;
} else if (D->top == 1) {
if (!BN_copy(tmp, X))
goto err;
if (!BN_mul_word(tmp, D->d[0]))
goto err;
} else {
if (!BN_mul(tmp, D, X, ctx))
goto err;
}
if (!BN_add(tmp, tmp, Y))
goto err;
}
M = Y; /* keep the BIGNUM object, the value does not matter */
Y = X;
X = tmp;
sign = -sign;
}
}
/*-
* The while loop (Euclid's algorithm) ends when
* A == gcd(a,n);
* we have
* sign*Y*a == A (mod |n|),
* where Y is non-negative.
*/
if (sign < 0) {
if (!BN_sub(Y, n, Y))
goto err;
}
/* Now Y*a == A (mod |n|). */
if (BN_is_one(A)) {
/* Y*a == 1 (mod |n|) */
if (!Y->neg && BN_ucmp(Y, n) < 0) {
if (!BN_copy(R, Y))
goto err;
} else {
if (!BN_nnmod(R, Y, n, ctx))
goto err;
}
} else {
if (pnoinv)
*pnoinv = 1;
goto err;
}
ret = R;
err:
if ((ret == NULL) && (in == NULL))
BN_free(R);
BN_CTX_end(ctx);
bn_check_top(ret);
return ret;
}
/*
* BN_mod_inverse_no_branch is a special version of BN_mod_inverse. It does
* not contain branches that may leak sensitive information.
*/
static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n,
BN_CTX *ctx)
{
BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
BIGNUM *ret = NULL;
int sign;
bn_check_top(a);
bn_check_top(n);
BN_CTX_start(ctx);
A = BN_CTX_get(ctx);
B = BN_CTX_get(ctx);
X = BN_CTX_get(ctx);
D = BN_CTX_get(ctx);
M = BN_CTX_get(ctx);
Y = BN_CTX_get(ctx);
T = BN_CTX_get(ctx);
if (T == NULL)
goto err;
if (in == NULL)
R = BN_new();
else
R = in;
if (R == NULL)
goto err;
BN_one(X);
BN_zero(Y);
if (BN_copy(B, a) == NULL)
goto err;
if (BN_copy(A, n) == NULL)
goto err;
A->neg = 0;
if (B->neg || (BN_ucmp(B, A) >= 0)) {
/*
* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
* BN_div_no_branch will be called eventually.
*/
{
BIGNUM local_B;
bn_init(&local_B);
BN_with_flags(&local_B, B, BN_FLG_CONSTTIME);
if (!BN_nnmod(B, &local_B, A, ctx))
goto err;
/* Ensure local_B goes out of scope before any further use of B */
}
}
sign = -1;
/*-
* From B = a mod |n|, A = |n| it follows that
*
* 0 <= B < A,
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
*/
while (!BN_is_zero(B)) {
BIGNUM *tmp;
/*-
* 0 < B < A,
* (*) -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|)
*/
/*
* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
* BN_div_no_branch will be called eventually.
*/
{
BIGNUM local_A;
bn_init(&local_A);
BN_with_flags(&local_A, A, BN_FLG_CONSTTIME);
/* (D, M) := (A/B, A%B) ... */
if (!BN_div(D, M, &local_A, B, ctx))
goto err;
/* Ensure local_A goes out of scope before any further use of A */
}
/*-
* Now
* A = D*B + M;
* thus we have
* (**) sign*Y*a == D*B + M (mod |n|).
*/
tmp = A; /* keep the BIGNUM object, the value does not
* matter */
/* (A, B) := (B, A mod B) ... */
A = B;
B = M;
/* ... so we have 0 <= B < A again */
/*-
* Since the former M is now B and the former B is now A,
* (**) translates into
* sign*Y*a == D*A + B (mod |n|),
* i.e.
* sign*Y*a - D*A == B (mod |n|).
* Similarly, (*) translates into
* -sign*X*a == A (mod |n|).
*
* Thus,
* sign*Y*a + D*sign*X*a == B (mod |n|),
* i.e.
* sign*(Y + D*X)*a == B (mod |n|).
*
* So if we set (X, Y, sign) := (Y + D*X, X, -sign), we arrive back at
* -sign*X*a == B (mod |n|),
* sign*Y*a == A (mod |n|).
* Note that X and Y stay non-negative all the time.
*/
if (!BN_mul(tmp, D, X, ctx))
goto err;
if (!BN_add(tmp, tmp, Y))
goto err;
M = Y; /* keep the BIGNUM object, the value does not
* matter */
Y = X;
X = tmp;
sign = -sign;
}
/*-
* The while loop (Euclid's algorithm) ends when
* A == gcd(a,n);
* we have
* sign*Y*a == A (mod |n|),
* where Y is non-negative.
*/
if (sign < 0) {
if (!BN_sub(Y, n, Y))
goto err;
}
/* Now Y*a == A (mod |n|). */
if (BN_is_one(A)) {
/* Y*a == 1 (mod |n|) */
if (!Y->neg && BN_ucmp(Y, n) < 0) {
if (!BN_copy(R, Y))
goto err;
} else {
if (!BN_nnmod(R, Y, n, ctx))
goto err;
}
} else {
BNerr(BN_F_BN_MOD_INVERSE_NO_BRANCH, BN_R_NO_INVERSE);
goto err;
}
ret = R;
err:
if ((ret == NULL) && (in == NULL))
BN_free(R);
BN_CTX_end(ctx);
bn_check_top(ret);
return ret;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,199 @@
/*
* Copyright 2014-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
/*
* Determine the modified width-(w+1) Non-Adjacent Form (wNAF) of 'scalar'.
* This is an array r[] of values that are either zero or odd with an
* absolute value less than 2^w satisfying
* scalar = \sum_j r[j]*2^j
* where at most one of any w+1 consecutive digits is non-zero
* with the exception that the most significant digit may be only
* w-1 zeros away from that next non-zero digit.
*/
signed char *bn_compute_wNAF(const BIGNUM *scalar, int w, size_t *ret_len)
{
int window_val;
signed char *r = NULL;
int sign = 1;
int bit, next_bit, mask;
size_t len = 0, j;
if (BN_is_zero(scalar)) {
r = OPENSSL_malloc(1);
if (r == NULL) {
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_MALLOC_FAILURE);
goto err;
}
r[0] = 0;
*ret_len = 1;
return r;
}
if (w <= 0 || w > 7) { /* 'signed char' can represent integers with
* absolute values less than 2^7 */
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_INTERNAL_ERROR);
goto err;
}
bit = 1 << w; /* at most 128 */
next_bit = bit << 1; /* at most 256 */
mask = next_bit - 1; /* at most 255 */
if (BN_is_negative(scalar)) {
sign = -1;
}
if (scalar->d == NULL || scalar->top == 0) {
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_INTERNAL_ERROR);
goto err;
}
len = BN_num_bits(scalar);
r = OPENSSL_malloc(len + 1); /*
* Modified wNAF may be one digit longer than binary representation
* (*ret_len will be set to the actual length, i.e. at most
* BN_num_bits(scalar) + 1)
*/
if (r == NULL) {
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_MALLOC_FAILURE);
goto err;
}
window_val = scalar->d[0] & mask;
j = 0;
while ((window_val != 0) || (j + w + 1 < len)) { /* if j+w+1 >= len,
* window_val will not
* increase */
int digit = 0;
/* 0 <= window_val <= 2^(w+1) */
if (window_val & 1) {
/* 0 < window_val < 2^(w+1) */
if (window_val & bit) {
digit = window_val - next_bit; /* -2^w < digit < 0 */
#if 1 /* modified wNAF */
if (j + w + 1 >= len) {
/*
* Special case for generating modified wNAFs:
* no new bits will be added into window_val,
* so using a positive digit here will decrease
* the total length of the representation
*/
digit = window_val & (mask >> 1); /* 0 < digit < 2^w */
}
#endif
} else {
digit = window_val; /* 0 < digit < 2^w */
}
if (digit <= -bit || digit >= bit || !(digit & 1)) {
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_INTERNAL_ERROR);
goto err;
}
window_val -= digit;
/*
* now window_val is 0 or 2^(w+1) in standard wNAF generation;
* for modified window NAFs, it may also be 2^w
*/
if (window_val != 0 && window_val != next_bit
&& window_val != bit) {
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_INTERNAL_ERROR);
goto err;
}
}
r[j++] = sign * digit;
window_val >>= 1;
window_val += bit * BN_is_bit_set(scalar, j + w);
if (window_val > next_bit) {
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_INTERNAL_ERROR);
goto err;
}
}
if (j > len + 1) {
BNerr(BN_F_BN_COMPUTE_WNAF, ERR_R_INTERNAL_ERROR);
goto err;
}
*ret_len = j;
return r;
err:
OPENSSL_free(r);
return NULL;
}
int bn_get_top(const BIGNUM *a)
{
return a->top;
}
int bn_get_dmax(const BIGNUM *a)
{
return a->dmax;
}
void bn_set_all_zero(BIGNUM *a)
{
int i;
for (i = a->top; i < a->dmax; i++)
a->d[i] = 0;
}
int bn_copy_words(BN_ULONG *out, const BIGNUM *in, int size)
{
if (in->top > size)
return 0;
memset(out, 0, sizeof(*out) * size);
if (in->d != NULL)
memcpy(out, in->d, sizeof(*out) * in->top);
return 1;
}
BN_ULONG *bn_get_words(const BIGNUM *a)
{
return a->d;
}
void bn_set_static_words(BIGNUM *a, const BN_ULONG *words, int size)
{
/*
* |const| qualifier omission is compensated by BN_FLG_STATIC_DATA
* flag, which effectively means "read-only data".
*/
a->d = (BN_ULONG *)words;
a->dmax = a->top = size;
a->neg = 0;
a->flags |= BN_FLG_STATIC_DATA;
bn_correct_top(a);
}
int bn_set_words(BIGNUM *a, const BN_ULONG *words, int num_words)
{
if (bn_wexpand(a, num_words) == NULL) {
BNerr(BN_F_BN_SET_WORDS, ERR_R_MALLOC_FAILURE);
return 0;
}
memcpy(a->d, words, sizeof(BN_ULONG) * num_words);
a->top = num_words;
bn_correct_top(a);
return 1;
}

View file

@ -0,0 +1,140 @@
/*
* Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
/* least significant word */
#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
/* Returns -2 for errors because both -1 and 0 are valid results. */
int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
int i;
int ret = -2; /* avoid 'uninitialized' warning */
int err = 0;
BIGNUM *A, *B, *tmp;
/*-
* In 'tab', only odd-indexed entries are relevant:
* For any odd BIGNUM n,
* tab[BN_lsw(n) & 7]
* is $(-1)^{(n^2-1)/8}$ (using TeX notation).
* Note that the sign of n does not matter.
*/
static const int tab[8] = { 0, 1, 0, -1, 0, -1, 0, 1 };
bn_check_top(a);
bn_check_top(b);
BN_CTX_start(ctx);
A = BN_CTX_get(ctx);
B = BN_CTX_get(ctx);
if (B == NULL)
goto end;
err = !BN_copy(A, a);
if (err)
goto end;
err = !BN_copy(B, b);
if (err)
goto end;
/*
* Kronecker symbol, implemented according to Henri Cohen,
* "A Course in Computational Algebraic Number Theory"
* (algorithm 1.4.10).
*/
/* Cohen's step 1: */
if (BN_is_zero(B)) {
ret = BN_abs_is_word(A, 1);
goto end;
}
/* Cohen's step 2: */
if (!BN_is_odd(A) && !BN_is_odd(B)) {
ret = 0;
goto end;
}
/* now B is non-zero */
i = 0;
while (!BN_is_bit_set(B, i))
i++;
err = !BN_rshift(B, B, i);
if (err)
goto end;
if (i & 1) {
/* i is odd */
/* (thus B was even, thus A must be odd!) */
/* set 'ret' to $(-1)^{(A^2-1)/8}$ */
ret = tab[BN_lsw(A) & 7];
} else {
/* i is even */
ret = 1;
}
if (B->neg) {
B->neg = 0;
if (A->neg)
ret = -ret;
}
/*
* now B is positive and odd, so what remains to be done is to compute
* the Jacobi symbol (A/B) and multiply it by 'ret'
*/
while (1) {
/* Cohen's step 3: */
/* B is positive and odd */
if (BN_is_zero(A)) {
ret = BN_is_one(B) ? ret : 0;
goto end;
}
/* now A is non-zero */
i = 0;
while (!BN_is_bit_set(A, i))
i++;
err = !BN_rshift(A, A, i);
if (err)
goto end;
if (i & 1) {
/* i is odd */
/* multiply 'ret' by $(-1)^{(B^2-1)/8}$ */
ret = ret * tab[BN_lsw(B) & 7];
}
/* Cohen's step 4: */
/* multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$ */
if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2)
ret = -ret;
/* (A, B) := (B mod |A|, |A|) */
err = !BN_nnmod(B, B, A, ctx);
if (err)
goto end;
tmp = A;
A = B;
B = tmp;
tmp->neg = 0;
}
end:
BN_CTX_end(ctx);
if (err)
return -2;
else
return ret;
}

View file

@ -0,0 +1,671 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#ifndef HEADER_BN_LCL_H
# define HEADER_BN_LCL_H
/*
* The EDK2 build doesn't use bn_conf.h; it sets THIRTY_TWO_BIT or
* SIXTY_FOUR_BIT in its own environment since it doesn't re-run our
* Configure script and needs to support both 32-bit and 64-bit.
*/
# include <openssl/opensslconf.h>
# if !defined(OPENSSL_SYS_UEFI)
# include "internal/bn_conf.h"
# endif
# include "internal/bn_int.h"
/*
* These preprocessor symbols control various aspects of the bignum headers
* and library code. They're not defined by any "normal" configuration, as
* they are intended for development and testing purposes. NB: defining all
* three can be useful for debugging application code as well as openssl
* itself. BN_DEBUG - turn on various debugging alterations to the bignum
* code BN_DEBUG_RAND - uses random poisoning of unused words to trip up
* mismanagement of bignum internals. You must also define BN_DEBUG.
*/
/* #define BN_DEBUG */
/* #define BN_DEBUG_RAND */
# ifndef OPENSSL_SMALL_FOOTPRINT
# define BN_MUL_COMBA
# define BN_SQR_COMBA
# define BN_RECURSION
# endif
/*
* This next option uses the C libraries (2 word)/(1 word) function. If it is
* not defined, I use my C version (which is slower). The reason for this
* flag is that when the particular C compiler library routine is used, and
* the library is linked with a different compiler, the library is missing.
* This mostly happens when the library is built with gcc and then linked
* using normal cc. This would be a common occurrence because gcc normally
* produces code that is 2 times faster than system compilers for the big
* number stuff. For machines with only one compiler (or shared libraries),
* this should be on. Again this in only really a problem on machines using
* "long long's", are 32bit, and are not using my assembler code.
*/
# if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS) || \
defined(OPENSSL_SYS_WIN32) || defined(linux)
# define BN_DIV2W
# endif
/*
* 64-bit processor with LP64 ABI
*/
# ifdef SIXTY_FOUR_BIT_LONG
# define BN_ULLONG unsigned long long
# define BN_BITS4 32
# define BN_MASK2 (0xffffffffffffffffL)
# define BN_MASK2l (0xffffffffL)
# define BN_MASK2h (0xffffffff00000000L)
# define BN_MASK2h1 (0xffffffff80000000L)
# define BN_DEC_CONV (10000000000000000000UL)
# define BN_DEC_NUM 19
# define BN_DEC_FMT1 "%lu"
# define BN_DEC_FMT2 "%019lu"
# endif
/*
* 64-bit processor other than LP64 ABI
*/
# ifdef SIXTY_FOUR_BIT
# undef BN_LLONG
# undef BN_ULLONG
# define BN_BITS4 32
# define BN_MASK2 (0xffffffffffffffffLL)
# define BN_MASK2l (0xffffffffL)
# define BN_MASK2h (0xffffffff00000000LL)
# define BN_MASK2h1 (0xffffffff80000000LL)
# define BN_DEC_CONV (10000000000000000000ULL)
# define BN_DEC_NUM 19
# define BN_DEC_FMT1 "%llu"
# define BN_DEC_FMT2 "%019llu"
# endif
# ifdef THIRTY_TWO_BIT
# ifdef BN_LLONG
# if defined(_WIN32) && !defined(__GNUC__)
# define BN_ULLONG unsigned __int64
# else
# define BN_ULLONG unsigned long long
# endif
# endif
# define BN_BITS4 16
# define BN_MASK2 (0xffffffffL)
# define BN_MASK2l (0xffff)
# define BN_MASK2h1 (0xffff8000L)
# define BN_MASK2h (0xffff0000L)
# define BN_DEC_CONV (1000000000L)
# define BN_DEC_NUM 9
# define BN_DEC_FMT1 "%u"
# define BN_DEC_FMT2 "%09u"
# endif
/*-
* Bignum consistency macros
* There is one "API" macro, bn_fix_top(), for stripping leading zeroes from
* bignum data after direct manipulations on the data. There is also an
* "internal" macro, bn_check_top(), for verifying that there are no leading
* zeroes. Unfortunately, some auditing is required due to the fact that
* bn_fix_top() has become an overabused duct-tape because bignum data is
* occasionally passed around in an inconsistent state. So the following
* changes have been made to sort this out;
* - bn_fix_top()s implementation has been moved to bn_correct_top()
* - if BN_DEBUG isn't defined, bn_fix_top() maps to bn_correct_top(), and
* bn_check_top() is as before.
* - if BN_DEBUG *is* defined;
* - bn_check_top() tries to pollute unused words even if the bignum 'top' is
* consistent. (ed: only if BN_DEBUG_RAND is defined)
* - bn_fix_top() maps to bn_check_top() rather than "fixing" anything.
* The idea is to have debug builds flag up inconsistent bignums when they
* occur. If that occurs in a bn_fix_top(), we examine the code in question; if
* the use of bn_fix_top() was appropriate (ie. it follows directly after code
* that manipulates the bignum) it is converted to bn_correct_top(), and if it
* was not appropriate, we convert it permanently to bn_check_top() and track
* down the cause of the bug. Eventually, no internal code should be using the
* bn_fix_top() macro. External applications and libraries should try this with
* their own code too, both in terms of building against the openssl headers
* with BN_DEBUG defined *and* linking with a version of OpenSSL built with it
* defined. This not only improves external code, it provides more test
* coverage for openssl's own code.
*/
# ifdef BN_DEBUG
/*
* The new BN_FLG_FIXED_TOP flag marks vectors that were not treated with
* bn_correct_top, in other words such vectors are permitted to have zeros
* in most significant limbs. Such vectors are used internally to achieve
* execution time invariance for critical operations with private keys.
* It's BN_DEBUG-only flag, because user application is not supposed to
* observe it anyway. Moreover, optimizing compiler would actually remove
* all operations manipulating the bit in question in non-BN_DEBUG build.
*/
# define BN_FLG_FIXED_TOP 0x10000
# ifdef BN_DEBUG_RAND
# define bn_pollute(a) \
do { \
const BIGNUM *_bnum1 = (a); \
if (_bnum1->top < _bnum1->dmax) { \
unsigned char _tmp_char; \
/* We cast away const without the compiler knowing, any \
* *genuinely* constant variables that aren't mutable \
* wouldn't be constructed with top!=dmax. */ \
BN_ULONG *_not_const; \
memcpy(&_not_const, &_bnum1->d, sizeof(_not_const)); \
RAND_bytes(&_tmp_char, 1); /* Debug only - safe to ignore error return */\
memset(_not_const + _bnum1->top, _tmp_char, \
sizeof(*_not_const) * (_bnum1->dmax - _bnum1->top)); \
} \
} while(0)
# else
# define bn_pollute(a)
# endif
# define bn_check_top(a) \
do { \
const BIGNUM *_bnum2 = (a); \
if (_bnum2 != NULL) { \
int _top = _bnum2->top; \
(void)ossl_assert((_top == 0 && !_bnum2->neg) || \
(_top && ((_bnum2->flags & BN_FLG_FIXED_TOP) \
|| _bnum2->d[_top - 1] != 0))); \
bn_pollute(_bnum2); \
} \
} while(0)
# define bn_fix_top(a) bn_check_top(a)
# define bn_check_size(bn, bits) bn_wcheck_size(bn, ((bits+BN_BITS2-1))/BN_BITS2)
# define bn_wcheck_size(bn, words) \
do { \
const BIGNUM *_bnum2 = (bn); \
assert((words) <= (_bnum2)->dmax && \
(words) >= (_bnum2)->top); \
/* avoid unused variable warning with NDEBUG */ \
(void)(_bnum2); \
} while(0)
# else /* !BN_DEBUG */
# define BN_FLG_FIXED_TOP 0
# define bn_pollute(a)
# define bn_check_top(a)
# define bn_fix_top(a) bn_correct_top(a)
# define bn_check_size(bn, bits)
# define bn_wcheck_size(bn, words)
# endif
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
BN_ULONG w);
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int num);
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int num);
struct bignum_st {
BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit
* chunks. */
int top; /* Index of last used d +1. */
/* The next are internal book keeping for bn_expand. */
int dmax; /* Size of the d array. */
int neg; /* one if the number is negative */
int flags;
};
/* Used for montgomery multiplication */
struct bn_mont_ctx_st {
int ri; /* number of bits in R */
BIGNUM RR; /* used to convert to montgomery form,
possibly zero-padded */
BIGNUM N; /* The modulus */
BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1 (Ni is only
* stored for bignum algorithm) */
BN_ULONG n0[2]; /* least significant word(s) of Ni; (type
* changed with 0.9.9, was "BN_ULONG n0;"
* before) */
int flags;
};
/*
* Used for reciprocal division/mod functions It cannot be shared between
* threads
*/
struct bn_recp_ctx_st {
BIGNUM N; /* the divisor */
BIGNUM Nr; /* the reciprocal */
int num_bits;
int shift;
int flags;
};
/* Used for slow "generation" functions. */
struct bn_gencb_st {
unsigned int ver; /* To handle binary (in)compatibility */
void *arg; /* callback-specific data */
union {
/* if (ver==1) - handles old style callbacks */
void (*cb_1) (int, int, void *);
/* if (ver==2) - new callback style */
int (*cb_2) (int, int, BN_GENCB *);
} cb;
};
/*-
* BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
*
*
* For window size 'w' (w >= 2) and a random 'b' bits exponent,
* the number of multiplications is a constant plus on average
*
* 2^(w-1) + (b-w)/(w+1);
*
* here 2^(w-1) is for precomputing the table (we actually need
* entries only for windows that have the lowest bit set), and
* (b-w)/(w+1) is an approximation for the expected number of
* w-bit windows, not counting the first one.
*
* Thus we should use
*
* w >= 6 if b > 671
* w = 5 if 671 > b > 239
* w = 4 if 239 > b > 79
* w = 3 if 79 > b > 23
* w <= 2 if 23 > b
*
* (with draws in between). Very small exponents are often selected
* with low Hamming weight, so we use w = 1 for b <= 23.
*/
# define BN_window_bits_for_exponent_size(b) \
((b) > 671 ? 6 : \
(b) > 239 ? 5 : \
(b) > 79 ? 4 : \
(b) > 23 ? 3 : 1)
/*
* BN_mod_exp_mont_conttime is based on the assumption that the L1 data cache
* line width of the target processor is at least the following value.
*/
# define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH ( 64 )
# define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
/*
* Window sizes optimized for fixed window size modular exponentiation
* algorithm (BN_mod_exp_mont_consttime). To achieve the security goals of
* BN_mode_exp_mont_consttime, the maximum size of the window must not exceed
* log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH). Window size thresholds are
* defined for cache line sizes of 32 and 64, cache line sizes where
* log_2(32)=5 and log_2(64)=6 respectively. A window size of 7 should only be
* used on processors that have a 128 byte or greater cache line size.
*/
# if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
# define BN_window_bits_for_ctime_exponent_size(b) \
((b) > 937 ? 6 : \
(b) > 306 ? 5 : \
(b) > 89 ? 4 : \
(b) > 22 ? 3 : 1)
# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
# elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
# define BN_window_bits_for_ctime_exponent_size(b) \
((b) > 306 ? 5 : \
(b) > 89 ? 4 : \
(b) > 22 ? 3 : 1)
# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
# endif
/* Pentium pro 16,16,16,32,64 */
/* Alpha 16,16,16,16.64 */
# define BN_MULL_SIZE_NORMAL (16)/* 32 */
# define BN_MUL_RECURSIVE_SIZE_NORMAL (16)/* 32 less than */
# define BN_SQR_RECURSIVE_SIZE_NORMAL (16)/* 32 */
# define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32)/* 32 */
# define BN_MONT_CTX_SET_SIZE_WORD (64)/* 32 */
/*
* 2011-02-22 SMS. In various places, a size_t variable or a type cast to
* size_t was used to perform integer-only operations on pointers. This
* failed on VMS with 64-bit pointers (CC /POINTER_SIZE = 64) because size_t
* is still only 32 bits. What's needed in these cases is an integer type
* with the same size as a pointer, which size_t is not certain to be. The
* only fix here is VMS-specific.
*/
# if defined(OPENSSL_SYS_VMS)
# if __INITIAL_POINTER_SIZE == 64
# define PTR_SIZE_INT long long
# else /* __INITIAL_POINTER_SIZE == 64 */
# define PTR_SIZE_INT int
# endif /* __INITIAL_POINTER_SIZE == 64 [else] */
# elif !defined(PTR_SIZE_INT) /* defined(OPENSSL_SYS_VMS) */
# define PTR_SIZE_INT size_t
# endif /* defined(OPENSSL_SYS_VMS) [else] */
# if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
/*
* BN_UMULT_HIGH section.
* If the compiler doesn't support 2*N integer type, then you have to
* replace every N*N multiplication with 4 (N/2)*(N/2) accompanied by some
* shifts and additions which unavoidably results in severe performance
* penalties. Of course provided that the hardware is capable of producing
* 2*N result... That's when you normally start considering assembler
* implementation. However! It should be pointed out that some CPUs (e.g.,
* PowerPC, Alpha, and IA-64) provide *separate* instruction calculating
* the upper half of the product placing the result into a general
* purpose register. Now *if* the compiler supports inline assembler,
* then it's not impossible to implement the "bignum" routines (and have
* the compiler optimize 'em) exhibiting "native" performance in C. That's
* what BN_UMULT_HIGH macro is about:-) Note that more recent compilers do
* support 2*64 integer type, which is also used here.
*/
# if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16 && \
(defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
# define BN_UMULT_HIGH(a,b) (((__uint128_t)(a)*(b))>>64)
# define BN_UMULT_LOHI(low,high,a,b) ({ \
__uint128_t ret=(__uint128_t)(a)*(b); \
(high)=ret>>64; (low)=ret; })
# elif defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
# elif defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC64) && defined(SIXTY_FOUR_BIT_LONG)
# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif /* compiler */
# elif (defined(__x86_64) || defined(__x86_64__)) && \
(defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret,discard; \
asm ("mulq %3" \
: "=a"(discard),"=d"(ret) \
: "a"(a), "g"(b) \
: "cc"); \
ret; })
# define BN_UMULT_LOHI(low,high,a,b) \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(a),"g"(b) \
: "cc");
# endif
# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
# if defined(_MSC_VER) && _MSC_VER>=1400
unsigned __int64 __umulh(unsigned __int64 a, unsigned __int64 b);
unsigned __int64 _umul128(unsigned __int64 a, unsigned __int64 b,
unsigned __int64 *h);
# pragma intrinsic(__umulh,_umul128)
# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
# endif
# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("dmultu %1,%2" \
: "=h"(ret) \
: "r"(a), "r"(b) : "l"); \
ret; })
# define BN_UMULT_LOHI(low,high,a,b) \
asm ("dmultu %2,%3" \
: "=l"(low),"=h"(high) \
: "r"(a), "r"(b));
# endif
# elif defined(__aarch64__) && defined(SIXTY_FOUR_BIT_LONG)
# if defined(__GNUC__) && __GNUC__>=2
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %0,%1,%2" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif
# endif /* cpu */
# endif /* OPENSSL_NO_ASM */
# ifdef BN_DEBUG_RAND
# define bn_clear_top2max(a) \
{ \
int ind = (a)->dmax - (a)->top; \
BN_ULONG *ftl = &(a)->d[(a)->top-1]; \
for (; ind != 0; ind--) \
*(++ftl) = 0x0; \
}
# else
# define bn_clear_top2max(a)
# endif
# ifdef BN_LLONG
/*******************************************************************
* Using the long long type, has to be twice as wide as BN_ULONG...
*/
# define Lw(t) (((BN_ULONG)(t))&BN_MASK2)
# define Hw(t) (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
# define mul_add(r,a,w,c) { \
BN_ULLONG t; \
t=(BN_ULLONG)w * (a) + (r) + (c); \
(r)= Lw(t); \
(c)= Hw(t); \
}
# define mul(r,a,w,c) { \
BN_ULLONG t; \
t=(BN_ULLONG)w * (a) + (c); \
(r)= Lw(t); \
(c)= Hw(t); \
}
# define sqr(r0,r1,a) { \
BN_ULLONG t; \
t=(BN_ULLONG)(a)*(a); \
(r0)=Lw(t); \
(r1)=Hw(t); \
}
# elif defined(BN_UMULT_LOHI)
# define mul_add(r,a,w,c) { \
BN_ULONG high,low,ret,tmp=(a); \
ret = (r); \
BN_UMULT_LOHI(low,high,w,tmp); \
ret += (c); \
(c) = (ret<(c))?1:0; \
(c) += high; \
ret += low; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define mul(r,a,w,c) { \
BN_ULONG high,low,ret,ta=(a); \
BN_UMULT_LOHI(low,high,w,ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define sqr(r0,r1,a) { \
BN_ULONG tmp=(a); \
BN_UMULT_LOHI(r0,r1,tmp,tmp); \
}
# elif defined(BN_UMULT_HIGH)
# define mul_add(r,a,w,c) { \
BN_ULONG high,low,ret,tmp=(a); \
ret = (r); \
high= BN_UMULT_HIGH(w,tmp); \
ret += (c); \
low = (w) * tmp; \
(c) = (ret<(c))?1:0; \
(c) += high; \
ret += low; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define mul(r,a,w,c) { \
BN_ULONG high,low,ret,ta=(a); \
low = (w) * ta; \
high= BN_UMULT_HIGH(w,ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
# define sqr(r0,r1,a) { \
BN_ULONG tmp=(a); \
(r0) = tmp * tmp; \
(r1) = BN_UMULT_HIGH(tmp,tmp); \
}
# else
/*************************************************************
* No long long type
*/
# define LBITS(a) ((a)&BN_MASK2l)
# define HBITS(a) (((a)>>BN_BITS4)&BN_MASK2l)
# define L2HBITS(a) (((a)<<BN_BITS4)&BN_MASK2)
# define LLBITS(a) ((a)&BN_MASKl)
# define LHBITS(a) (((a)>>BN_BITS2)&BN_MASKl)
# define LL2HBITS(a) ((BN_ULLONG)((a)&BN_MASKl)<<BN_BITS2)
# define mul64(l,h,bl,bh) \
{ \
BN_ULONG m,m1,lt,ht; \
\
lt=l; \
ht=h; \
m =(bh)*(lt); \
lt=(bl)*(lt); \
m1=(bl)*(ht); \
ht =(bh)*(ht); \
m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS((BN_ULONG)1); \
ht+=HBITS(m); \
m1=L2HBITS(m); \
lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \
(l)=lt; \
(h)=ht; \
}
# define sqr64(lo,ho,in) \
{ \
BN_ULONG l,h,m; \
\
h=(in); \
l=LBITS(h); \
h=HBITS(h); \
m =(l)*(h); \
l*=l; \
h*=h; \
h+=(m&BN_MASK2h1)>>(BN_BITS4-1); \
m =(m&BN_MASK2l)<<(BN_BITS4+1); \
l=(l+m)&BN_MASK2; if (l < m) h++; \
(lo)=l; \
(ho)=h; \
}
# define mul_add(r,a,bl,bh,c) { \
BN_ULONG l,h; \
\
h= (a); \
l=LBITS(h); \
h=HBITS(h); \
mul64(l,h,(bl),(bh)); \
\
/* non-multiply part */ \
l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
(c)=(r); \
l=(l+(c))&BN_MASK2; if (l < (c)) h++; \
(c)=h&BN_MASK2; \
(r)=l; \
}
# define mul(r,a,bl,bh,c) { \
BN_ULONG l,h; \
\
h= (a); \
l=LBITS(h); \
h=HBITS(h); \
mul64(l,h,(bl),(bh)); \
\
/* non-multiply part */ \
l+=(c); if ((l&BN_MASK2) < (c)) h++; \
(c)=h&BN_MASK2; \
(r)=l&BN_MASK2; \
}
# endif /* !BN_LLONG */
void BN_RECP_CTX_init(BN_RECP_CTX *recp);
void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
void bn_init(BIGNUM *a);
void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb);
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp);
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
int dna, int dnb, BN_ULONG *t);
void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b,
int n, int tna, int tnb, BN_ULONG *t);
void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t);
void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n);
void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
BN_ULONG *t);
BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
int cl, int dl);
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
const BN_ULONG *np, const BN_ULONG *n0, int num);
BIGNUM *int_bn_mod_inverse(BIGNUM *in,
const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx,
int *noinv);
int bn_probable_prime_dh(BIGNUM *rnd, int bits,
const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx);
static ossl_inline BIGNUM *bn_expand(BIGNUM *a, int bits)
{
if (bits > (INT_MAX - BN_BITS2 + 1))
return NULL;
if (((bits+BN_BITS2-1)/BN_BITS2) <= (a)->dmax)
return a;
return bn_expand2((a),(bits+BN_BITS2-1)/BN_BITS2);
}
#endif

View file

@ -0,0 +1,964 @@
/*
* Copyright 1995-2019 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <assert.h>
#include <limits.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
#include <openssl/opensslconf.h>
#include "internal/constant_time_locl.h"
/* This stuff appears to be completely unused, so is deprecated */
#if OPENSSL_API_COMPAT < 0x00908000L
/*-
* For a 32 bit machine
* 2 - 4 == 128
* 3 - 8 == 256
* 4 - 16 == 512
* 5 - 32 == 1024
* 6 - 64 == 2048
* 7 - 128 == 4096
* 8 - 256 == 8192
*/
static int bn_limit_bits = 0;
static int bn_limit_num = 8; /* (1<<bn_limit_bits) */
static int bn_limit_bits_low = 0;
static int bn_limit_num_low = 8; /* (1<<bn_limit_bits_low) */
static int bn_limit_bits_high = 0;
static int bn_limit_num_high = 8; /* (1<<bn_limit_bits_high) */
static int bn_limit_bits_mont = 0;
static int bn_limit_num_mont = 8; /* (1<<bn_limit_bits_mont) */
void BN_set_params(int mult, int high, int low, int mont)
{
if (mult >= 0) {
if (mult > (int)(sizeof(int) * 8) - 1)
mult = sizeof(int) * 8 - 1;
bn_limit_bits = mult;
bn_limit_num = 1 << mult;
}
if (high >= 0) {
if (high > (int)(sizeof(int) * 8) - 1)
high = sizeof(int) * 8 - 1;
bn_limit_bits_high = high;
bn_limit_num_high = 1 << high;
}
if (low >= 0) {
if (low > (int)(sizeof(int) * 8) - 1)
low = sizeof(int) * 8 - 1;
bn_limit_bits_low = low;
bn_limit_num_low = 1 << low;
}
if (mont >= 0) {
if (mont > (int)(sizeof(int) * 8) - 1)
mont = sizeof(int) * 8 - 1;
bn_limit_bits_mont = mont;
bn_limit_num_mont = 1 << mont;
}
}
int BN_get_params(int which)
{
if (which == 0)
return bn_limit_bits;
else if (which == 1)
return bn_limit_bits_high;
else if (which == 2)
return bn_limit_bits_low;
else if (which == 3)
return bn_limit_bits_mont;
else
return 0;
}
#endif
const BIGNUM *BN_value_one(void)
{
static const BN_ULONG data_one = 1L;
static const BIGNUM const_one =
{ (BN_ULONG *)&data_one, 1, 1, 0, BN_FLG_STATIC_DATA };
return &const_one;
}
int BN_num_bits_word(BN_ULONG l)
{
BN_ULONG x, mask;
int bits = (l != 0);
#if BN_BITS2 > 32
x = l >> 32;
mask = (0 - x) & BN_MASK2;
mask = (0 - (mask >> (BN_BITS2 - 1)));
bits += 32 & mask;
l ^= (x ^ l) & mask;
#endif
x = l >> 16;
mask = (0 - x) & BN_MASK2;
mask = (0 - (mask >> (BN_BITS2 - 1)));
bits += 16 & mask;
l ^= (x ^ l) & mask;
x = l >> 8;
mask = (0 - x) & BN_MASK2;
mask = (0 - (mask >> (BN_BITS2 - 1)));
bits += 8 & mask;
l ^= (x ^ l) & mask;
x = l >> 4;
mask = (0 - x) & BN_MASK2;
mask = (0 - (mask >> (BN_BITS2 - 1)));
bits += 4 & mask;
l ^= (x ^ l) & mask;
x = l >> 2;
mask = (0 - x) & BN_MASK2;
mask = (0 - (mask >> (BN_BITS2 - 1)));
bits += 2 & mask;
l ^= (x ^ l) & mask;
x = l >> 1;
mask = (0 - x) & BN_MASK2;
mask = (0 - (mask >> (BN_BITS2 - 1)));
bits += 1 & mask;
return bits;
}
int BN_num_bits(const BIGNUM *a)
{
int i = a->top - 1;
bn_check_top(a);
if (BN_is_zero(a))
return 0;
return ((i * BN_BITS2) + BN_num_bits_word(a->d[i]));
}
static void bn_free_d(BIGNUM *a)
{
if (BN_get_flags(a, BN_FLG_SECURE))
OPENSSL_secure_free(a->d);
else
OPENSSL_free(a->d);
}
void BN_clear_free(BIGNUM *a)
{
if (a == NULL)
return;
if (a->d != NULL && !BN_get_flags(a, BN_FLG_STATIC_DATA)) {
OPENSSL_cleanse(a->d, a->dmax * sizeof(a->d[0]));
bn_free_d(a);
}
if (BN_get_flags(a, BN_FLG_MALLOCED)) {
OPENSSL_cleanse(a, sizeof(*a));
OPENSSL_free(a);
}
}
void BN_free(BIGNUM *a)
{
if (a == NULL)
return;
if (!BN_get_flags(a, BN_FLG_STATIC_DATA))
bn_free_d(a);
if (a->flags & BN_FLG_MALLOCED)
OPENSSL_free(a);
}
void bn_init(BIGNUM *a)
{
static BIGNUM nilbn;
*a = nilbn;
bn_check_top(a);
}
BIGNUM *BN_new(void)
{
BIGNUM *ret;
if ((ret = OPENSSL_zalloc(sizeof(*ret))) == NULL) {
BNerr(BN_F_BN_NEW, ERR_R_MALLOC_FAILURE);
return NULL;
}
ret->flags = BN_FLG_MALLOCED;
bn_check_top(ret);
return ret;
}
BIGNUM *BN_secure_new(void)
{
BIGNUM *ret = BN_new();
if (ret != NULL)
ret->flags |= BN_FLG_SECURE;
return ret;
}
/* This is used by bn_expand2() */
/* The caller MUST check that words > b->dmax before calling this */
static BN_ULONG *bn_expand_internal(const BIGNUM *b, int words)
{
BN_ULONG *a = NULL;
if (words > (INT_MAX / (4 * BN_BITS2))) {
BNerr(BN_F_BN_EXPAND_INTERNAL, BN_R_BIGNUM_TOO_LONG);
return NULL;
}
if (BN_get_flags(b, BN_FLG_STATIC_DATA)) {
BNerr(BN_F_BN_EXPAND_INTERNAL, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
return NULL;
}
if (BN_get_flags(b, BN_FLG_SECURE))
a = OPENSSL_secure_zalloc(words * sizeof(*a));
else
a = OPENSSL_zalloc(words * sizeof(*a));
if (a == NULL) {
BNerr(BN_F_BN_EXPAND_INTERNAL, ERR_R_MALLOC_FAILURE);
return NULL;
}
assert(b->top <= words);
if (b->top > 0)
memcpy(a, b->d, sizeof(*a) * b->top);
return a;
}
/*
* This is an internal function that should not be used in applications. It
* ensures that 'b' has enough room for a 'words' word number and initialises
* any unused part of b->d with leading zeros. It is mostly used by the
* various BIGNUM routines. If there is an error, NULL is returned. If not,
* 'b' is returned.
*/
BIGNUM *bn_expand2(BIGNUM *b, int words)
{
if (words > b->dmax) {
BN_ULONG *a = bn_expand_internal(b, words);
if (!a)
return NULL;
if (b->d) {
OPENSSL_cleanse(b->d, b->dmax * sizeof(b->d[0]));
bn_free_d(b);
}
b->d = a;
b->dmax = words;
}
return b;
}
BIGNUM *BN_dup(const BIGNUM *a)
{
BIGNUM *t;
if (a == NULL)
return NULL;
bn_check_top(a);
t = BN_get_flags(a, BN_FLG_SECURE) ? BN_secure_new() : BN_new();
if (t == NULL)
return NULL;
if (!BN_copy(t, a)) {
BN_free(t);
return NULL;
}
bn_check_top(t);
return t;
}
BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b)
{
bn_check_top(b);
if (a == b)
return a;
if (bn_wexpand(a, b->top) == NULL)
return NULL;
if (b->top > 0)
memcpy(a->d, b->d, sizeof(b->d[0]) * b->top);
a->neg = b->neg;
a->top = b->top;
a->flags |= b->flags & BN_FLG_FIXED_TOP;
bn_check_top(a);
return a;
}
#define FLAGS_DATA(flags) ((flags) & (BN_FLG_STATIC_DATA \
| BN_FLG_CONSTTIME \
| BN_FLG_SECURE \
| BN_FLG_FIXED_TOP))
#define FLAGS_STRUCT(flags) ((flags) & (BN_FLG_MALLOCED))
void BN_swap(BIGNUM *a, BIGNUM *b)
{
int flags_old_a, flags_old_b;
BN_ULONG *tmp_d;
int tmp_top, tmp_dmax, tmp_neg;
bn_check_top(a);
bn_check_top(b);
flags_old_a = a->flags;
flags_old_b = b->flags;
tmp_d = a->d;
tmp_top = a->top;
tmp_dmax = a->dmax;
tmp_neg = a->neg;
a->d = b->d;
a->top = b->top;
a->dmax = b->dmax;
a->neg = b->neg;
b->d = tmp_d;
b->top = tmp_top;
b->dmax = tmp_dmax;
b->neg = tmp_neg;
a->flags = FLAGS_STRUCT(flags_old_a) | FLAGS_DATA(flags_old_b);
b->flags = FLAGS_STRUCT(flags_old_b) | FLAGS_DATA(flags_old_a);
bn_check_top(a);
bn_check_top(b);
}
void BN_clear(BIGNUM *a)
{
bn_check_top(a);
if (a->d != NULL)
OPENSSL_cleanse(a->d, sizeof(*a->d) * a->dmax);
a->neg = 0;
a->top = 0;
a->flags &= ~BN_FLG_FIXED_TOP;
}
BN_ULONG BN_get_word(const BIGNUM *a)
{
if (a->top > 1)
return BN_MASK2;
else if (a->top == 1)
return a->d[0];
/* a->top == 0 */
return 0;
}
int BN_set_word(BIGNUM *a, BN_ULONG w)
{
bn_check_top(a);
if (bn_expand(a, (int)sizeof(BN_ULONG) * 8) == NULL)
return 0;
a->neg = 0;
a->d[0] = w;
a->top = (w ? 1 : 0);
a->flags &= ~BN_FLG_FIXED_TOP;
bn_check_top(a);
return 1;
}
BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret)
{
unsigned int i, m;
unsigned int n;
BN_ULONG l;
BIGNUM *bn = NULL;
if (ret == NULL)
ret = bn = BN_new();
if (ret == NULL)
return NULL;
bn_check_top(ret);
/* Skip leading zero's. */
for ( ; len > 0 && *s == 0; s++, len--)
continue;
n = len;
if (n == 0) {
ret->top = 0;
return ret;
}
i = ((n - 1) / BN_BYTES) + 1;
m = ((n - 1) % (BN_BYTES));
if (bn_wexpand(ret, (int)i) == NULL) {
BN_free(bn);
return NULL;
}
ret->top = i;
ret->neg = 0;
l = 0;
while (n--) {
l = (l << 8L) | *(s++);
if (m-- == 0) {
ret->d[--i] = l;
l = 0;
m = BN_BYTES - 1;
}
}
/*
* need to call this due to clear byte at top if avoiding having the top
* bit set (-ve number)
*/
bn_correct_top(ret);
return ret;
}
/* ignore negative */
static int bn2binpad(const BIGNUM *a, unsigned char *to, int tolen)
{
int n;
size_t i, lasti, j, atop, mask;
BN_ULONG l;
/*
* In case |a| is fixed-top, BN_num_bytes can return bogus length,
* but it's assumed that fixed-top inputs ought to be "nominated"
* even for padded output, so it works out...
*/
n = BN_num_bytes(a);
if (tolen == -1) {
tolen = n;
} else if (tolen < n) { /* uncommon/unlike case */
BIGNUM temp = *a;
bn_correct_top(&temp);
n = BN_num_bytes(&temp);
if (tolen < n)
return -1;
}
/* Swipe through whole available data and don't give away padded zero. */
atop = a->dmax * BN_BYTES;
if (atop == 0) {
OPENSSL_cleanse(to, tolen);
return tolen;
}
lasti = atop - 1;
atop = a->top * BN_BYTES;
for (i = 0, j = 0, to += tolen; j < (size_t)tolen; j++) {
l = a->d[i / BN_BYTES];
mask = 0 - ((j - atop) >> (8 * sizeof(i) - 1));
*--to = (unsigned char)(l >> (8 * (i % BN_BYTES)) & mask);
i += (i - lasti) >> (8 * sizeof(i) - 1); /* stay on last limb */
}
return tolen;
}
int BN_bn2binpad(const BIGNUM *a, unsigned char *to, int tolen)
{
if (tolen < 0)
return -1;
return bn2binpad(a, to, tolen);
}
int BN_bn2bin(const BIGNUM *a, unsigned char *to)
{
return bn2binpad(a, to, -1);
}
BIGNUM *BN_lebin2bn(const unsigned char *s, int len, BIGNUM *ret)
{
unsigned int i, m;
unsigned int n;
BN_ULONG l;
BIGNUM *bn = NULL;
if (ret == NULL)
ret = bn = BN_new();
if (ret == NULL)
return NULL;
bn_check_top(ret);
s += len;
/* Skip trailing zeroes. */
for ( ; len > 0 && s[-1] == 0; s--, len--)
continue;
n = len;
if (n == 0) {
ret->top = 0;
return ret;
}
i = ((n - 1) / BN_BYTES) + 1;
m = ((n - 1) % (BN_BYTES));
if (bn_wexpand(ret, (int)i) == NULL) {
BN_free(bn);
return NULL;
}
ret->top = i;
ret->neg = 0;
l = 0;
while (n--) {
s--;
l = (l << 8L) | *s;
if (m-- == 0) {
ret->d[--i] = l;
l = 0;
m = BN_BYTES - 1;
}
}
/*
* need to call this due to clear byte at top if avoiding having the top
* bit set (-ve number)
*/
bn_correct_top(ret);
return ret;
}
int BN_bn2lebinpad(const BIGNUM *a, unsigned char *to, int tolen)
{
int i;
BN_ULONG l;
bn_check_top(a);
i = BN_num_bytes(a);
if (tolen < i)
return -1;
/* Add trailing zeroes if necessary */
if (tolen > i)
memset(to + i, 0, tolen - i);
to += i;
while (i--) {
l = a->d[i / BN_BYTES];
to--;
*to = (unsigned char)(l >> (8 * (i % BN_BYTES))) & 0xff;
}
return tolen;
}
int BN_ucmp(const BIGNUM *a, const BIGNUM *b)
{
int i;
BN_ULONG t1, t2, *ap, *bp;
bn_check_top(a);
bn_check_top(b);
i = a->top - b->top;
if (i != 0)
return i;
ap = a->d;
bp = b->d;
for (i = a->top - 1; i >= 0; i--) {
t1 = ap[i];
t2 = bp[i];
if (t1 != t2)
return ((t1 > t2) ? 1 : -1);
}
return 0;
}
int BN_cmp(const BIGNUM *a, const BIGNUM *b)
{
int i;
int gt, lt;
BN_ULONG t1, t2;
if ((a == NULL) || (b == NULL)) {
if (a != NULL)
return -1;
else if (b != NULL)
return 1;
else
return 0;
}
bn_check_top(a);
bn_check_top(b);
if (a->neg != b->neg) {
if (a->neg)
return -1;
else
return 1;
}
if (a->neg == 0) {
gt = 1;
lt = -1;
} else {
gt = -1;
lt = 1;
}
if (a->top > b->top)
return gt;
if (a->top < b->top)
return lt;
for (i = a->top - 1; i >= 0; i--) {
t1 = a->d[i];
t2 = b->d[i];
if (t1 > t2)
return gt;
if (t1 < t2)
return lt;
}
return 0;
}
int BN_set_bit(BIGNUM *a, int n)
{
int i, j, k;
if (n < 0)
return 0;
i = n / BN_BITS2;
j = n % BN_BITS2;
if (a->top <= i) {
if (bn_wexpand(a, i + 1) == NULL)
return 0;
for (k = a->top; k < i + 1; k++)
a->d[k] = 0;
a->top = i + 1;
a->flags &= ~BN_FLG_FIXED_TOP;
}
a->d[i] |= (((BN_ULONG)1) << j);
bn_check_top(a);
return 1;
}
int BN_clear_bit(BIGNUM *a, int n)
{
int i, j;
bn_check_top(a);
if (n < 0)
return 0;
i = n / BN_BITS2;
j = n % BN_BITS2;
if (a->top <= i)
return 0;
a->d[i] &= (~(((BN_ULONG)1) << j));
bn_correct_top(a);
return 1;
}
int BN_is_bit_set(const BIGNUM *a, int n)
{
int i, j;
bn_check_top(a);
if (n < 0)
return 0;
i = n / BN_BITS2;
j = n % BN_BITS2;
if (a->top <= i)
return 0;
return (int)(((a->d[i]) >> j) & ((BN_ULONG)1));
}
int BN_mask_bits(BIGNUM *a, int n)
{
int b, w;
bn_check_top(a);
if (n < 0)
return 0;
w = n / BN_BITS2;
b = n % BN_BITS2;
if (w >= a->top)
return 0;
if (b == 0)
a->top = w;
else {
a->top = w + 1;
a->d[w] &= ~(BN_MASK2 << b);
}
bn_correct_top(a);
return 1;
}
void BN_set_negative(BIGNUM *a, int b)
{
if (b && !BN_is_zero(a))
a->neg = 1;
else
a->neg = 0;
}
int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
{
int i;
BN_ULONG aa, bb;
if (n == 0)
return 0;
aa = a[n - 1];
bb = b[n - 1];
if (aa != bb)
return ((aa > bb) ? 1 : -1);
for (i = n - 2; i >= 0; i--) {
aa = a[i];
bb = b[i];
if (aa != bb)
return ((aa > bb) ? 1 : -1);
}
return 0;
}
/*
* Here follows a specialised variants of bn_cmp_words(). It has the
* capability of performing the operation on arrays of different sizes. The
* sizes of those arrays is expressed through cl, which is the common length
* ( basically, min(len(a),len(b)) ), and dl, which is the delta between the
* two lengths, calculated as len(a)-len(b). All lengths are the number of
* BN_ULONGs...
*/
int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl)
{
int n, i;
n = cl - 1;
if (dl < 0) {
for (i = dl; i < 0; i++) {
if (b[n - i] != 0)
return -1; /* a < b */
}
}
if (dl > 0) {
for (i = dl; i > 0; i--) {
if (a[n + i] != 0)
return 1; /* a > b */
}
}
return bn_cmp_words(a, b, cl);
}
/*-
* Constant-time conditional swap of a and b.
* a and b are swapped if condition is not 0.
* nwords is the number of words to swap.
* Assumes that at least nwords are allocated in both a and b.
* Assumes that no more than nwords are used by either a or b.
*/
void BN_consttime_swap(BN_ULONG condition, BIGNUM *a, BIGNUM *b, int nwords)
{
BN_ULONG t;
int i;
if (a == b)
return;
bn_wcheck_size(a, nwords);
bn_wcheck_size(b, nwords);
condition = ((~condition & ((condition - 1))) >> (BN_BITS2 - 1)) - 1;
t = (a->top ^ b->top) & condition;
a->top ^= t;
b->top ^= t;
t = (a->neg ^ b->neg) & condition;
a->neg ^= t;
b->neg ^= t;
/*-
* BN_FLG_STATIC_DATA: indicates that data may not be written to. Intention
* is actually to treat it as it's read-only data, and some (if not most)
* of it does reside in read-only segment. In other words observation of
* BN_FLG_STATIC_DATA in BN_consttime_swap should be treated as fatal
* condition. It would either cause SEGV or effectively cause data
* corruption.
*
* BN_FLG_MALLOCED: refers to BN structure itself, and hence must be
* preserved.
*
* BN_FLG_SECURE: must be preserved, because it determines how x->d was
* allocated and hence how to free it.
*
* BN_FLG_CONSTTIME: sufficient to mask and swap
*
* BN_FLG_FIXED_TOP: indicates that we haven't called bn_correct_top() on
* the data, so the d array may be padded with additional 0 values (i.e.
* top could be greater than the minimal value that it could be). We should
* be swapping it
*/
#define BN_CONSTTIME_SWAP_FLAGS (BN_FLG_CONSTTIME | BN_FLG_FIXED_TOP)
t = ((a->flags ^ b->flags) & BN_CONSTTIME_SWAP_FLAGS) & condition;
a->flags ^= t;
b->flags ^= t;
/* conditionally swap the data */
for (i = 0; i < nwords; i++) {
t = (a->d[i] ^ b->d[i]) & condition;
a->d[i] ^= t;
b->d[i] ^= t;
}
}
#undef BN_CONSTTIME_SWAP_FLAGS
/* Bits of security, see SP800-57 */
int BN_security_bits(int L, int N)
{
int secbits, bits;
if (L >= 15360)
secbits = 256;
else if (L >= 7680)
secbits = 192;
else if (L >= 3072)
secbits = 128;
else if (L >= 2048)
secbits = 112;
else if (L >= 1024)
secbits = 80;
else
return 0;
if (N == -1)
return secbits;
bits = N / 2;
if (bits < 80)
return 0;
return bits >= secbits ? secbits : bits;
}
void BN_zero_ex(BIGNUM *a)
{
a->neg = 0;
a->top = 0;
a->flags &= ~BN_FLG_FIXED_TOP;
}
int BN_abs_is_word(const BIGNUM *a, const BN_ULONG w)
{
return ((a->top == 1) && (a->d[0] == w)) || ((w == 0) && (a->top == 0));
}
int BN_is_zero(const BIGNUM *a)
{
return a->top == 0;
}
int BN_is_one(const BIGNUM *a)
{
return BN_abs_is_word(a, 1) && !a->neg;
}
int BN_is_word(const BIGNUM *a, const BN_ULONG w)
{
return BN_abs_is_word(a, w) && (!w || !a->neg);
}
int BN_is_odd(const BIGNUM *a)
{
return (a->top > 0) && (a->d[0] & 1);
}
int BN_is_negative(const BIGNUM *a)
{
return (a->neg != 0);
}
int BN_to_montgomery(BIGNUM *r, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX *ctx)
{
return BN_mod_mul_montgomery(r, a, &(mont->RR), mont, ctx);
}
void BN_with_flags(BIGNUM *dest, const BIGNUM *b, int flags)
{
dest->d = b->d;
dest->top = b->top;
dest->dmax = b->dmax;
dest->neg = b->neg;
dest->flags = ((dest->flags & BN_FLG_MALLOCED)
| (b->flags & ~BN_FLG_MALLOCED)
| BN_FLG_STATIC_DATA | flags);
}
BN_GENCB *BN_GENCB_new(void)
{
BN_GENCB *ret;
if ((ret = OPENSSL_malloc(sizeof(*ret))) == NULL) {
BNerr(BN_F_BN_GENCB_NEW, ERR_R_MALLOC_FAILURE);
return NULL;
}
return ret;
}
void BN_GENCB_free(BN_GENCB *cb)
{
if (cb == NULL)
return;
OPENSSL_free(cb);
}
void BN_set_flags(BIGNUM *b, int n)
{
b->flags |= n;
}
int BN_get_flags(const BIGNUM *b, int n)
{
return b->flags & n;
}
/* Populate a BN_GENCB structure with an "old"-style callback */
void BN_GENCB_set_old(BN_GENCB *gencb, void (*callback) (int, int, void *),
void *cb_arg)
{
BN_GENCB *tmp_gencb = gencb;
tmp_gencb->ver = 1;
tmp_gencb->arg = cb_arg;
tmp_gencb->cb.cb_1 = callback;
}
/* Populate a BN_GENCB structure with a "new"-style callback */
void BN_GENCB_set(BN_GENCB *gencb, int (*callback) (int, int, BN_GENCB *),
void *cb_arg)
{
BN_GENCB *tmp_gencb = gencb;
tmp_gencb->ver = 2;
tmp_gencb->arg = cb_arg;
tmp_gencb->cb.cb_2 = callback;
}
void *BN_GENCB_get_arg(BN_GENCB *cb)
{
return cb->arg;
}
BIGNUM *bn_wexpand(BIGNUM *a, int words)
{
return (words <= a->dmax) ? a : bn_expand2(a, words);
}
void bn_correct_top(BIGNUM *a)
{
BN_ULONG *ftl;
int tmp_top = a->top;
if (tmp_top > 0) {
for (ftl = &(a->d[tmp_top]); tmp_top > 0; tmp_top--) {
ftl--;
if (*ftl != 0)
break;
}
a->top = tmp_top;
}
if (a->top == 0)
a->neg = 0;
a->flags &= ~BN_FLG_FIXED_TOP;
bn_pollute(a);
}

View file

@ -0,0 +1,321 @@
/*
* Copyright 1998-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
{
/*
* like BN_mod, but returns non-negative remainder (i.e., 0 <= r < |d|
* always holds)
*/
if (!(BN_mod(r, m, d, ctx)))
return 0;
if (!r->neg)
return 1;
/* now -|d| < r < 0, so we have to set r := r + |d| */
return (d->neg ? BN_sub : BN_add) (r, r, d);
}
int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx)
{
if (!BN_add(r, a, b))
return 0;
return BN_nnmod(r, r, m, ctx);
}
/*
* BN_mod_add variant that may be used if both a and b are non-negative and
* less than m. The original algorithm was
*
* if (!BN_uadd(r, a, b))
* return 0;
* if (BN_ucmp(r, m) >= 0)
* return BN_usub(r, r, m);
*
* which is replaced with addition, subtracting modulus, and conditional
* move depending on whether or not subtraction borrowed.
*/
int bn_mod_add_fixed_top(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m)
{
size_t i, ai, bi, mtop = m->top;
BN_ULONG storage[1024 / BN_BITS2];
BN_ULONG carry, temp, mask, *rp, *tp = storage;
const BN_ULONG *ap, *bp;
if (bn_wexpand(r, mtop) == NULL)
return 0;
if (mtop > sizeof(storage) / sizeof(storage[0])
&& (tp = OPENSSL_malloc(mtop * sizeof(BN_ULONG))) == NULL)
return 0;
ap = a->d != NULL ? a->d : tp;
bp = b->d != NULL ? b->d : tp;
for (i = 0, ai = 0, bi = 0, carry = 0; i < mtop;) {
mask = (BN_ULONG)0 - ((i - a->top) >> (8 * sizeof(i) - 1));
temp = ((ap[ai] & mask) + carry) & BN_MASK2;
carry = (temp < carry);
mask = (BN_ULONG)0 - ((i - b->top) >> (8 * sizeof(i) - 1));
tp[i] = ((bp[bi] & mask) + temp) & BN_MASK2;
carry += (tp[i] < temp);
i++;
ai += (i - a->dmax) >> (8 * sizeof(i) - 1);
bi += (i - b->dmax) >> (8 * sizeof(i) - 1);
}
rp = r->d;
carry -= bn_sub_words(rp, tp, m->d, mtop);
for (i = 0; i < mtop; i++) {
rp[i] = (carry & tp[i]) | (~carry & rp[i]);
((volatile BN_ULONG *)tp)[i] = 0;
}
r->top = mtop;
r->flags |= BN_FLG_FIXED_TOP;
r->neg = 0;
if (tp != storage)
OPENSSL_free(tp);
return 1;
}
int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m)
{
int ret = bn_mod_add_fixed_top(r, a, b, m);
if (ret)
bn_correct_top(r);
return ret;
}
int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx)
{
if (!BN_sub(r, a, b))
return 0;
return BN_nnmod(r, r, m, ctx);
}
/*
* BN_mod_sub variant that may be used if both a and b are non-negative,
* a is less than m, while b is of same bit width as m. It's implemented
* as subtraction followed by two conditional additions.
*
* 0 <= a < m
* 0 <= b < 2^w < 2*m
*
* after subtraction
*
* -2*m < r = a - b < m
*
* Thus it takes up to two conditional additions to make |r| positive.
*/
int bn_mod_sub_fixed_top(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m)
{
size_t i, ai, bi, mtop = m->top;
BN_ULONG borrow, carry, ta, tb, mask, *rp;
const BN_ULONG *ap, *bp;
if (bn_wexpand(r, mtop) == NULL)
return 0;
rp = r->d;
ap = a->d != NULL ? a->d : rp;
bp = b->d != NULL ? b->d : rp;
for (i = 0, ai = 0, bi = 0, borrow = 0; i < mtop;) {
mask = (BN_ULONG)0 - ((i - a->top) >> (8 * sizeof(i) - 1));
ta = ap[ai] & mask;
mask = (BN_ULONG)0 - ((i - b->top) >> (8 * sizeof(i) - 1));
tb = bp[bi] & mask;
rp[i] = ta - tb - borrow;
if (ta != tb)
borrow = (ta < tb);
i++;
ai += (i - a->dmax) >> (8 * sizeof(i) - 1);
bi += (i - b->dmax) >> (8 * sizeof(i) - 1);
}
ap = m->d;
for (i = 0, mask = 0 - borrow, carry = 0; i < mtop; i++) {
ta = ((ap[i] & mask) + carry) & BN_MASK2;
carry = (ta < carry);
rp[i] = (rp[i] + ta) & BN_MASK2;
carry += (rp[i] < ta);
}
borrow -= carry;
for (i = 0, mask = 0 - borrow, carry = 0; i < mtop; i++) {
ta = ((ap[i] & mask) + carry) & BN_MASK2;
carry = (ta < carry);
rp[i] = (rp[i] + ta) & BN_MASK2;
carry += (rp[i] < ta);
}
r->top = mtop;
r->flags |= BN_FLG_FIXED_TOP;
r->neg = 0;
return 1;
}
/*
* BN_mod_sub variant that may be used if both a and b are non-negative and
* less than m
*/
int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
const BIGNUM *m)
{
if (!BN_sub(r, a, b))
return 0;
if (r->neg)
return BN_add(r, r, m);
return 1;
}
/* slow but works */
int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
BN_CTX *ctx)
{
BIGNUM *t;
int ret = 0;
bn_check_top(a);
bn_check_top(b);
bn_check_top(m);
BN_CTX_start(ctx);
if ((t = BN_CTX_get(ctx)) == NULL)
goto err;
if (a == b) {
if (!BN_sqr(t, a, ctx))
goto err;
} else {
if (!BN_mul(t, a, b, ctx))
goto err;
}
if (!BN_nnmod(r, t, m, ctx))
goto err;
bn_check_top(r);
ret = 1;
err:
BN_CTX_end(ctx);
return ret;
}
int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
{
if (!BN_sqr(r, a, ctx))
return 0;
/* r->neg == 0, thus we don't need BN_nnmod */
return BN_mod(r, r, m, ctx);
}
int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
{
if (!BN_lshift1(r, a))
return 0;
bn_check_top(r);
return BN_nnmod(r, r, m, ctx);
}
/*
* BN_mod_lshift1 variant that may be used if a is non-negative and less than
* m
*/
int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m)
{
if (!BN_lshift1(r, a))
return 0;
bn_check_top(r);
if (BN_cmp(r, m) >= 0)
return BN_sub(r, r, m);
return 1;
}
int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
BN_CTX *ctx)
{
BIGNUM *abs_m = NULL;
int ret;
if (!BN_nnmod(r, a, m, ctx))
return 0;
if (m->neg) {
abs_m = BN_dup(m);
if (abs_m == NULL)
return 0;
abs_m->neg = 0;
}
ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
bn_check_top(r);
BN_free(abs_m);
return ret;
}
/*
* BN_mod_lshift variant that may be used if a is non-negative and less than
* m
*/
int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m)
{
if (r != a) {
if (BN_copy(r, a) == NULL)
return 0;
}
while (n > 0) {
int max_shift;
/* 0 < r < m */
max_shift = BN_num_bits(m) - BN_num_bits(r);
/* max_shift >= 0 */
if (max_shift < 0) {
BNerr(BN_F_BN_MOD_LSHIFT_QUICK, BN_R_INPUT_NOT_REDUCED);
return 0;
}
if (max_shift > n)
max_shift = n;
if (max_shift) {
if (!BN_lshift(r, r, max_shift))
return 0;
n -= max_shift;
} else {
if (!BN_lshift1(r, r))
return 0;
--n;
}
/* BN_num_bits(r) <= BN_num_bits(m) */
if (BN_cmp(r, m) >= 0) {
if (!BN_sub(r, r, m))
return 0;
}
}
bn_check_top(r);
return 1;
}

View file

@ -0,0 +1,464 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
/*
* Details about Montgomery multiplication algorithms can be found at
* http://security.ece.orst.edu/publications.html, e.g.
* http://security.ece.orst.edu/koc/papers/j37acmon.pdf and
* sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
#define MONT_WORD /* use the faster word-based algorithm */
#ifdef MONT_WORD
static int bn_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
#endif
int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_MONT_CTX *mont, BN_CTX *ctx)
{
int ret = bn_mul_mont_fixed_top(r, a, b, mont, ctx);
bn_correct_top(r);
bn_check_top(r);
return ret;
}
int bn_mul_mont_fixed_top(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
BN_MONT_CTX *mont, BN_CTX *ctx)
{
BIGNUM *tmp;
int ret = 0;
int num = mont->N.top;
#if defined(OPENSSL_BN_ASM_MONT) && defined(MONT_WORD)
if (num > 1 && a->top == num && b->top == num) {
if (bn_wexpand(r, num) == NULL)
return 0;
if (bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
r->neg = a->neg ^ b->neg;
r->top = num;
r->flags |= BN_FLG_FIXED_TOP;
return 1;
}
}
#endif
if ((a->top + b->top) > 2 * num)
return 0;
BN_CTX_start(ctx);
tmp = BN_CTX_get(ctx);
if (tmp == NULL)
goto err;
bn_check_top(tmp);
if (a == b) {
if (!bn_sqr_fixed_top(tmp, a, ctx))
goto err;
} else {
if (!bn_mul_fixed_top(tmp, a, b, ctx))
goto err;
}
/* reduce from aRR to aR */
#ifdef MONT_WORD
if (!bn_from_montgomery_word(r, tmp, mont))
goto err;
#else
if (!BN_from_montgomery(r, tmp, mont, ctx))
goto err;
#endif
ret = 1;
err:
BN_CTX_end(ctx);
return ret;
}
#ifdef MONT_WORD
static int bn_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
{
BIGNUM *n;
BN_ULONG *ap, *np, *rp, n0, v, carry;
int nl, max, i;
unsigned int rtop;
n = &(mont->N);
nl = n->top;
if (nl == 0) {
ret->top = 0;
return 1;
}
max = (2 * nl); /* carry is stored separately */
if (bn_wexpand(r, max) == NULL)
return 0;
r->neg ^= n->neg;
np = n->d;
rp = r->d;
/* clear the top words of T */
for (rtop = r->top, i = 0; i < max; i++) {
v = (BN_ULONG)0 - ((i - rtop) >> (8 * sizeof(rtop) - 1));
rp[i] &= v;
}
r->top = max;
r->flags |= BN_FLG_FIXED_TOP;
n0 = mont->n0[0];
/*
* Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On
* input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r|
* includes |carry| which is stored separately.
*/
for (carry = 0, i = 0; i < nl; i++, rp++) {
v = bn_mul_add_words(rp, np, nl, (rp[0] * n0) & BN_MASK2);
v = (v + carry + rp[nl]) & BN_MASK2;
carry |= (v != rp[nl]);
carry &= (v <= rp[nl]);
rp[nl] = v;
}
if (bn_wexpand(ret, nl) == NULL)
return 0;
ret->top = nl;
ret->flags |= BN_FLG_FIXED_TOP;
ret->neg = r->neg;
rp = ret->d;
/*
* Shift |nl| words to divide by R. We have |ap| < 2 * |n|. Note that |ap|
* includes |carry| which is stored separately.
*/
ap = &(r->d[nl]);
carry -= bn_sub_words(rp, ap, np, nl);
/*
* |carry| is -1 if |ap| - |np| underflowed or zero if it did not. Note
* |carry| cannot be 1. That would imply the subtraction did not fit in
* |nl| words, and we know at most one subtraction is needed.
*/
for (i = 0; i < nl; i++) {
rp[i] = (carry & ap[i]) | (~carry & rp[i]);
ap[i] = 0;
}
return 1;
}
#endif /* MONT_WORD */
int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX *ctx)
{
int retn;
retn = bn_from_mont_fixed_top(ret, a, mont, ctx);
bn_correct_top(ret);
bn_check_top(ret);
return retn;
}
int bn_from_mont_fixed_top(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX *ctx)
{
int retn = 0;
#ifdef MONT_WORD
BIGNUM *t;
BN_CTX_start(ctx);
if ((t = BN_CTX_get(ctx)) && BN_copy(t, a)) {
retn = bn_from_montgomery_word(ret, t, mont);
}
BN_CTX_end(ctx);
#else /* !MONT_WORD */
BIGNUM *t1, *t2;
BN_CTX_start(ctx);
t1 = BN_CTX_get(ctx);
t2 = BN_CTX_get(ctx);
if (t2 == NULL)
goto err;
if (!BN_copy(t1, a))
goto err;
BN_mask_bits(t1, mont->ri);
if (!BN_mul(t2, t1, &mont->Ni, ctx))
goto err;
BN_mask_bits(t2, mont->ri);
if (!BN_mul(t1, t2, &mont->N, ctx))
goto err;
if (!BN_add(t2, a, t1))
goto err;
if (!BN_rshift(ret, t2, mont->ri))
goto err;
if (BN_ucmp(ret, &(mont->N)) >= 0) {
if (!BN_usub(ret, ret, &(mont->N)))
goto err;
}
retn = 1;
bn_check_top(ret);
err:
BN_CTX_end(ctx);
#endif /* MONT_WORD */
return retn;
}
int bn_to_mont_fixed_top(BIGNUM *r, const BIGNUM *a, BN_MONT_CTX *mont,
BN_CTX *ctx)
{
return bn_mul_mont_fixed_top(r, a, &(mont->RR), mont, ctx);
}
BN_MONT_CTX *BN_MONT_CTX_new(void)
{
BN_MONT_CTX *ret;
if ((ret = OPENSSL_malloc(sizeof(*ret))) == NULL) {
BNerr(BN_F_BN_MONT_CTX_NEW, ERR_R_MALLOC_FAILURE);
return NULL;
}
BN_MONT_CTX_init(ret);
ret->flags = BN_FLG_MALLOCED;
return ret;
}
void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
{
ctx->ri = 0;
bn_init(&ctx->RR);
bn_init(&ctx->N);
bn_init(&ctx->Ni);
ctx->n0[0] = ctx->n0[1] = 0;
ctx->flags = 0;
}
void BN_MONT_CTX_free(BN_MONT_CTX *mont)
{
if (mont == NULL)
return;
BN_clear_free(&mont->RR);
BN_clear_free(&mont->N);
BN_clear_free(&mont->Ni);
if (mont->flags & BN_FLG_MALLOCED)
OPENSSL_free(mont);
}
int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
{
int i, ret = 0;
BIGNUM *Ri, *R;
if (BN_is_zero(mod))
return 0;
BN_CTX_start(ctx);
if ((Ri = BN_CTX_get(ctx)) == NULL)
goto err;
R = &(mont->RR); /* grab RR as a temp */
if (!BN_copy(&(mont->N), mod))
goto err; /* Set N */
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(&(mont->N), BN_FLG_CONSTTIME);
mont->N.neg = 0;
#ifdef MONT_WORD
{
BIGNUM tmod;
BN_ULONG buf[2];
bn_init(&tmod);
tmod.d = buf;
tmod.dmax = 2;
tmod.neg = 0;
if (BN_get_flags(mod, BN_FLG_CONSTTIME) != 0)
BN_set_flags(&tmod, BN_FLG_CONSTTIME);
mont->ri = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
# if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
/*
* Only certain BN_BITS2<=32 platforms actually make use of n0[1],
* and we could use the #else case (with a shorter R value) for the
* others. However, currently only the assembler files do know which
* is which.
*/
BN_zero(R);
if (!(BN_set_bit(R, 2 * BN_BITS2)))
goto err;
tmod.top = 0;
if ((buf[0] = mod->d[0]))
tmod.top = 1;
if ((buf[1] = mod->top > 1 ? mod->d[1] : 0))
tmod.top = 2;
if (BN_is_one(&tmod))
BN_zero(Ri);
else if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
goto err;
if (!BN_lshift(Ri, Ri, 2 * BN_BITS2))
goto err; /* R*Ri */
if (!BN_is_zero(Ri)) {
if (!BN_sub_word(Ri, 1))
goto err;
} else { /* if N mod word size == 1 */
if (bn_expand(Ri, (int)sizeof(BN_ULONG) * 2) == NULL)
goto err;
/* Ri-- (mod double word size) */
Ri->neg = 0;
Ri->d[0] = BN_MASK2;
Ri->d[1] = BN_MASK2;
Ri->top = 2;
}
if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
goto err;
/*
* Ni = (R*Ri-1)/N, keep only couple of least significant words:
*/
mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
# else
BN_zero(R);
if (!(BN_set_bit(R, BN_BITS2)))
goto err; /* R */
buf[0] = mod->d[0]; /* tmod = N mod word size */
buf[1] = 0;
tmod.top = buf[0] != 0 ? 1 : 0;
/* Ri = R^-1 mod N */
if (BN_is_one(&tmod))
BN_zero(Ri);
else if ((BN_mod_inverse(Ri, R, &tmod, ctx)) == NULL)
goto err;
if (!BN_lshift(Ri, Ri, BN_BITS2))
goto err; /* R*Ri */
if (!BN_is_zero(Ri)) {
if (!BN_sub_word(Ri, 1))
goto err;
} else { /* if N mod word size == 1 */
if (!BN_set_word(Ri, BN_MASK2))
goto err; /* Ri-- (mod word size) */
}
if (!BN_div(Ri, NULL, Ri, &tmod, ctx))
goto err;
/*
* Ni = (R*Ri-1)/N, keep only least significant word:
*/
mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
mont->n0[1] = 0;
# endif
}
#else /* !MONT_WORD */
{ /* bignum version */
mont->ri = BN_num_bits(&mont->N);
BN_zero(R);
if (!BN_set_bit(R, mont->ri))
goto err; /* R = 2^ri */
/* Ri = R^-1 mod N */
if ((BN_mod_inverse(Ri, R, &mont->N, ctx)) == NULL)
goto err;
if (!BN_lshift(Ri, Ri, mont->ri))
goto err; /* R*Ri */
if (!BN_sub_word(Ri, 1))
goto err;
/*
* Ni = (R*Ri-1) / N
*/
if (!BN_div(&(mont->Ni), NULL, Ri, &mont->N, ctx))
goto err;
}
#endif
/* setup RR for conversions */
BN_zero(&(mont->RR));
if (!BN_set_bit(&(mont->RR), mont->ri * 2))
goto err;
if (!BN_mod(&(mont->RR), &(mont->RR), &(mont->N), ctx))
goto err;
for (i = mont->RR.top, ret = mont->N.top; i < ret; i++)
mont->RR.d[i] = 0;
mont->RR.top = ret;
mont->RR.flags |= BN_FLG_FIXED_TOP;
ret = 1;
err:
BN_CTX_end(ctx);
return ret;
}
BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
{
if (to == from)
return to;
if (!BN_copy(&(to->RR), &(from->RR)))
return NULL;
if (!BN_copy(&(to->N), &(from->N)))
return NULL;
if (!BN_copy(&(to->Ni), &(from->Ni)))
return NULL;
to->ri = from->ri;
to->n0[0] = from->n0[0];
to->n0[1] = from->n0[1];
return to;
}
BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_RWLOCK *lock,
const BIGNUM *mod, BN_CTX *ctx)
{
BN_MONT_CTX *ret;
CRYPTO_THREAD_read_lock(lock);
ret = *pmont;
CRYPTO_THREAD_unlock(lock);
if (ret)
return ret;
/*
* We don't want to serialise globally while doing our lazy-init math in
* BN_MONT_CTX_set. That punishes threads that are doing independent
* things. Instead, punish the case where more than one thread tries to
* lazy-init the same 'pmont', by having each do the lazy-init math work
* independently and only use the one from the thread that wins the race
* (the losers throw away the work they've done).
*/
ret = BN_MONT_CTX_new();
if (ret == NULL)
return NULL;
if (!BN_MONT_CTX_set(ret, mod, ctx)) {
BN_MONT_CTX_free(ret);
return NULL;
}
/* The locked compare-and-set, after the local work is done. */
CRYPTO_THREAD_write_lock(lock);
if (*pmont) {
BN_MONT_CTX_free(ret);
ret = *pmont;
} else
*pmont = ret;
CRYPTO_THREAD_unlock(lock);
return ret;
}

View file

@ -0,0 +1,86 @@
/*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
int BN_bn2mpi(const BIGNUM *a, unsigned char *d)
{
int bits;
int num = 0;
int ext = 0;
long l;
bits = BN_num_bits(a);
num = (bits + 7) / 8;
if (bits > 0) {
ext = ((bits & 0x07) == 0);
}
if (d == NULL)
return (num + 4 + ext);
l = num + ext;
d[0] = (unsigned char)(l >> 24) & 0xff;
d[1] = (unsigned char)(l >> 16) & 0xff;
d[2] = (unsigned char)(l >> 8) & 0xff;
d[3] = (unsigned char)(l) & 0xff;
if (ext)
d[4] = 0;
num = BN_bn2bin(a, &(d[4 + ext]));
if (a->neg)
d[4] |= 0x80;
return (num + 4 + ext);
}
BIGNUM *BN_mpi2bn(const unsigned char *d, int n, BIGNUM *ain)
{
long len;
int neg = 0;
BIGNUM *a = NULL;
if (n < 4) {
BNerr(BN_F_BN_MPI2BN, BN_R_INVALID_LENGTH);
return NULL;
}
len = ((long)d[0] << 24) | ((long)d[1] << 16) | ((int)d[2] << 8) | (int)
d[3];
if ((len + 4) != n) {
BNerr(BN_F_BN_MPI2BN, BN_R_ENCODING_ERROR);
return NULL;
}
if (ain == NULL)
a = BN_new();
else
a = ain;
if (a == NULL)
return NULL;
if (len == 0) {
a->neg = 0;
a->top = 0;
return a;
}
d += 4;
if ((*d) & 0x80)
neg = 1;
if (BN_bin2bn(d, (int)len, a) == NULL) {
if (ain == NULL)
BN_free(a);
return NULL;
}
a->neg = neg;
if (neg) {
BN_clear_bit(a, BN_num_bits(a) - 1);
}
bn_check_top(a);
return a;
}

View file

@ -0,0 +1,684 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <assert.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
#if defined(OPENSSL_NO_ASM) || !defined(OPENSSL_BN_ASM_PART_WORDS)
/*
* Here follows specialised variants of bn_add_words() and bn_sub_words().
* They have the property performing operations on arrays of different sizes.
* The sizes of those arrays is expressed through cl, which is the common
* length ( basically, min(len(a),len(b)) ), and dl, which is the delta
* between the two lengths, calculated as len(a)-len(b). All lengths are the
* number of BN_ULONGs... For the operations that require a result array as
* parameter, it must have the length cl+abs(dl). These functions should
* probably end up in bn_asm.c as soon as there are assembler counterparts
* for the systems that use assembler files.
*/
BN_ULONG bn_sub_part_words(BN_ULONG *r,
const BN_ULONG *a, const BN_ULONG *b,
int cl, int dl)
{
BN_ULONG c, t;
assert(cl >= 0);
c = bn_sub_words(r, a, b, cl);
if (dl == 0)
return c;
r += cl;
a += cl;
b += cl;
if (dl < 0) {
for (;;) {
t = b[0];
r[0] = (0 - t - c) & BN_MASK2;
if (t != 0)
c = 1;
if (++dl >= 0)
break;
t = b[1];
r[1] = (0 - t - c) & BN_MASK2;
if (t != 0)
c = 1;
if (++dl >= 0)
break;
t = b[2];
r[2] = (0 - t - c) & BN_MASK2;
if (t != 0)
c = 1;
if (++dl >= 0)
break;
t = b[3];
r[3] = (0 - t - c) & BN_MASK2;
if (t != 0)
c = 1;
if (++dl >= 0)
break;
b += 4;
r += 4;
}
} else {
int save_dl = dl;
while (c) {
t = a[0];
r[0] = (t - c) & BN_MASK2;
if (t != 0)
c = 0;
if (--dl <= 0)
break;
t = a[1];
r[1] = (t - c) & BN_MASK2;
if (t != 0)
c = 0;
if (--dl <= 0)
break;
t = a[2];
r[2] = (t - c) & BN_MASK2;
if (t != 0)
c = 0;
if (--dl <= 0)
break;
t = a[3];
r[3] = (t - c) & BN_MASK2;
if (t != 0)
c = 0;
if (--dl <= 0)
break;
save_dl = dl;
a += 4;
r += 4;
}
if (dl > 0) {
if (save_dl > dl) {
switch (save_dl - dl) {
case 1:
r[1] = a[1];
if (--dl <= 0)
break;
/* fall thru */
case 2:
r[2] = a[2];
if (--dl <= 0)
break;
/* fall thru */
case 3:
r[3] = a[3];
if (--dl <= 0)
break;
}
a += 4;
r += 4;
}
}
if (dl > 0) {
for (;;) {
r[0] = a[0];
if (--dl <= 0)
break;
r[1] = a[1];
if (--dl <= 0)
break;
r[2] = a[2];
if (--dl <= 0)
break;
r[3] = a[3];
if (--dl <= 0)
break;
a += 4;
r += 4;
}
}
}
return c;
}
#endif
#ifdef BN_RECURSION
/*
* Karatsuba recursive multiplication algorithm (cf. Knuth, The Art of
* Computer Programming, Vol. 2)
*/
/*-
* r is 2*n2 words in size,
* a and b are both n2 words in size.
* n2 must be a power of 2.
* We multiply and return the result.
* t must be 2*n2 words in size
* We calculate
* a[0]*b[0]
* a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
* a[1]*b[1]
*/
/* dnX may not be positive, but n2/2+dnX has to be */
void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
int dna, int dnb, BN_ULONG *t)
{
int n = n2 / 2, c1, c2;
int tna = n + dna, tnb = n + dnb;
unsigned int neg, zero;
BN_ULONG ln, lo, *p;
# ifdef BN_MUL_COMBA
# if 0
if (n2 == 4) {
bn_mul_comba4(r, a, b);
return;
}
# endif
/*
* Only call bn_mul_comba 8 if n2 == 8 and the two arrays are complete
* [steve]
*/
if (n2 == 8 && dna == 0 && dnb == 0) {
bn_mul_comba8(r, a, b);
return;
}
# endif /* BN_MUL_COMBA */
/* Else do normal multiply */
if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) {
bn_mul_normal(r, a, n2 + dna, b, n2 + dnb);
if ((dna + dnb) < 0)
memset(&r[2 * n2 + dna + dnb], 0,
sizeof(BN_ULONG) * -(dna + dnb));
return;
}
/* r=(a[0]-a[1])*(b[1]-b[0]) */
c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
zero = neg = 0;
switch (c1 * 3 + c2) {
case -4:
bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
break;
case -3:
zero = 1;
break;
case -2:
bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
neg = 1;
break;
case -1:
case 0:
case 1:
zero = 1;
break;
case 2:
bn_sub_part_words(t, a, &(a[n]), tna, n - tna); /* + */
bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
neg = 1;
break;
case 3:
zero = 1;
break;
case 4:
bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
break;
}
# ifdef BN_MUL_COMBA
if (n == 4 && dna == 0 && dnb == 0) { /* XXX: bn_mul_comba4 could take
* extra args to do this well */
if (!zero)
bn_mul_comba4(&(t[n2]), t, &(t[n]));
else
memset(&t[n2], 0, sizeof(*t) * 8);
bn_mul_comba4(r, a, b);
bn_mul_comba4(&(r[n2]), &(a[n]), &(b[n]));
} else if (n == 8 && dna == 0 && dnb == 0) { /* XXX: bn_mul_comba8 could
* take extra args to do
* this well */
if (!zero)
bn_mul_comba8(&(t[n2]), t, &(t[n]));
else
memset(&t[n2], 0, sizeof(*t) * 16);
bn_mul_comba8(r, a, b);
bn_mul_comba8(&(r[n2]), &(a[n]), &(b[n]));
} else
# endif /* BN_MUL_COMBA */
{
p = &(t[n2 * 2]);
if (!zero)
bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
else
memset(&t[n2], 0, sizeof(*t) * n2);
bn_mul_recursive(r, a, b, n, 0, 0, p);
bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), n, dna, dnb, p);
}
/*-
* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
* r[10] holds (a[0]*b[0])
* r[32] holds (b[1]*b[1])
*/
c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
if (neg) { /* if t[32] is negative */
c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
} else {
/* Might have a carry */
c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
}
/*-
* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
* r[10] holds (a[0]*b[0])
* r[32] holds (b[1]*b[1])
* c1 holds the carry bits
*/
c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
if (c1) {
p = &(r[n + n2]);
lo = *p;
ln = (lo + c1) & BN_MASK2;
*p = ln;
/*
* The overflow will stop before we over write words we should not
* overwrite
*/
if (ln < (BN_ULONG)c1) {
do {
p++;
lo = *p;
ln = (lo + 1) & BN_MASK2;
*p = ln;
} while (ln == 0);
}
}
}
/*
* n+tn is the word length t needs to be n*4 is size, as does r
*/
/* tnX may not be negative but less than n */
void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
int tna, int tnb, BN_ULONG *t)
{
int i, j, n2 = n * 2;
int c1, c2, neg;
BN_ULONG ln, lo, *p;
if (n < 8) {
bn_mul_normal(r, a, n + tna, b, n + tnb);
return;
}
/* r=(a[0]-a[1])*(b[1]-b[0]) */
c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
neg = 0;
switch (c1 * 3 + c2) {
case -4:
bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
break;
case -3:
case -2:
bn_sub_part_words(t, &(a[n]), a, tna, tna - n); /* - */
bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
neg = 1;
break;
case -1:
case 0:
case 1:
case 2:
bn_sub_part_words(t, a, &(a[n]), tna, n - tna); /* + */
bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
neg = 1;
break;
case 3:
case 4:
bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
break;
}
/*
* The zero case isn't yet implemented here. The speedup would probably
* be negligible.
*/
# if 0
if (n == 4) {
bn_mul_comba4(&(t[n2]), t, &(t[n]));
bn_mul_comba4(r, a, b);
bn_mul_normal(&(r[n2]), &(a[n]), tn, &(b[n]), tn);
memset(&r[n2 + tn * 2], 0, sizeof(*r) * (n2 - tn * 2));
} else
# endif
if (n == 8) {
bn_mul_comba8(&(t[n2]), t, &(t[n]));
bn_mul_comba8(r, a, b);
bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
memset(&r[n2 + tna + tnb], 0, sizeof(*r) * (n2 - tna - tnb));
} else {
p = &(t[n2 * 2]);
bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
bn_mul_recursive(r, a, b, n, 0, 0, p);
i = n / 2;
/*
* If there is only a bottom half to the number, just do it
*/
if (tna > tnb)
j = tna - i;
else
j = tnb - i;
if (j == 0) {
bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]),
i, tna - i, tnb - i, p);
memset(&r[n2 + i * 2], 0, sizeof(*r) * (n2 - i * 2));
} else if (j > 0) { /* eg, n == 16, i == 8 and tn == 11 */
bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]),
i, tna - i, tnb - i, p);
memset(&(r[n2 + tna + tnb]), 0,
sizeof(BN_ULONG) * (n2 - tna - tnb));
} else { /* (j < 0) eg, n == 16, i == 8 and tn == 5 */
memset(&r[n2], 0, sizeof(*r) * n2);
if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL
&& tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
} else {
for (;;) {
i /= 2;
/*
* these simplified conditions work exclusively because
* difference between tna and tnb is 1 or 0
*/
if (i < tna || i < tnb) {
bn_mul_part_recursive(&(r[n2]),
&(a[n]), &(b[n]),
i, tna - i, tnb - i, p);
break;
} else if (i == tna || i == tnb) {
bn_mul_recursive(&(r[n2]),
&(a[n]), &(b[n]),
i, tna - i, tnb - i, p);
break;
}
}
}
}
}
/*-
* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
* r[10] holds (a[0]*b[0])
* r[32] holds (b[1]*b[1])
*/
c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
if (neg) { /* if t[32] is negative */
c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
} else {
/* Might have a carry */
c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
}
/*-
* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
* r[10] holds (a[0]*b[0])
* r[32] holds (b[1]*b[1])
* c1 holds the carry bits
*/
c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
if (c1) {
p = &(r[n + n2]);
lo = *p;
ln = (lo + c1) & BN_MASK2;
*p = ln;
/*
* The overflow will stop before we over write words we should not
* overwrite
*/
if (ln < (BN_ULONG)c1) {
do {
p++;
lo = *p;
ln = (lo + 1) & BN_MASK2;
*p = ln;
} while (ln == 0);
}
}
}
/*-
* a and b must be the same size, which is n2.
* r needs to be n2 words and t needs to be n2*2
*/
void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
BN_ULONG *t)
{
int n = n2 / 2;
bn_mul_recursive(r, a, b, n, 0, 0, &(t[0]));
if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL) {
bn_mul_low_recursive(&(t[0]), &(a[0]), &(b[n]), n, &(t[n2]));
bn_add_words(&(r[n]), &(r[n]), &(t[0]), n);
bn_mul_low_recursive(&(t[0]), &(a[n]), &(b[0]), n, &(t[n2]));
bn_add_words(&(r[n]), &(r[n]), &(t[0]), n);
} else {
bn_mul_low_normal(&(t[0]), &(a[0]), &(b[n]), n);
bn_mul_low_normal(&(t[n]), &(a[n]), &(b[0]), n);
bn_add_words(&(r[n]), &(r[n]), &(t[0]), n);
bn_add_words(&(r[n]), &(r[n]), &(t[n]), n);
}
}
#endif /* BN_RECURSION */
int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
int ret = bn_mul_fixed_top(r, a, b, ctx);
bn_correct_top(r);
bn_check_top(r);
return ret;
}
int bn_mul_fixed_top(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
{
int ret = 0;
int top, al, bl;
BIGNUM *rr;
#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
int i;
#endif
#ifdef BN_RECURSION
BIGNUM *t = NULL;
int j = 0, k;
#endif
bn_check_top(a);
bn_check_top(b);
bn_check_top(r);
al = a->top;
bl = b->top;
if ((al == 0) || (bl == 0)) {
BN_zero(r);
return 1;
}
top = al + bl;
BN_CTX_start(ctx);
if ((r == a) || (r == b)) {
if ((rr = BN_CTX_get(ctx)) == NULL)
goto err;
} else
rr = r;
#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
i = al - bl;
#endif
#ifdef BN_MUL_COMBA
if (i == 0) {
# if 0
if (al == 4) {
if (bn_wexpand(rr, 8) == NULL)
goto err;
rr->top = 8;
bn_mul_comba4(rr->d, a->d, b->d);
goto end;
}
# endif
if (al == 8) {
if (bn_wexpand(rr, 16) == NULL)
goto err;
rr->top = 16;
bn_mul_comba8(rr->d, a->d, b->d);
goto end;
}
}
#endif /* BN_MUL_COMBA */
#ifdef BN_RECURSION
if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL)) {
if (i >= -1 && i <= 1) {
/*
* Find out the power of two lower or equal to the longest of the
* two numbers
*/
if (i >= 0) {
j = BN_num_bits_word((BN_ULONG)al);
}
if (i == -1) {
j = BN_num_bits_word((BN_ULONG)bl);
}
j = 1 << (j - 1);
assert(j <= al || j <= bl);
k = j + j;
t = BN_CTX_get(ctx);
if (t == NULL)
goto err;
if (al > j || bl > j) {
if (bn_wexpand(t, k * 4) == NULL)
goto err;
if (bn_wexpand(rr, k * 4) == NULL)
goto err;
bn_mul_part_recursive(rr->d, a->d, b->d,
j, al - j, bl - j, t->d);
} else { /* al <= j || bl <= j */
if (bn_wexpand(t, k * 2) == NULL)
goto err;
if (bn_wexpand(rr, k * 2) == NULL)
goto err;
bn_mul_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
}
rr->top = top;
goto end;
}
}
#endif /* BN_RECURSION */
if (bn_wexpand(rr, top) == NULL)
goto err;
rr->top = top;
bn_mul_normal(rr->d, a->d, al, b->d, bl);
#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
end:
#endif
rr->neg = a->neg ^ b->neg;
rr->flags |= BN_FLG_FIXED_TOP;
if (r != rr && BN_copy(r, rr) == NULL)
goto err;
ret = 1;
err:
bn_check_top(r);
BN_CTX_end(ctx);
return ret;
}
void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
{
BN_ULONG *rr;
if (na < nb) {
int itmp;
BN_ULONG *ltmp;
itmp = na;
na = nb;
nb = itmp;
ltmp = a;
a = b;
b = ltmp;
}
rr = &(r[na]);
if (nb <= 0) {
(void)bn_mul_words(r, a, na, 0);
return;
} else
rr[0] = bn_mul_words(r, a, na, b[0]);
for (;;) {
if (--nb <= 0)
return;
rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]);
if (--nb <= 0)
return;
rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]);
if (--nb <= 0)
return;
rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]);
if (--nb <= 0)
return;
rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]);
rr += 4;
r += 4;
b += 4;
}
}
void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
bn_mul_words(r, a, n, b[0]);
for (;;) {
if (--n <= 0)
return;
bn_mul_add_words(&(r[1]), a, n, b[1]);
if (--n <= 0)
return;
bn_mul_add_words(&(r[2]), a, n, b[2]);
if (--n <= 0)
return;
bn_mul_add_words(&(r[3]), a, n, b[3]);
if (--n <= 0)
return;
bn_mul_add_words(&(r[4]), a, n, b[4]);
r += 4;
b += 4;
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,469 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include <time.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
/*
* The quick sieve algorithm approach to weeding out primes is Philip
* Zimmermann's, as implemented in PGP. I have had a read of his comments
* and implemented my own version.
*/
#include "bn_prime.h"
static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
const BIGNUM *a1_odd, int k, BN_CTX *ctx,
BN_MONT_CTX *mont);
static int probable_prime(BIGNUM *rnd, int bits, prime_t *mods);
static int probable_prime_dh_safe(BIGNUM *rnd, int bits,
const BIGNUM *add, const BIGNUM *rem,
BN_CTX *ctx);
int BN_GENCB_call(BN_GENCB *cb, int a, int b)
{
/* No callback means continue */
if (!cb)
return 1;
switch (cb->ver) {
case 1:
/* Deprecated-style callbacks */
if (!cb->cb.cb_1)
return 1;
cb->cb.cb_1(a, b, cb->arg);
return 1;
case 2:
/* New-style callbacks */
return cb->cb.cb_2(a, b, cb);
default:
break;
}
/* Unrecognised callback type */
return 0;
}
int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe,
const BIGNUM *add, const BIGNUM *rem, BN_GENCB *cb)
{
BIGNUM *t;
int found = 0;
int i, j, c1 = 0;
BN_CTX *ctx = NULL;
prime_t *mods = NULL;
int checks = BN_prime_checks_for_size(bits);
if (bits < 2) {
/* There are no prime numbers this small. */
BNerr(BN_F_BN_GENERATE_PRIME_EX, BN_R_BITS_TOO_SMALL);
return 0;
} else if (bits == 2 && safe) {
/* The smallest safe prime (7) is three bits. */
BNerr(BN_F_BN_GENERATE_PRIME_EX, BN_R_BITS_TOO_SMALL);
return 0;
}
mods = OPENSSL_zalloc(sizeof(*mods) * NUMPRIMES);
if (mods == NULL)
goto err;
ctx = BN_CTX_new();
if (ctx == NULL)
goto err;
BN_CTX_start(ctx);
t = BN_CTX_get(ctx);
if (t == NULL)
goto err;
loop:
/* make a random number and set the top and bottom bits */
if (add == NULL) {
if (!probable_prime(ret, bits, mods))
goto err;
} else {
if (safe) {
if (!probable_prime_dh_safe(ret, bits, add, rem, ctx))
goto err;
} else {
if (!bn_probable_prime_dh(ret, bits, add, rem, ctx))
goto err;
}
}
if (!BN_GENCB_call(cb, 0, c1++))
/* aborted */
goto err;
if (!safe) {
i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
if (i == -1)
goto err;
if (i == 0)
goto loop;
} else {
/*
* for "safe prime" generation, check that (p-1)/2 is prime. Since a
* prime is odd, We just need to divide by 2
*/
if (!BN_rshift1(t, ret))
goto err;
for (i = 0; i < checks; i++) {
j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, cb);
if (j == -1)
goto err;
if (j == 0)
goto loop;
j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, cb);
if (j == -1)
goto err;
if (j == 0)
goto loop;
if (!BN_GENCB_call(cb, 2, c1 - 1))
goto err;
/* We have a safe prime test pass */
}
}
/* we have a prime :-) */
found = 1;
err:
OPENSSL_free(mods);
if (ctx != NULL)
BN_CTX_end(ctx);
BN_CTX_free(ctx);
bn_check_top(ret);
return found;
}
int BN_is_prime_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
BN_GENCB *cb)
{
return BN_is_prime_fasttest_ex(a, checks, ctx_passed, 0, cb);
}
int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
int do_trial_division, BN_GENCB *cb)
{
int i, j, ret = -1;
int k;
BN_CTX *ctx = NULL;
BIGNUM *A1, *A1_odd, *A3, *check; /* taken from ctx */
BN_MONT_CTX *mont = NULL;
/* Take care of the really small primes 2 & 3 */
if (BN_is_word(a, 2) || BN_is_word(a, 3))
return 1;
/* Check odd and bigger than 1 */
if (!BN_is_odd(a) || BN_cmp(a, BN_value_one()) <= 0)
return 0;
if (checks == BN_prime_checks)
checks = BN_prime_checks_for_size(BN_num_bits(a));
/* first look for small factors */
if (do_trial_division) {
for (i = 1; i < NUMPRIMES; i++) {
BN_ULONG mod = BN_mod_word(a, primes[i]);
if (mod == (BN_ULONG)-1)
goto err;
if (mod == 0)
return BN_is_word(a, primes[i]);
}
if (!BN_GENCB_call(cb, 1, -1))
goto err;
}
if (ctx_passed != NULL)
ctx = ctx_passed;
else if ((ctx = BN_CTX_new()) == NULL)
goto err;
BN_CTX_start(ctx);
A1 = BN_CTX_get(ctx);
A3 = BN_CTX_get(ctx);
A1_odd = BN_CTX_get(ctx);
check = BN_CTX_get(ctx);
if (check == NULL)
goto err;
/* compute A1 := a - 1 */
if (!BN_copy(A1, a) || !BN_sub_word(A1, 1))
goto err;
/* compute A3 := a - 3 */
if (!BN_copy(A3, a) || !BN_sub_word(A3, 3))
goto err;
/* write A1 as A1_odd * 2^k */
k = 1;
while (!BN_is_bit_set(A1, k))
k++;
if (!BN_rshift(A1_odd, A1, k))
goto err;
/* Montgomery setup for computations mod a */
mont = BN_MONT_CTX_new();
if (mont == NULL)
goto err;
if (!BN_MONT_CTX_set(mont, a, ctx))
goto err;
for (i = 0; i < checks; i++) {
/* 1 < check < a-1 */
if (!BN_priv_rand_range(check, A3) || !BN_add_word(check, 2))
goto err;
j = witness(check, a, A1, A1_odd, k, ctx, mont);
if (j == -1)
goto err;
if (j) {
ret = 0;
goto err;
}
if (!BN_GENCB_call(cb, 1, i))
goto err;
}
ret = 1;
err:
if (ctx != NULL) {
BN_CTX_end(ctx);
if (ctx_passed == NULL)
BN_CTX_free(ctx);
}
BN_MONT_CTX_free(mont);
return ret;
}
static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
const BIGNUM *a1_odd, int k, BN_CTX *ctx,
BN_MONT_CTX *mont)
{
if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont)) /* w := w^a1_odd mod a */
return -1;
if (BN_is_one(w))
return 0; /* probably prime */
if (BN_cmp(w, a1) == 0)
return 0; /* w == -1 (mod a), 'a' is probably prime */
while (--k) {
if (!BN_mod_mul(w, w, w, a, ctx)) /* w := w^2 mod a */
return -1;
if (BN_is_one(w))
return 1; /* 'a' is composite, otherwise a previous 'w'
* would have been == -1 (mod 'a') */
if (BN_cmp(w, a1) == 0)
return 0; /* w == -1 (mod a), 'a' is probably prime */
}
/*
* If we get here, 'w' is the (a-1)/2-th power of the original 'w', and
* it is neither -1 nor +1 -- so 'a' cannot be prime
*/
bn_check_top(w);
return 1;
}
static int probable_prime(BIGNUM *rnd, int bits, prime_t *mods)
{
int i;
BN_ULONG delta;
BN_ULONG maxdelta = BN_MASK2 - primes[NUMPRIMES - 1];
char is_single_word = bits <= BN_BITS2;
again:
/* TODO: Not all primes are private */
if (!BN_priv_rand(rnd, bits, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ODD))
return 0;
/* we now have a random number 'rnd' to test. */
for (i = 1; i < NUMPRIMES; i++) {
BN_ULONG mod = BN_mod_word(rnd, (BN_ULONG)primes[i]);
if (mod == (BN_ULONG)-1)
return 0;
mods[i] = (prime_t) mod;
}
/*
* If bits is so small that it fits into a single word then we
* additionally don't want to exceed that many bits.
*/
if (is_single_word) {
BN_ULONG size_limit;
if (bits == BN_BITS2) {
/*
* Shifting by this much has undefined behaviour so we do it a
* different way
*/
size_limit = ~((BN_ULONG)0) - BN_get_word(rnd);
} else {
size_limit = (((BN_ULONG)1) << bits) - BN_get_word(rnd) - 1;
}
if (size_limit < maxdelta)
maxdelta = size_limit;
}
delta = 0;
loop:
if (is_single_word) {
BN_ULONG rnd_word = BN_get_word(rnd);
/*-
* In the case that the candidate prime is a single word then
* we check that:
* 1) It's greater than primes[i] because we shouldn't reject
* 3 as being a prime number because it's a multiple of
* three.
* 2) That it's not a multiple of a known prime. We don't
* check that rnd-1 is also coprime to all the known
* primes because there aren't many small primes where
* that's true.
*/
for (i = 1; i < NUMPRIMES && primes[i] < rnd_word; i++) {
if ((mods[i] + delta) % primes[i] == 0) {
delta += 2;
if (delta > maxdelta)
goto again;
goto loop;
}
}
} else {
for (i = 1; i < NUMPRIMES; i++) {
/*
* check that rnd is not a prime and also that gcd(rnd-1,primes)
* == 1 (except for 2)
*/
if (((mods[i] + delta) % primes[i]) <= 1) {
delta += 2;
if (delta > maxdelta)
goto again;
goto loop;
}
}
}
if (!BN_add_word(rnd, delta))
return 0;
if (BN_num_bits(rnd) != bits)
goto again;
bn_check_top(rnd);
return 1;
}
int bn_probable_prime_dh(BIGNUM *rnd, int bits,
const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx)
{
int i, ret = 0;
BIGNUM *t1;
BN_CTX_start(ctx);
if ((t1 = BN_CTX_get(ctx)) == NULL)
goto err;
if (!BN_rand(rnd, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD))
goto err;
/* we need ((rnd-rem) % add) == 0 */
if (!BN_mod(t1, rnd, add, ctx))
goto err;
if (!BN_sub(rnd, rnd, t1))
goto err;
if (rem == NULL) {
if (!BN_add_word(rnd, 1))
goto err;
} else {
if (!BN_add(rnd, rnd, rem))
goto err;
}
/* we now have a random number 'rand' to test. */
loop:
for (i = 1; i < NUMPRIMES; i++) {
/* check that rnd is a prime */
BN_ULONG mod = BN_mod_word(rnd, (BN_ULONG)primes[i]);
if (mod == (BN_ULONG)-1)
goto err;
if (mod <= 1) {
if (!BN_add(rnd, rnd, add))
goto err;
goto loop;
}
}
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(rnd);
return ret;
}
static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
const BIGNUM *rem, BN_CTX *ctx)
{
int i, ret = 0;
BIGNUM *t1, *qadd, *q;
bits--;
BN_CTX_start(ctx);
t1 = BN_CTX_get(ctx);
q = BN_CTX_get(ctx);
qadd = BN_CTX_get(ctx);
if (qadd == NULL)
goto err;
if (!BN_rshift1(qadd, padd))
goto err;
if (!BN_rand(q, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD))
goto err;
/* we need ((rnd-rem) % add) == 0 */
if (!BN_mod(t1, q, qadd, ctx))
goto err;
if (!BN_sub(q, q, t1))
goto err;
if (rem == NULL) {
if (!BN_add_word(q, 1))
goto err;
} else {
if (!BN_rshift1(t1, rem))
goto err;
if (!BN_add(q, q, t1))
goto err;
}
/* we now have a random number 'rand' to test. */
if (!BN_lshift1(p, q))
goto err;
if (!BN_add_word(p, 1))
goto err;
loop:
for (i = 1; i < NUMPRIMES; i++) {
/* check that p and q are prime */
/*
* check that for p and q gcd(p-1,primes) == 1 (except for 2)
*/
BN_ULONG pmod = BN_mod_word(p, (BN_ULONG)primes[i]);
BN_ULONG qmod = BN_mod_word(q, (BN_ULONG)primes[i]);
if (pmod == (BN_ULONG)-1 || qmod == (BN_ULONG)-1)
goto err;
if (pmod == 0 || qmod == 0) {
if (!BN_add(p, p, padd))
goto err;
if (!BN_add(q, q, qadd))
goto err;
goto loop;
}
}
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(p);
return ret;
}

View file

@ -0,0 +1,273 @@
/*
* WARNING: do not edit!
* Generated by crypto/bn/bn_prime.pl
*
* Copyright 1998-2019 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
typedef unsigned short prime_t;
# define NUMPRIMES 2048
static const prime_t primes[2048] = {
2, 3, 5, 7, 11, 13, 17, 19,
23, 29, 31, 37, 41, 43, 47, 53,
59, 61, 67, 71, 73, 79, 83, 89,
97, 101, 103, 107, 109, 113, 127, 131,
137, 139, 149, 151, 157, 163, 167, 173,
179, 181, 191, 193, 197, 199, 211, 223,
227, 229, 233, 239, 241, 251, 257, 263,
269, 271, 277, 281, 283, 293, 307, 311,
313, 317, 331, 337, 347, 349, 353, 359,
367, 373, 379, 383, 389, 397, 401, 409,
419, 421, 431, 433, 439, 443, 449, 457,
461, 463, 467, 479, 487, 491, 499, 503,
509, 521, 523, 541, 547, 557, 563, 569,
571, 577, 587, 593, 599, 601, 607, 613,
617, 619, 631, 641, 643, 647, 653, 659,
661, 673, 677, 683, 691, 701, 709, 719,
727, 733, 739, 743, 751, 757, 761, 769,
773, 787, 797, 809, 811, 821, 823, 827,
829, 839, 853, 857, 859, 863, 877, 881,
883, 887, 907, 911, 919, 929, 937, 941,
947, 953, 967, 971, 977, 983, 991, 997,
1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,
1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,
1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,
1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693,
1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747,
1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877,
1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949,
1951, 1973, 1979, 1987, 1993, 1997, 1999, 2003,
2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069,
2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203,
2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267,
2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311,
2333, 2339, 2341, 2347, 2351, 2357, 2371, 2377,
2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503,
2521, 2531, 2539, 2543, 2549, 2551, 2557, 2579,
2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657,
2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693,
2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801,
2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861,
2879, 2887, 2897, 2903, 2909, 2917, 2927, 2939,
2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011,
3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167,
3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221,
3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301,
3307, 3313, 3319, 3323, 3329, 3331, 3343, 3347,
3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491,
3499, 3511, 3517, 3527, 3529, 3533, 3539, 3541,
3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607,
3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671,
3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797,
3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863,
3877, 3881, 3889, 3907, 3911, 3917, 3919, 3923,
3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003,
4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129,
4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211,
4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259,
4261, 4271, 4273, 4283, 4289, 4297, 4327, 4337,
4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481,
4483, 4493, 4507, 4513, 4517, 4519, 4523, 4547,
4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621,
4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673,
4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751,
4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813,
4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909,
4919, 4931, 4933, 4937, 4943, 4951, 4957, 4967,
4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011,
5021, 5023, 5039, 5051, 5059, 5077, 5081, 5087,
5099, 5101, 5107, 5113, 5119, 5147, 5153, 5167,
5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233,
5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309,
5323, 5333, 5347, 5351, 5381, 5387, 5393, 5399,
5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443,
5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507,
5519, 5521, 5527, 5531, 5557, 5563, 5569, 5573,
5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653,
5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711,
5717, 5737, 5741, 5743, 5749, 5779, 5783, 5791,
5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849,
5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897,
5903, 5923, 5927, 5939, 5953, 5981, 5987, 6007,
6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073,
6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133,
6143, 6151, 6163, 6173, 6197, 6199, 6203, 6211,
6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271,
6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329,
6337, 6343, 6353, 6359, 6361, 6367, 6373, 6379,
6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473,
6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563,
6569, 6571, 6577, 6581, 6599, 6607, 6619, 6637,
6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701,
6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779,
6781, 6791, 6793, 6803, 6823, 6827, 6829, 6833,
6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907,
6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971,
6977, 6983, 6991, 6997, 7001, 7013, 7019, 7027,
7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121,
7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207,
7211, 7213, 7219, 7229, 7237, 7243, 7247, 7253,
7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349,
7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457,
7459, 7477, 7481, 7487, 7489, 7499, 7507, 7517,
7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561,
7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621,
7639, 7643, 7649, 7669, 7673, 7681, 7687, 7691,
7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757,
7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853,
7867, 7873, 7877, 7879, 7883, 7901, 7907, 7919,
7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009,
8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087,
8089, 8093, 8101, 8111, 8117, 8123, 8147, 8161,
8167, 8171, 8179, 8191, 8209, 8219, 8221, 8231,
8233, 8237, 8243, 8263, 8269, 8273, 8287, 8291,
8293, 8297, 8311, 8317, 8329, 8353, 8363, 8369,
8377, 8387, 8389, 8419, 8423, 8429, 8431, 8443,
8447, 8461, 8467, 8501, 8513, 8521, 8527, 8537,
8539, 8543, 8563, 8573, 8581, 8597, 8599, 8609,
8623, 8627, 8629, 8641, 8647, 8663, 8669, 8677,
8681, 8689, 8693, 8699, 8707, 8713, 8719, 8731,
8737, 8741, 8747, 8753, 8761, 8779, 8783, 8803,
8807, 8819, 8821, 8831, 8837, 8839, 8849, 8861,
8863, 8867, 8887, 8893, 8923, 8929, 8933, 8941,
8951, 8963, 8969, 8971, 8999, 9001, 9007, 9011,
9013, 9029, 9041, 9043, 9049, 9059, 9067, 9091,
9103, 9109, 9127, 9133, 9137, 9151, 9157, 9161,
9173, 9181, 9187, 9199, 9203, 9209, 9221, 9227,
9239, 9241, 9257, 9277, 9281, 9283, 9293, 9311,
9319, 9323, 9337, 9341, 9343, 9349, 9371, 9377,
9391, 9397, 9403, 9413, 9419, 9421, 9431, 9433,
9437, 9439, 9461, 9463, 9467, 9473, 9479, 9491,
9497, 9511, 9521, 9533, 9539, 9547, 9551, 9587,
9601, 9613, 9619, 9623, 9629, 9631, 9643, 9649,
9661, 9677, 9679, 9689, 9697, 9719, 9721, 9733,
9739, 9743, 9749, 9767, 9769, 9781, 9787, 9791,
9803, 9811, 9817, 9829, 9833, 9839, 9851, 9857,
9859, 9871, 9883, 9887, 9901, 9907, 9923, 9929,
9931, 9941, 9949, 9967, 9973, 10007, 10009, 10037,
10039, 10061, 10067, 10069, 10079, 10091, 10093, 10099,
10103, 10111, 10133, 10139, 10141, 10151, 10159, 10163,
10169, 10177, 10181, 10193, 10211, 10223, 10243, 10247,
10253, 10259, 10267, 10271, 10273, 10289, 10301, 10303,
10313, 10321, 10331, 10333, 10337, 10343, 10357, 10369,
10391, 10399, 10427, 10429, 10433, 10453, 10457, 10459,
10463, 10477, 10487, 10499, 10501, 10513, 10529, 10531,
10559, 10567, 10589, 10597, 10601, 10607, 10613, 10627,
10631, 10639, 10651, 10657, 10663, 10667, 10687, 10691,
10709, 10711, 10723, 10729, 10733, 10739, 10753, 10771,
10781, 10789, 10799, 10831, 10837, 10847, 10853, 10859,
10861, 10867, 10883, 10889, 10891, 10903, 10909, 10937,
10939, 10949, 10957, 10973, 10979, 10987, 10993, 11003,
11027, 11047, 11057, 11059, 11069, 11071, 11083, 11087,
11093, 11113, 11117, 11119, 11131, 11149, 11159, 11161,
11171, 11173, 11177, 11197, 11213, 11239, 11243, 11251,
11257, 11261, 11273, 11279, 11287, 11299, 11311, 11317,
11321, 11329, 11351, 11353, 11369, 11383, 11393, 11399,
11411, 11423, 11437, 11443, 11447, 11467, 11471, 11483,
11489, 11491, 11497, 11503, 11519, 11527, 11549, 11551,
11579, 11587, 11593, 11597, 11617, 11621, 11633, 11657,
11677, 11681, 11689, 11699, 11701, 11717, 11719, 11731,
11743, 11777, 11779, 11783, 11789, 11801, 11807, 11813,
11821, 11827, 11831, 11833, 11839, 11863, 11867, 11887,
11897, 11903, 11909, 11923, 11927, 11933, 11939, 11941,
11953, 11959, 11969, 11971, 11981, 11987, 12007, 12011,
12037, 12041, 12043, 12049, 12071, 12073, 12097, 12101,
12107, 12109, 12113, 12119, 12143, 12149, 12157, 12161,
12163, 12197, 12203, 12211, 12227, 12239, 12241, 12251,
12253, 12263, 12269, 12277, 12281, 12289, 12301, 12323,
12329, 12343, 12347, 12373, 12377, 12379, 12391, 12401,
12409, 12413, 12421, 12433, 12437, 12451, 12457, 12473,
12479, 12487, 12491, 12497, 12503, 12511, 12517, 12527,
12539, 12541, 12547, 12553, 12569, 12577, 12583, 12589,
12601, 12611, 12613, 12619, 12637, 12641, 12647, 12653,
12659, 12671, 12689, 12697, 12703, 12713, 12721, 12739,
12743, 12757, 12763, 12781, 12791, 12799, 12809, 12821,
12823, 12829, 12841, 12853, 12889, 12893, 12899, 12907,
12911, 12917, 12919, 12923, 12941, 12953, 12959, 12967,
12973, 12979, 12983, 13001, 13003, 13007, 13009, 13033,
13037, 13043, 13049, 13063, 13093, 13099, 13103, 13109,
13121, 13127, 13147, 13151, 13159, 13163, 13171, 13177,
13183, 13187, 13217, 13219, 13229, 13241, 13249, 13259,
13267, 13291, 13297, 13309, 13313, 13327, 13331, 13337,
13339, 13367, 13381, 13397, 13399, 13411, 13417, 13421,
13441, 13451, 13457, 13463, 13469, 13477, 13487, 13499,
13513, 13523, 13537, 13553, 13567, 13577, 13591, 13597,
13613, 13619, 13627, 13633, 13649, 13669, 13679, 13681,
13687, 13691, 13693, 13697, 13709, 13711, 13721, 13723,
13729, 13751, 13757, 13759, 13763, 13781, 13789, 13799,
13807, 13829, 13831, 13841, 13859, 13873, 13877, 13879,
13883, 13901, 13903, 13907, 13913, 13921, 13931, 13933,
13963, 13967, 13997, 13999, 14009, 14011, 14029, 14033,
14051, 14057, 14071, 14081, 14083, 14087, 14107, 14143,
14149, 14153, 14159, 14173, 14177, 14197, 14207, 14221,
14243, 14249, 14251, 14281, 14293, 14303, 14321, 14323,
14327, 14341, 14347, 14369, 14387, 14389, 14401, 14407,
14411, 14419, 14423, 14431, 14437, 14447, 14449, 14461,
14479, 14489, 14503, 14519, 14533, 14537, 14543, 14549,
14551, 14557, 14561, 14563, 14591, 14593, 14621, 14627,
14629, 14633, 14639, 14653, 14657, 14669, 14683, 14699,
14713, 14717, 14723, 14731, 14737, 14741, 14747, 14753,
14759, 14767, 14771, 14779, 14783, 14797, 14813, 14821,
14827, 14831, 14843, 14851, 14867, 14869, 14879, 14887,
14891, 14897, 14923, 14929, 14939, 14947, 14951, 14957,
14969, 14983, 15013, 15017, 15031, 15053, 15061, 15073,
15077, 15083, 15091, 15101, 15107, 15121, 15131, 15137,
15139, 15149, 15161, 15173, 15187, 15193, 15199, 15217,
15227, 15233, 15241, 15259, 15263, 15269, 15271, 15277,
15287, 15289, 15299, 15307, 15313, 15319, 15329, 15331,
15349, 15359, 15361, 15373, 15377, 15383, 15391, 15401,
15413, 15427, 15439, 15443, 15451, 15461, 15467, 15473,
15493, 15497, 15511, 15527, 15541, 15551, 15559, 15569,
15581, 15583, 15601, 15607, 15619, 15629, 15641, 15643,
15647, 15649, 15661, 15667, 15671, 15679, 15683, 15727,
15731, 15733, 15737, 15739, 15749, 15761, 15767, 15773,
15787, 15791, 15797, 15803, 15809, 15817, 15823, 15859,
15877, 15881, 15887, 15889, 15901, 15907, 15913, 15919,
15923, 15937, 15959, 15971, 15973, 15991, 16001, 16007,
16033, 16057, 16061, 16063, 16067, 16069, 16073, 16087,
16091, 16097, 16103, 16111, 16127, 16139, 16141, 16183,
16187, 16189, 16193, 16217, 16223, 16229, 16231, 16249,
16253, 16267, 16273, 16301, 16319, 16333, 16339, 16349,
16361, 16363, 16369, 16381, 16411, 16417, 16421, 16427,
16433, 16447, 16451, 16453, 16477, 16481, 16487, 16493,
16519, 16529, 16547, 16553, 16561, 16567, 16573, 16603,
16607, 16619, 16631, 16633, 16649, 16651, 16657, 16661,
16673, 16691, 16693, 16699, 16703, 16729, 16741, 16747,
16759, 16763, 16787, 16811, 16823, 16829, 16831, 16843,
16871, 16879, 16883, 16889, 16901, 16903, 16921, 16927,
16931, 16937, 16943, 16963, 16979, 16981, 16987, 16993,
17011, 17021, 17027, 17029, 17033, 17041, 17047, 17053,
17077, 17093, 17099, 17107, 17117, 17123, 17137, 17159,
17167, 17183, 17189, 17191, 17203, 17207, 17209, 17231,
17239, 17257, 17291, 17293, 17299, 17317, 17321, 17327,
17333, 17341, 17351, 17359, 17377, 17383, 17387, 17389,
17393, 17401, 17417, 17419, 17431, 17443, 17449, 17467,
17471, 17477, 17483, 17489, 17491, 17497, 17509, 17519,
17539, 17551, 17569, 17573, 17579, 17581, 17597, 17599,
17609, 17623, 17627, 17657, 17659, 17669, 17681, 17683,
17707, 17713, 17729, 17737, 17747, 17749, 17761, 17783,
17789, 17791, 17807, 17827, 17837, 17839, 17851, 17863,
};

View file

@ -0,0 +1,48 @@
#! /usr/bin/env perl
# Copyright 1998-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# Output year depends on the year of the script.
my $YEAR = [localtime([stat($0)]->[9])]->[5] + 1900;
print <<"EOF";
/*
* WARNING: do not edit!
* Generated by crypto/bn/bn_prime.pl
*
* Copyright 1998-$YEAR The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
EOF
my $num = shift || 2048;
my @primes = ( 2 );
my $p = 1;
loop: while ($#primes < $num-1) {
$p += 2;
my $s = int(sqrt($p));
for (my $i = 0; defined($primes[$i]) && $primes[$i] <= $s; $i++) {
next loop if ($p % $primes[$i]) == 0;
}
push(@primes, $p);
}
print "typedef unsigned short prime_t;\n";
printf "# define NUMPRIMES %d\n\n", $num;
printf "static const prime_t primes[%d] = {", $num;
for (my $i = 0; $i <= $#primes; $i++) {
printf "\n " if ($i % 8) == 0;
printf " %5d,", $primes[$i];
}
print "\n};\n";

View file

@ -0,0 +1,345 @@
/*
* Copyright 1995-2017 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include "internal/ctype.h"
#include <limits.h>
#include "internal/cryptlib.h"
#include <openssl/buffer.h>
#include "bn_lcl.h"
static const char Hex[] = "0123456789ABCDEF";
/* Must 'OPENSSL_free' the returned data */
char *BN_bn2hex(const BIGNUM *a)
{
int i, j, v, z = 0;
char *buf;
char *p;
if (BN_is_zero(a))
return OPENSSL_strdup("0");
buf = OPENSSL_malloc(a->top * BN_BYTES * 2 + 2);
if (buf == NULL) {
BNerr(BN_F_BN_BN2HEX, ERR_R_MALLOC_FAILURE);
goto err;
}
p = buf;
if (a->neg)
*p++ = '-';
for (i = a->top - 1; i >= 0; i--) {
for (j = BN_BITS2 - 8; j >= 0; j -= 8) {
/* strip leading zeros */
v = (int)((a->d[i] >> j) & 0xff);
if (z || v != 0) {
*p++ = Hex[v >> 4];
*p++ = Hex[v & 0x0f];
z = 1;
}
}
}
*p = '\0';
err:
return buf;
}
/* Must 'OPENSSL_free' the returned data */
char *BN_bn2dec(const BIGNUM *a)
{
int i = 0, num, ok = 0, n, tbytes;
char *buf = NULL;
char *p;
BIGNUM *t = NULL;
BN_ULONG *bn_data = NULL, *lp;
int bn_data_num;
/*-
* get an upper bound for the length of the decimal integer
* num <= (BN_num_bits(a) + 1) * log(2)
* <= 3 * BN_num_bits(a) * 0.101 + log(2) + 1 (rounding error)
* <= 3 * BN_num_bits(a) / 10 + 3 * BN_num_bits / 1000 + 1 + 1
*/
i = BN_num_bits(a) * 3;
num = (i / 10 + i / 1000 + 1) + 1;
tbytes = num + 3; /* negative and terminator and one spare? */
bn_data_num = num / BN_DEC_NUM + 1;
bn_data = OPENSSL_malloc(bn_data_num * sizeof(BN_ULONG));
buf = OPENSSL_malloc(tbytes);
if (buf == NULL || bn_data == NULL) {
BNerr(BN_F_BN_BN2DEC, ERR_R_MALLOC_FAILURE);
goto err;
}
if ((t = BN_dup(a)) == NULL)
goto err;
p = buf;
lp = bn_data;
if (BN_is_zero(t)) {
*p++ = '0';
*p++ = '\0';
} else {
if (BN_is_negative(t))
*p++ = '-';
while (!BN_is_zero(t)) {
if (lp - bn_data >= bn_data_num)
goto err;
*lp = BN_div_word(t, BN_DEC_CONV);
if (*lp == (BN_ULONG)-1)
goto err;
lp++;
}
lp--;
/*
* We now have a series of blocks, BN_DEC_NUM chars in length, where
* the last one needs truncation. The blocks need to be reversed in
* order.
*/
n = BIO_snprintf(p, tbytes - (size_t)(p - buf), BN_DEC_FMT1, *lp);
if (n < 0)
goto err;
p += n;
while (lp != bn_data) {
lp--;
n = BIO_snprintf(p, tbytes - (size_t)(p - buf), BN_DEC_FMT2, *lp);
if (n < 0)
goto err;
p += n;
}
}
ok = 1;
err:
OPENSSL_free(bn_data);
BN_free(t);
if (ok)
return buf;
OPENSSL_free(buf);
return NULL;
}
int BN_hex2bn(BIGNUM **bn, const char *a)
{
BIGNUM *ret = NULL;
BN_ULONG l = 0;
int neg = 0, h, m, i, j, k, c;
int num;
if (a == NULL || *a == '\0')
return 0;
if (*a == '-') {
neg = 1;
a++;
}
for (i = 0; i <= INT_MAX / 4 && ossl_isxdigit(a[i]); i++)
continue;
if (i == 0 || i > INT_MAX / 4)
goto err;
num = i + neg;
if (bn == NULL)
return num;
/* a is the start of the hex digits, and it is 'i' long */
if (*bn == NULL) {
if ((ret = BN_new()) == NULL)
return 0;
} else {
ret = *bn;
BN_zero(ret);
}
/* i is the number of hex digits */
if (bn_expand(ret, i * 4) == NULL)
goto err;
j = i; /* least significant 'hex' */
m = 0;
h = 0;
while (j > 0) {
m = (BN_BYTES * 2 <= j) ? BN_BYTES * 2 : j;
l = 0;
for (;;) {
c = a[j - m];
k = OPENSSL_hexchar2int(c);
if (k < 0)
k = 0; /* paranoia */
l = (l << 4) | k;
if (--m <= 0) {
ret->d[h++] = l;
break;
}
}
j -= BN_BYTES * 2;
}
ret->top = h;
bn_correct_top(ret);
*bn = ret;
bn_check_top(ret);
/* Don't set the negative flag if it's zero. */
if (ret->top != 0)
ret->neg = neg;
return num;
err:
if (*bn == NULL)
BN_free(ret);
return 0;
}
int BN_dec2bn(BIGNUM **bn, const char *a)
{
BIGNUM *ret = NULL;
BN_ULONG l = 0;
int neg = 0, i, j;
int num;
if (a == NULL || *a == '\0')
return 0;
if (*a == '-') {
neg = 1;
a++;
}
for (i = 0; i <= INT_MAX / 4 && ossl_isdigit(a[i]); i++)
continue;
if (i == 0 || i > INT_MAX / 4)
goto err;
num = i + neg;
if (bn == NULL)
return num;
/*
* a is the start of the digits, and it is 'i' long. We chop it into
* BN_DEC_NUM digits at a time
*/
if (*bn == NULL) {
if ((ret = BN_new()) == NULL)
return 0;
} else {
ret = *bn;
BN_zero(ret);
}
/* i is the number of digits, a bit of an over expand */
if (bn_expand(ret, i * 4) == NULL)
goto err;
j = BN_DEC_NUM - i % BN_DEC_NUM;
if (j == BN_DEC_NUM)
j = 0;
l = 0;
while (--i >= 0) {
l *= 10;
l += *a - '0';
a++;
if (++j == BN_DEC_NUM) {
if (!BN_mul_word(ret, BN_DEC_CONV)
|| !BN_add_word(ret, l))
goto err;
l = 0;
j = 0;
}
}
bn_correct_top(ret);
*bn = ret;
bn_check_top(ret);
/* Don't set the negative flag if it's zero. */
if (ret->top != 0)
ret->neg = neg;
return num;
err:
if (*bn == NULL)
BN_free(ret);
return 0;
}
int BN_asc2bn(BIGNUM **bn, const char *a)
{
const char *p = a;
if (*p == '-')
p++;
if (p[0] == '0' && (p[1] == 'X' || p[1] == 'x')) {
if (!BN_hex2bn(bn, p + 2))
return 0;
} else {
if (!BN_dec2bn(bn, p))
return 0;
}
/* Don't set the negative flag if it's zero. */
if (*a == '-' && (*bn)->top != 0)
(*bn)->neg = 1;
return 1;
}
# ifndef OPENSSL_NO_STDIO
int BN_print_fp(FILE *fp, const BIGNUM *a)
{
BIO *b;
int ret;
if ((b = BIO_new(BIO_s_file())) == NULL)
return 0;
BIO_set_fp(b, fp, BIO_NOCLOSE);
ret = BN_print(b, a);
BIO_free(b);
return ret;
}
# endif
int BN_print(BIO *bp, const BIGNUM *a)
{
int i, j, v, z = 0;
int ret = 0;
if ((a->neg) && BIO_write(bp, "-", 1) != 1)
goto end;
if (BN_is_zero(a) && BIO_write(bp, "0", 1) != 1)
goto end;
for (i = a->top - 1; i >= 0; i--) {
for (j = BN_BITS2 - 4; j >= 0; j -= 4) {
/* strip leading zeros */
v = (int)((a->d[i] >> j) & 0x0f);
if (z || v != 0) {
if (BIO_write(bp, &Hex[v], 1) != 1)
goto end;
z = 1;
}
}
}
ret = 1;
end:
return ret;
}
char *BN_options(void)
{
static int init = 0;
static char data[16];
if (!init) {
init++;
#ifdef BN_LLONG
BIO_snprintf(data, sizeof(data), "bn(%zu,%zu)",
sizeof(BN_ULLONG) * 8, sizeof(BN_ULONG) * 8);
#else
BIO_snprintf(data, sizeof(data), "bn(%zu,%zu)",
sizeof(BN_ULONG) * 8, sizeof(BN_ULONG) * 8);
#endif
}
return data;
}

View file

@ -0,0 +1,268 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include <time.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
#include <openssl/rand.h>
#include <openssl/sha.h>
typedef enum bnrand_flag_e {
NORMAL, TESTING, PRIVATE
} BNRAND_FLAG;
static int bnrand(BNRAND_FLAG flag, BIGNUM *rnd, int bits, int top, int bottom)
{
unsigned char *buf = NULL;
int b, ret = 0, bit, bytes, mask;
if (bits == 0) {
if (top != BN_RAND_TOP_ANY || bottom != BN_RAND_BOTTOM_ANY)
goto toosmall;
BN_zero(rnd);
return 1;
}
if (bits < 0 || (bits == 1 && top > 0))
goto toosmall;
bytes = (bits + 7) / 8;
bit = (bits - 1) % 8;
mask = 0xff << (bit + 1);
buf = OPENSSL_malloc(bytes);
if (buf == NULL) {
BNerr(BN_F_BNRAND, ERR_R_MALLOC_FAILURE);
goto err;
}
/* make a random number and set the top and bottom bits */
b = flag == NORMAL ? RAND_bytes(buf, bytes) : RAND_priv_bytes(buf, bytes);
if (b <= 0)
goto err;
if (flag == TESTING) {
/*
* generate patterns that are more likely to trigger BN library bugs
*/
int i;
unsigned char c;
for (i = 0; i < bytes; i++) {
if (RAND_bytes(&c, 1) <= 0)
goto err;
if (c >= 128 && i > 0)
buf[i] = buf[i - 1];
else if (c < 42)
buf[i] = 0;
else if (c < 84)
buf[i] = 255;
}
}
if (top >= 0) {
if (top) {
if (bit == 0) {
buf[0] = 1;
buf[1] |= 0x80;
} else {
buf[0] |= (3 << (bit - 1));
}
} else {
buf[0] |= (1 << bit);
}
}
buf[0] &= ~mask;
if (bottom) /* set bottom bit if requested */
buf[bytes - 1] |= 1;
if (!BN_bin2bn(buf, bytes, rnd))
goto err;
ret = 1;
err:
OPENSSL_clear_free(buf, bytes);
bn_check_top(rnd);
return ret;
toosmall:
BNerr(BN_F_BNRAND, BN_R_BITS_TOO_SMALL);
return 0;
}
int BN_rand(BIGNUM *rnd, int bits, int top, int bottom)
{
return bnrand(NORMAL, rnd, bits, top, bottom);
}
int BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom)
{
return bnrand(TESTING, rnd, bits, top, bottom);
}
int BN_priv_rand(BIGNUM *rnd, int bits, int top, int bottom)
{
return bnrand(PRIVATE, rnd, bits, top, bottom);
}
/* random number r: 0 <= r < range */
static int bnrand_range(BNRAND_FLAG flag, BIGNUM *r, const BIGNUM *range)
{
int n;
int count = 100;
if (range->neg || BN_is_zero(range)) {
BNerr(BN_F_BNRAND_RANGE, BN_R_INVALID_RANGE);
return 0;
}
n = BN_num_bits(range); /* n > 0 */
/* BN_is_bit_set(range, n - 1) always holds */
if (n == 1)
BN_zero(r);
else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3)) {
/*
* range = 100..._2, so 3*range (= 11..._2) is exactly one bit longer
* than range
*/
do {
if (!bnrand(flag, r, n + 1, BN_RAND_TOP_ANY, BN_RAND_BOTTOM_ANY))
return 0;
/*
* If r < 3*range, use r := r MOD range (which is either r, r -
* range, or r - 2*range). Otherwise, iterate once more. Since
* 3*range = 11..._2, each iteration succeeds with probability >=
* .75.
*/
if (BN_cmp(r, range) >= 0) {
if (!BN_sub(r, r, range))
return 0;
if (BN_cmp(r, range) >= 0)
if (!BN_sub(r, r, range))
return 0;
}
if (!--count) {
BNerr(BN_F_BNRAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
return 0;
}
}
while (BN_cmp(r, range) >= 0);
} else {
do {
/* range = 11..._2 or range = 101..._2 */
if (!bnrand(flag, r, n, BN_RAND_TOP_ANY, BN_RAND_BOTTOM_ANY))
return 0;
if (!--count) {
BNerr(BN_F_BNRAND_RANGE, BN_R_TOO_MANY_ITERATIONS);
return 0;
}
}
while (BN_cmp(r, range) >= 0);
}
bn_check_top(r);
return 1;
}
int BN_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bnrand_range(NORMAL, r, range);
}
int BN_priv_rand_range(BIGNUM *r, const BIGNUM *range)
{
return bnrand_range(PRIVATE, r, range);
}
int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom)
{
return BN_rand(rnd, bits, top, bottom);
}
int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range)
{
return BN_rand_range(r, range);
}
/*
* BN_generate_dsa_nonce generates a random number 0 <= out < range. Unlike
* BN_rand_range, it also includes the contents of |priv| and |message| in
* the generation so that an RNG failure isn't fatal as long as |priv|
* remains secret. This is intended for use in DSA and ECDSA where an RNG
* weakness leads directly to private key exposure unless this function is
* used.
*/
int BN_generate_dsa_nonce(BIGNUM *out, const BIGNUM *range,
const BIGNUM *priv, const unsigned char *message,
size_t message_len, BN_CTX *ctx)
{
SHA512_CTX sha;
/*
* We use 512 bits of random data per iteration to ensure that we have at
* least |range| bits of randomness.
*/
unsigned char random_bytes[64];
unsigned char digest[SHA512_DIGEST_LENGTH];
unsigned done, todo;
/* We generate |range|+8 bytes of random output. */
const unsigned num_k_bytes = BN_num_bytes(range) + 8;
unsigned char private_bytes[96];
unsigned char *k_bytes;
int ret = 0;
k_bytes = OPENSSL_malloc(num_k_bytes);
if (k_bytes == NULL)
goto err;
/* We copy |priv| into a local buffer to avoid exposing its length. */
todo = sizeof(priv->d[0]) * priv->top;
if (todo > sizeof(private_bytes)) {
/*
* No reasonable DSA or ECDSA key should have a private key this
* large and we don't handle this case in order to avoid leaking the
* length of the private key.
*/
BNerr(BN_F_BN_GENERATE_DSA_NONCE, BN_R_PRIVATE_KEY_TOO_LARGE);
goto err;
}
memcpy(private_bytes, priv->d, todo);
memset(private_bytes + todo, 0, sizeof(private_bytes) - todo);
for (done = 0; done < num_k_bytes;) {
if (RAND_priv_bytes(random_bytes, sizeof(random_bytes)) != 1)
goto err;
SHA512_Init(&sha);
SHA512_Update(&sha, &done, sizeof(done));
SHA512_Update(&sha, private_bytes, sizeof(private_bytes));
SHA512_Update(&sha, message, message_len);
SHA512_Update(&sha, random_bytes, sizeof(random_bytes));
SHA512_Final(digest, &sha);
todo = num_k_bytes - done;
if (todo > SHA512_DIGEST_LENGTH)
todo = SHA512_DIGEST_LENGTH;
memcpy(k_bytes + done, digest, todo);
done += todo;
}
if (!BN_bin2bn(k_bytes, num_k_bytes, out))
goto err;
if (BN_mod(out, out, range, ctx) != 1)
goto err;
ret = 1;
err:
OPENSSL_free(k_bytes);
OPENSSL_cleanse(private_bytes, sizeof(private_bytes));
return ret;
}

View file

@ -0,0 +1,194 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
void BN_RECP_CTX_init(BN_RECP_CTX *recp)
{
memset(recp, 0, sizeof(*recp));
bn_init(&(recp->N));
bn_init(&(recp->Nr));
}
BN_RECP_CTX *BN_RECP_CTX_new(void)
{
BN_RECP_CTX *ret;
if ((ret = OPENSSL_zalloc(sizeof(*ret))) == NULL) {
BNerr(BN_F_BN_RECP_CTX_NEW, ERR_R_MALLOC_FAILURE);
return NULL;
}
bn_init(&(ret->N));
bn_init(&(ret->Nr));
ret->flags = BN_FLG_MALLOCED;
return ret;
}
void BN_RECP_CTX_free(BN_RECP_CTX *recp)
{
if (recp == NULL)
return;
BN_free(&recp->N);
BN_free(&recp->Nr);
if (recp->flags & BN_FLG_MALLOCED)
OPENSSL_free(recp);
}
int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx)
{
if (!BN_copy(&(recp->N), d))
return 0;
BN_zero(&(recp->Nr));
recp->num_bits = BN_num_bits(d);
recp->shift = 0;
return 1;
}
int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
BN_RECP_CTX *recp, BN_CTX *ctx)
{
int ret = 0;
BIGNUM *a;
const BIGNUM *ca;
BN_CTX_start(ctx);
if ((a = BN_CTX_get(ctx)) == NULL)
goto err;
if (y != NULL) {
if (x == y) {
if (!BN_sqr(a, x, ctx))
goto err;
} else {
if (!BN_mul(a, x, y, ctx))
goto err;
}
ca = a;
} else
ca = x; /* Just do the mod */
ret = BN_div_recp(NULL, r, ca, recp, ctx);
err:
BN_CTX_end(ctx);
bn_check_top(r);
return ret;
}
int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
BN_RECP_CTX *recp, BN_CTX *ctx)
{
int i, j, ret = 0;
BIGNUM *a, *b, *d, *r;
BN_CTX_start(ctx);
d = (dv != NULL) ? dv : BN_CTX_get(ctx);
r = (rem != NULL) ? rem : BN_CTX_get(ctx);
a = BN_CTX_get(ctx);
b = BN_CTX_get(ctx);
if (b == NULL)
goto err;
if (BN_ucmp(m, &(recp->N)) < 0) {
BN_zero(d);
if (!BN_copy(r, m)) {
BN_CTX_end(ctx);
return 0;
}
BN_CTX_end(ctx);
return 1;
}
/*
* We want the remainder Given input of ABCDEF / ab we need multiply
* ABCDEF by 3 digests of the reciprocal of ab
*/
/* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
i = BN_num_bits(m);
j = recp->num_bits << 1;
if (j > i)
i = j;
/* Nr := round(2^i / N) */
if (i != recp->shift)
recp->shift = BN_reciprocal(&(recp->Nr), &(recp->N), i, ctx);
/* BN_reciprocal could have returned -1 for an error */
if (recp->shift == -1)
goto err;
/*-
* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - BN_num_bits(N)))|
* = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - BN_num_bits(N)))|
* <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
* = |m/N|
*/
if (!BN_rshift(a, m, recp->num_bits))
goto err;
if (!BN_mul(b, a, &(recp->Nr), ctx))
goto err;
if (!BN_rshift(d, b, i - recp->num_bits))
goto err;
d->neg = 0;
if (!BN_mul(b, &(recp->N), d, ctx))
goto err;
if (!BN_usub(r, m, b))
goto err;
r->neg = 0;
j = 0;
while (BN_ucmp(r, &(recp->N)) >= 0) {
if (j++ > 2) {
BNerr(BN_F_BN_DIV_RECP, BN_R_BAD_RECIPROCAL);
goto err;
}
if (!BN_usub(r, r, &(recp->N)))
goto err;
if (!BN_add_word(d, 1))
goto err;
}
r->neg = BN_is_zero(r) ? 0 : m->neg;
d->neg = m->neg ^ recp->N.neg;
ret = 1;
err:
BN_CTX_end(ctx);
bn_check_top(dv);
bn_check_top(rem);
return ret;
}
/*
* len is the expected size of the result We actually calculate with an extra
* word of precision, so we can do faster division if the remainder is not
* required.
*/
/* r := 2^len / m */
int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx)
{
int ret = -1;
BIGNUM *t;
BN_CTX_start(ctx);
if ((t = BN_CTX_get(ctx)) == NULL)
goto err;
if (!BN_set_bit(t, len))
goto err;
if (!BN_div(r, NULL, t, m, ctx))
goto err;
ret = len;
err:
bn_check_top(r);
BN_CTX_end(ctx);
return ret;
}

View file

@ -0,0 +1,257 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <assert.h>
#include "internal/cryptlib.h"
#include "bn_lcl.h"
int BN_lshift1(BIGNUM *r, const BIGNUM *a)
{
register BN_ULONG *ap, *rp, t, c;
int i;
bn_check_top(r);
bn_check_top(a);
if (r != a) {
r->neg = a->neg;
if (bn_wexpand(r, a->top + 1) == NULL)
return 0;
r->top = a->top;
} else {
if (bn_wexpand(r, a->top + 1) == NULL)
return 0;
}
ap = a->d;
rp = r->d;
c = 0;
for (i = 0; i < a->top; i++) {
t = *(ap++);
*(rp++) = ((t << 1) | c) & BN_MASK2;
c = (t & BN_TBIT) ? 1 : 0;
}
if (c) {
*rp = 1;
r->top++;
}
bn_check_top(r);
return 1;
}
int BN_rshift1(BIGNUM *r, const BIGNUM *a)
{
BN_ULONG *ap, *rp, t, c;
int i, j;
bn_check_top(r);
bn_check_top(a);
if (BN_is_zero(a)) {
BN_zero(r);
return 1;
}
i = a->top;
ap = a->d;
j = i - (ap[i - 1] == 1);
if (a != r) {
if (bn_wexpand(r, j) == NULL)
return 0;
r->neg = a->neg;
}
rp = r->d;
t = ap[--i];
c = (t & 1) ? BN_TBIT : 0;
if (t >>= 1)
rp[i] = t;
while (i > 0) {
t = ap[--i];
rp[i] = ((t >> 1) & BN_MASK2) | c;
c = (t & 1) ? BN_TBIT : 0;
}
r->top = j;
if (!r->top)
r->neg = 0; /* don't allow negative zero */
bn_check_top(r);
return 1;
}
int BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
{
int ret;
if (n < 0) {
BNerr(BN_F_BN_LSHIFT, BN_R_INVALID_SHIFT);
return 0;
}
ret = bn_lshift_fixed_top(r, a, n);
bn_correct_top(r);
bn_check_top(r);
return ret;
}
/*
* In respect to shift factor the execution time is invariant of
* |n % BN_BITS2|, but not |n / BN_BITS2|. Or in other words pre-condition
* for constant-time-ness is |n < BN_BITS2| or |n / BN_BITS2| being
* non-secret.
*/
int bn_lshift_fixed_top(BIGNUM *r, const BIGNUM *a, int n)
{
int i, nw;
unsigned int lb, rb;
BN_ULONG *t, *f;
BN_ULONG l, m, rmask = 0;
assert(n >= 0);
bn_check_top(r);
bn_check_top(a);
nw = n / BN_BITS2;
if (bn_wexpand(r, a->top + nw + 1) == NULL)
return 0;
if (a->top != 0) {
lb = (unsigned int)n % BN_BITS2;
rb = BN_BITS2 - lb;
rb %= BN_BITS2; /* say no to undefined behaviour */
rmask = (BN_ULONG)0 - rb; /* rmask = 0 - (rb != 0) */
rmask |= rmask >> 8;
f = &(a->d[0]);
t = &(r->d[nw]);
l = f[a->top - 1];
t[a->top] = (l >> rb) & rmask;
for (i = a->top - 1; i > 0; i--) {
m = l << lb;
l = f[i - 1];
t[i] = (m | ((l >> rb) & rmask)) & BN_MASK2;
}
t[0] = (l << lb) & BN_MASK2;
} else {
/* shouldn't happen, but formally required */
r->d[nw] = 0;
}
if (nw != 0)
memset(r->d, 0, sizeof(*t) * nw);
r->neg = a->neg;
r->top = a->top + nw + 1;
r->flags |= BN_FLG_FIXED_TOP;
return 1;
}
int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
{
int i, j, nw, lb, rb;
BN_ULONG *t, *f;
BN_ULONG l, tmp;
bn_check_top(r);
bn_check_top(a);
if (n < 0) {
BNerr(BN_F_BN_RSHIFT, BN_R_INVALID_SHIFT);
return 0;
}
nw = n / BN_BITS2;
rb = n % BN_BITS2;
lb = BN_BITS2 - rb;
if (nw >= a->top || a->top == 0) {
BN_zero(r);
return 1;
}
i = (BN_num_bits(a) - n + (BN_BITS2 - 1)) / BN_BITS2;
if (r != a) {
if (bn_wexpand(r, i) == NULL)
return 0;
r->neg = a->neg;
} else {
if (n == 0)
return 1; /* or the copying loop will go berserk */
}
f = &(a->d[nw]);
t = r->d;
j = a->top - nw;
r->top = i;
if (rb == 0) {
for (i = j; i != 0; i--)
*(t++) = *(f++);
} else {
l = *(f++);
for (i = j - 1; i != 0; i--) {
tmp = (l >> rb) & BN_MASK2;
l = *(f++);
*(t++) = (tmp | (l << lb)) & BN_MASK2;
}
if ((l = (l >> rb) & BN_MASK2))
*(t) = l;
}
if (!r->top)
r->neg = 0; /* don't allow negative zero */
bn_check_top(r);
return 1;
}
/*
* In respect to shift factor the execution time is invariant of
* |n % BN_BITS2|, but not |n / BN_BITS2|. Or in other words pre-condition
* for constant-time-ness for sufficiently[!] zero-padded inputs is
* |n < BN_BITS2| or |n / BN_BITS2| being non-secret.
*/
int bn_rshift_fixed_top(BIGNUM *r, const BIGNUM *a, int n)
{
int i, top, nw;
unsigned int lb, rb;
BN_ULONG *t, *f;
BN_ULONG l, m, mask;
bn_check_top(r);
bn_check_top(a);
assert(n >= 0);
nw = n / BN_BITS2;
if (nw >= a->top) {
/* shouldn't happen, but formally required */
BN_zero(r);
return 1;
}
rb = (unsigned int)n % BN_BITS2;
lb = BN_BITS2 - rb;
lb %= BN_BITS2; /* say no to undefined behaviour */
mask = (BN_ULONG)0 - lb; /* mask = 0 - (lb != 0) */
mask |= mask >> 8;
top = a->top - nw;
if (r != a && bn_wexpand(r, top) == NULL)
return 0;
t = &(r->d[0]);
f = &(a->d[nw]);
l = f[0];
for (i = 0; i < top - 1; i++) {
m = f[i + 1];
t[i] = (l >> rb) | ((m << lb) & mask);
l = m;
}
t[i] = l >> rb;
r->neg = a->neg;
r->top = top;
r->flags |= BN_FLG_FIXED_TOP;
return 1;
}

View file

@ -0,0 +1,239 @@
/*
* Copyright 1995-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
/* r must not be a */
/*
* I've just gone over this and it is now %20 faster on x86 - eay - 27 Jun 96
*/
int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
{
int ret = bn_sqr_fixed_top(r, a, ctx);
bn_correct_top(r);
bn_check_top(r);
return ret;
}
int bn_sqr_fixed_top(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx)
{
int max, al;
int ret = 0;
BIGNUM *tmp, *rr;
bn_check_top(a);
al = a->top;
if (al <= 0) {
r->top = 0;
r->neg = 0;
return 1;
}
BN_CTX_start(ctx);
rr = (a != r) ? r : BN_CTX_get(ctx);
tmp = BN_CTX_get(ctx);
if (rr == NULL || tmp == NULL)
goto err;
max = 2 * al; /* Non-zero (from above) */
if (bn_wexpand(rr, max) == NULL)
goto err;
if (al == 4) {
#ifndef BN_SQR_COMBA
BN_ULONG t[8];
bn_sqr_normal(rr->d, a->d, 4, t);
#else
bn_sqr_comba4(rr->d, a->d);
#endif
} else if (al == 8) {
#ifndef BN_SQR_COMBA
BN_ULONG t[16];
bn_sqr_normal(rr->d, a->d, 8, t);
#else
bn_sqr_comba8(rr->d, a->d);
#endif
} else {
#if defined(BN_RECURSION)
if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) {
BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL * 2];
bn_sqr_normal(rr->d, a->d, al, t);
} else {
int j, k;
j = BN_num_bits_word((BN_ULONG)al);
j = 1 << (j - 1);
k = j + j;
if (al == j) {
if (bn_wexpand(tmp, k * 2) == NULL)
goto err;
bn_sqr_recursive(rr->d, a->d, al, tmp->d);
} else {
if (bn_wexpand(tmp, max) == NULL)
goto err;
bn_sqr_normal(rr->d, a->d, al, tmp->d);
}
}
#else
if (bn_wexpand(tmp, max) == NULL)
goto err;
bn_sqr_normal(rr->d, a->d, al, tmp->d);
#endif
}
rr->neg = 0;
rr->top = max;
rr->flags |= BN_FLG_FIXED_TOP;
if (r != rr && BN_copy(r, rr) == NULL)
goto err;
ret = 1;
err:
bn_check_top(rr);
bn_check_top(tmp);
BN_CTX_end(ctx);
return ret;
}
/* tmp must have 2*n words */
void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp)
{
int i, j, max;
const BN_ULONG *ap;
BN_ULONG *rp;
max = n * 2;
ap = a;
rp = r;
rp[0] = rp[max - 1] = 0;
rp++;
j = n;
if (--j > 0) {
ap++;
rp[j] = bn_mul_words(rp, ap, j, ap[-1]);
rp += 2;
}
for (i = n - 2; i > 0; i--) {
j--;
ap++;
rp[j] = bn_mul_add_words(rp, ap, j, ap[-1]);
rp += 2;
}
bn_add_words(r, r, r, max);
/* There will not be a carry */
bn_sqr_words(tmp, a, n);
bn_add_words(r, r, tmp, max);
}
#ifdef BN_RECURSION
/*-
* r is 2*n words in size,
* a and b are both n words in size. (There's not actually a 'b' here ...)
* n must be a power of 2.
* We multiply and return the result.
* t must be 2*n words in size
* We calculate
* a[0]*b[0]
* a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
* a[1]*b[1]
*/
void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t)
{
int n = n2 / 2;
int zero, c1;
BN_ULONG ln, lo, *p;
if (n2 == 4) {
# ifndef BN_SQR_COMBA
bn_sqr_normal(r, a, 4, t);
# else
bn_sqr_comba4(r, a);
# endif
return;
} else if (n2 == 8) {
# ifndef BN_SQR_COMBA
bn_sqr_normal(r, a, 8, t);
# else
bn_sqr_comba8(r, a);
# endif
return;
}
if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) {
bn_sqr_normal(r, a, n2, t);
return;
}
/* r=(a[0]-a[1])*(a[1]-a[0]) */
c1 = bn_cmp_words(a, &(a[n]), n);
zero = 0;
if (c1 > 0)
bn_sub_words(t, a, &(a[n]), n);
else if (c1 < 0)
bn_sub_words(t, &(a[n]), a, n);
else
zero = 1;
/* The result will always be negative unless it is zero */
p = &(t[n2 * 2]);
if (!zero)
bn_sqr_recursive(&(t[n2]), t, n, p);
else
memset(&t[n2], 0, sizeof(*t) * n2);
bn_sqr_recursive(r, a, n, p);
bn_sqr_recursive(&(r[n2]), &(a[n]), n, p);
/*-
* t[32] holds (a[0]-a[1])*(a[1]-a[0]), it is negative or zero
* r[10] holds (a[0]*b[0])
* r[32] holds (b[1]*b[1])
*/
c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
/* t[32] is negative */
c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
/*-
* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1])
* r[10] holds (a[0]*a[0])
* r[32] holds (a[1]*a[1])
* c1 holds the carry bits
*/
c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
if (c1) {
p = &(r[n + n2]);
lo = *p;
ln = (lo + c1) & BN_MASK2;
*p = ln;
/*
* The overflow will stop before we over write words we should not
* overwrite
*/
if (ln < (BN_ULONG)c1) {
do {
p++;
lo = *p;
ln = (lo + 1) & BN_MASK2;
*p = ln;
} while (ln == 0);
}
}
}
#endif

View file

@ -0,0 +1,358 @@
/*
* Copyright 2000-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
/*
* Returns 'ret' such that ret^2 == a (mod p), using the Tonelli/Shanks
* algorithm (cf. Henri Cohen, "A Course in Algebraic Computational Number
* Theory", algorithm 1.5.1). 'p' must be prime!
*/
{
BIGNUM *ret = in;
int err = 1;
int r;
BIGNUM *A, *b, *q, *t, *x, *y;
int e, i, j;
if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) {
if (BN_abs_is_word(p, 2)) {
if (ret == NULL)
ret = BN_new();
if (ret == NULL)
goto end;
if (!BN_set_word(ret, BN_is_bit_set(a, 0))) {
if (ret != in)
BN_free(ret);
return NULL;
}
bn_check_top(ret);
return ret;
}
BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
return NULL;
}
if (BN_is_zero(a) || BN_is_one(a)) {
if (ret == NULL)
ret = BN_new();
if (ret == NULL)
goto end;
if (!BN_set_word(ret, BN_is_one(a))) {
if (ret != in)
BN_free(ret);
return NULL;
}
bn_check_top(ret);
return ret;
}
BN_CTX_start(ctx);
A = BN_CTX_get(ctx);
b = BN_CTX_get(ctx);
q = BN_CTX_get(ctx);
t = BN_CTX_get(ctx);
x = BN_CTX_get(ctx);
y = BN_CTX_get(ctx);
if (y == NULL)
goto end;
if (ret == NULL)
ret = BN_new();
if (ret == NULL)
goto end;
/* A = a mod p */
if (!BN_nnmod(A, a, p, ctx))
goto end;
/* now write |p| - 1 as 2^e*q where q is odd */
e = 1;
while (!BN_is_bit_set(p, e))
e++;
/* we'll set q later (if needed) */
if (e == 1) {
/*-
* The easy case: (|p|-1)/2 is odd, so 2 has an inverse
* modulo (|p|-1)/2, and square roots can be computed
* directly by modular exponentiation.
* We have
* 2 * (|p|+1)/4 == 1 (mod (|p|-1)/2),
* so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1.
*/
if (!BN_rshift(q, p, 2))
goto end;
q->neg = 0;
if (!BN_add_word(q, 1))
goto end;
if (!BN_mod_exp(ret, A, q, p, ctx))
goto end;
err = 0;
goto vrfy;
}
if (e == 2) {
/*-
* |p| == 5 (mod 8)
*
* In this case 2 is always a non-square since
* Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime.
* So if a really is a square, then 2*a is a non-square.
* Thus for
* b := (2*a)^((|p|-5)/8),
* i := (2*a)*b^2
* we have
* i^2 = (2*a)^((1 + (|p|-5)/4)*2)
* = (2*a)^((p-1)/2)
* = -1;
* so if we set
* x := a*b*(i-1),
* then
* x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
* = a^2 * b^2 * (-2*i)
* = a*(-i)*(2*a*b^2)
* = a*(-i)*i
* = a.
*
* (This is due to A.O.L. Atkin,
* <URL: http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
* November 1992.)
*/
/* t := 2*a */
if (!BN_mod_lshift1_quick(t, A, p))
goto end;
/* b := (2*a)^((|p|-5)/8) */
if (!BN_rshift(q, p, 3))
goto end;
q->neg = 0;
if (!BN_mod_exp(b, t, q, p, ctx))
goto end;
/* y := b^2 */
if (!BN_mod_sqr(y, b, p, ctx))
goto end;
/* t := (2*a)*b^2 - 1 */
if (!BN_mod_mul(t, t, y, p, ctx))
goto end;
if (!BN_sub_word(t, 1))
goto end;
/* x = a*b*t */
if (!BN_mod_mul(x, A, b, p, ctx))
goto end;
if (!BN_mod_mul(x, x, t, p, ctx))
goto end;
if (!BN_copy(ret, x))
goto end;
err = 0;
goto vrfy;
}
/*
* e > 2, so we really have to use the Tonelli/Shanks algorithm. First,
* find some y that is not a square.
*/
if (!BN_copy(q, p))
goto end; /* use 'q' as temp */
q->neg = 0;
i = 2;
do {
/*
* For efficiency, try small numbers first; if this fails, try random
* numbers.
*/
if (i < 22) {
if (!BN_set_word(y, i))
goto end;
} else {
if (!BN_priv_rand(y, BN_num_bits(p), 0, 0))
goto end;
if (BN_ucmp(y, p) >= 0) {
if (!(p->neg ? BN_add : BN_sub) (y, y, p))
goto end;
}
/* now 0 <= y < |p| */
if (BN_is_zero(y))
if (!BN_set_word(y, i))
goto end;
}
r = BN_kronecker(y, q, ctx); /* here 'q' is |p| */
if (r < -1)
goto end;
if (r == 0) {
/* m divides p */
BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
goto end;
}
}
while (r == 1 && ++i < 82);
if (r != -1) {
/*
* Many rounds and still no non-square -- this is more likely a bug
* than just bad luck. Even if p is not prime, we should have found
* some y such that r == -1.
*/
BNerr(BN_F_BN_MOD_SQRT, BN_R_TOO_MANY_ITERATIONS);
goto end;
}
/* Here's our actual 'q': */
if (!BN_rshift(q, q, e))
goto end;
/*
* Now that we have some non-square, we can find an element of order 2^e
* by computing its q'th power.
*/
if (!BN_mod_exp(y, y, q, p, ctx))
goto end;
if (BN_is_one(y)) {
BNerr(BN_F_BN_MOD_SQRT, BN_R_P_IS_NOT_PRIME);
goto end;
}
/*-
* Now we know that (if p is indeed prime) there is an integer
* k, 0 <= k < 2^e, such that
*
* a^q * y^k == 1 (mod p).
*
* As a^q is a square and y is not, k must be even.
* q+1 is even, too, so there is an element
*
* X := a^((q+1)/2) * y^(k/2),
*
* and it satisfies
*
* X^2 = a^q * a * y^k
* = a,
*
* so it is the square root that we are looking for.
*/
/* t := (q-1)/2 (note that q is odd) */
if (!BN_rshift1(t, q))
goto end;
/* x := a^((q-1)/2) */
if (BN_is_zero(t)) { /* special case: p = 2^e + 1 */
if (!BN_nnmod(t, A, p, ctx))
goto end;
if (BN_is_zero(t)) {
/* special case: a == 0 (mod p) */
BN_zero(ret);
err = 0;
goto end;
} else if (!BN_one(x))
goto end;
} else {
if (!BN_mod_exp(x, A, t, p, ctx))
goto end;
if (BN_is_zero(x)) {
/* special case: a == 0 (mod p) */
BN_zero(ret);
err = 0;
goto end;
}
}
/* b := a*x^2 (= a^q) */
if (!BN_mod_sqr(b, x, p, ctx))
goto end;
if (!BN_mod_mul(b, b, A, p, ctx))
goto end;
/* x := a*x (= a^((q+1)/2)) */
if (!BN_mod_mul(x, x, A, p, ctx))
goto end;
while (1) {
/*-
* Now b is a^q * y^k for some even k (0 <= k < 2^E
* where E refers to the original value of e, which we
* don't keep in a variable), and x is a^((q+1)/2) * y^(k/2).
*
* We have a*b = x^2,
* y^2^(e-1) = -1,
* b^2^(e-1) = 1.
*/
if (BN_is_one(b)) {
if (!BN_copy(ret, x))
goto end;
err = 0;
goto vrfy;
}
/* find smallest i such that b^(2^i) = 1 */
i = 1;
if (!BN_mod_sqr(t, b, p, ctx))
goto end;
while (!BN_is_one(t)) {
i++;
if (i == e) {
BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
goto end;
}
if (!BN_mod_mul(t, t, t, p, ctx))
goto end;
}
/* t := y^2^(e - i - 1) */
if (!BN_copy(t, y))
goto end;
for (j = e - i - 1; j > 0; j--) {
if (!BN_mod_sqr(t, t, p, ctx))
goto end;
}
if (!BN_mod_mul(y, t, t, p, ctx))
goto end;
if (!BN_mod_mul(x, x, t, p, ctx))
goto end;
if (!BN_mod_mul(b, b, y, p, ctx))
goto end;
e = i;
}
vrfy:
if (!err) {
/*
* verify the result -- the input might have been not a square (test
* added in 0.9.8)
*/
if (!BN_mod_sqr(x, ret, p, ctx))
err = 1;
if (!err && 0 != BN_cmp(x, A)) {
BNerr(BN_F_BN_MOD_SQRT, BN_R_NOT_A_SQUARE);
err = 1;
}
}
end:
if (err) {
if (ret != in)
BN_clear_free(ret);
ret = NULL;
}
BN_CTX_end(ctx);
bn_check_top(ret);
return ret;
}

View file

@ -0,0 +1,545 @@
/*
* Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "bn_lcl.h"
#include "internal/nelem.h"
#ifndef OPENSSL_NO_SRP
#include <openssl/srp.h>
#include "internal/bn_srp.h"
# if (BN_BYTES == 8)
# if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
# define bn_pack4(a1,a2,a3,a4) ((a1##UI64<<48)|(a2##UI64<<32)|(a3##UI64<<16)|a4##UI64)
# elif defined(__arch64__)
# define bn_pack4(a1,a2,a3,a4) ((a1##UL<<48)|(a2##UL<<32)|(a3##UL<<16)|a4##UL)
# else
# define bn_pack4(a1,a2,a3,a4) ((a1##ULL<<48)|(a2##ULL<<32)|(a3##ULL<<16)|a4##ULL)
# endif
# elif (BN_BYTES == 4)
# define bn_pack4(a1,a2,a3,a4) ((a3##UL<<16)|a4##UL), ((a1##UL<<16)|a2##UL)
# else
# error "unsupported BN_BYTES"
# endif
static const BN_ULONG bn_group_1024_value[] = {
bn_pack4(0x9FC6, 0x1D2F, 0xC0EB, 0x06E3),
bn_pack4(0xFD51, 0x38FE, 0x8376, 0x435B),
bn_pack4(0x2FD4, 0xCBF4, 0x976E, 0xAA9A),
bn_pack4(0x68ED, 0xBC3C, 0x0572, 0x6CC0),
bn_pack4(0xC529, 0xF566, 0x660E, 0x57EC),
bn_pack4(0x8255, 0x9B29, 0x7BCF, 0x1885),
bn_pack4(0xCE8E, 0xF4AD, 0x69B1, 0x5D49),
bn_pack4(0x5DC7, 0xD7B4, 0x6154, 0xD6B6),
bn_pack4(0x8E49, 0x5C1D, 0x6089, 0xDAD1),
bn_pack4(0xE0D5, 0xD8E2, 0x50B9, 0x8BE4),
bn_pack4(0x383B, 0x4813, 0xD692, 0xC6E0),
bn_pack4(0xD674, 0xDF74, 0x96EA, 0x81D3),
bn_pack4(0x9EA2, 0x314C, 0x9C25, 0x6576),
bn_pack4(0x6072, 0x6187, 0x75FF, 0x3C0B),
bn_pack4(0x9C33, 0xF80A, 0xFA8F, 0xC5E8),
bn_pack4(0xEEAF, 0x0AB9, 0xADB3, 0x8DD6)
};
const BIGNUM bn_group_1024 = {
(BN_ULONG *)bn_group_1024_value,
OSSL_NELEM(bn_group_1024_value),
OSSL_NELEM(bn_group_1024_value),
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_group_1536_value[] = {
bn_pack4(0xCF76, 0xE3FE, 0xD135, 0xF9BB),
bn_pack4(0x1518, 0x0F93, 0x499A, 0x234D),
bn_pack4(0x8CE7, 0xA28C, 0x2442, 0xC6F3),
bn_pack4(0x5A02, 0x1FFF, 0x5E91, 0x479E),
bn_pack4(0x7F8A, 0x2FE9, 0xB8B5, 0x292E),
bn_pack4(0x837C, 0x264A, 0xE3A9, 0xBEB8),
bn_pack4(0xE442, 0x734A, 0xF7CC, 0xB7AE),
bn_pack4(0x6577, 0x2E43, 0x7D6C, 0x7F8C),
bn_pack4(0xDB2F, 0xD53D, 0x24B7, 0xC486),
bn_pack4(0x6EDF, 0x0195, 0x3934, 0x9627),
bn_pack4(0x158B, 0xFD3E, 0x2B9C, 0x8CF5),
bn_pack4(0x764E, 0x3F4B, 0x53DD, 0x9DA1),
bn_pack4(0x4754, 0x8381, 0xDBC5, 0xB1FC),
bn_pack4(0x9B60, 0x9E0B, 0xE3BA, 0xB63D),
bn_pack4(0x8134, 0xB1C8, 0xB979, 0x8914),
bn_pack4(0xDF02, 0x8A7C, 0xEC67, 0xF0D0),
bn_pack4(0x80B6, 0x55BB, 0x9A22, 0xE8DC),
bn_pack4(0x1558, 0x903B, 0xA0D0, 0xF843),
bn_pack4(0x51C6, 0xA94B, 0xE460, 0x7A29),
bn_pack4(0x5F4F, 0x5F55, 0x6E27, 0xCBDE),
bn_pack4(0xBEEE, 0xA961, 0x4B19, 0xCC4D),
bn_pack4(0xDBA5, 0x1DF4, 0x99AC, 0x4C80),
bn_pack4(0xB1F1, 0x2A86, 0x17A4, 0x7BBB),
bn_pack4(0x9DEF, 0x3CAF, 0xB939, 0x277A)
};
const BIGNUM bn_group_1536 = {
(BN_ULONG *)bn_group_1536_value,
OSSL_NELEM(bn_group_1536_value),
OSSL_NELEM(bn_group_1536_value),
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_group_2048_value[] = {
bn_pack4(0x0FA7, 0x111F, 0x9E4A, 0xFF73),
bn_pack4(0x9B65, 0xE372, 0xFCD6, 0x8EF2),
bn_pack4(0x35DE, 0x236D, 0x525F, 0x5475),
bn_pack4(0x94B5, 0xC803, 0xD89F, 0x7AE4),
bn_pack4(0x71AE, 0x35F8, 0xE9DB, 0xFBB6),
bn_pack4(0x2A56, 0x98F3, 0xA8D0, 0xC382),
bn_pack4(0x9CCC, 0x041C, 0x7BC3, 0x08D8),
bn_pack4(0xAF87, 0x4E73, 0x03CE, 0x5329),
bn_pack4(0x6160, 0x2790, 0x04E5, 0x7AE6),
bn_pack4(0x032C, 0xFBDB, 0xF52F, 0xB378),
bn_pack4(0x5EA7, 0x7A27, 0x75D2, 0xECFA),
bn_pack4(0x5445, 0x23B5, 0x24B0, 0xD57D),
bn_pack4(0x5B9D, 0x32E6, 0x88F8, 0x7748),
bn_pack4(0xF1D2, 0xB907, 0x8717, 0x461A),
bn_pack4(0x76BD, 0x207A, 0x436C, 0x6481),
bn_pack4(0xCA97, 0xB43A, 0x23FB, 0x8016),
bn_pack4(0x1D28, 0x1E44, 0x6B14, 0x773B),
bn_pack4(0x7359, 0xD041, 0xD5C3, 0x3EA7),
bn_pack4(0xA80D, 0x740A, 0xDBF4, 0xFF74),
bn_pack4(0x55F9, 0x7993, 0xEC97, 0x5EEA),
bn_pack4(0x2918, 0xA996, 0x2F0B, 0x93B8),
bn_pack4(0x661A, 0x05FB, 0xD5FA, 0xAAE8),
bn_pack4(0xCF60, 0x9517, 0x9A16, 0x3AB3),
bn_pack4(0xE808, 0x3969, 0xEDB7, 0x67B0),
bn_pack4(0xCD7F, 0x48A9, 0xDA04, 0xFD50),
bn_pack4(0xD523, 0x12AB, 0x4B03, 0x310D),
bn_pack4(0x8193, 0xE075, 0x7767, 0xA13D),
bn_pack4(0xA373, 0x29CB, 0xB4A0, 0x99ED),
bn_pack4(0xFC31, 0x9294, 0x3DB5, 0x6050),
bn_pack4(0xAF72, 0xB665, 0x1987, 0xEE07),
bn_pack4(0xF166, 0xDE5E, 0x1389, 0x582F),
bn_pack4(0xAC6B, 0xDB41, 0x324A, 0x9A9B)
};
const BIGNUM bn_group_2048 = {
(BN_ULONG *)bn_group_2048_value,
OSSL_NELEM(bn_group_2048_value),
OSSL_NELEM(bn_group_2048_value),
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_group_3072_value[] = {
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF),
bn_pack4(0x4B82, 0xD120, 0xA93A, 0xD2CA),
bn_pack4(0x43DB, 0x5BFC, 0xE0FD, 0x108E),
bn_pack4(0x08E2, 0x4FA0, 0x74E5, 0xAB31),
bn_pack4(0x7709, 0x88C0, 0xBAD9, 0x46E2),
bn_pack4(0xBBE1, 0x1757, 0x7A61, 0x5D6C),
bn_pack4(0x521F, 0x2B18, 0x177B, 0x200C),
bn_pack4(0xD876, 0x0273, 0x3EC8, 0x6A64),
bn_pack4(0xF12F, 0xFA06, 0xD98A, 0x0864),
bn_pack4(0xCEE3, 0xD226, 0x1AD2, 0xEE6B),
bn_pack4(0x1E8C, 0x94E0, 0x4A25, 0x619D),
bn_pack4(0xABF5, 0xAE8C, 0xDB09, 0x33D7),
bn_pack4(0xB397, 0x0F85, 0xA6E1, 0xE4C7),
bn_pack4(0x8AEA, 0x7157, 0x5D06, 0x0C7D),
bn_pack4(0xECFB, 0x8504, 0x58DB, 0xEF0A),
bn_pack4(0xA855, 0x21AB, 0xDF1C, 0xBA64),
bn_pack4(0xAD33, 0x170D, 0x0450, 0x7A33),
bn_pack4(0x1572, 0x8E5A, 0x8AAA, 0xC42D),
bn_pack4(0x15D2, 0x2618, 0x98FA, 0x0510),
bn_pack4(0x3995, 0x497C, 0xEA95, 0x6AE5),
bn_pack4(0xDE2B, 0xCBF6, 0x9558, 0x1718),
bn_pack4(0xB5C5, 0x5DF0, 0x6F4C, 0x52C9),
bn_pack4(0x9B27, 0x83A2, 0xEC07, 0xA28F),
bn_pack4(0xE39E, 0x772C, 0x180E, 0x8603),
bn_pack4(0x3290, 0x5E46, 0x2E36, 0xCE3B),
bn_pack4(0xF174, 0x6C08, 0xCA18, 0x217C),
bn_pack4(0x670C, 0x354E, 0x4ABC, 0x9804),
bn_pack4(0x9ED5, 0x2907, 0x7096, 0x966D),
bn_pack4(0x1C62, 0xF356, 0x2085, 0x52BB),
bn_pack4(0x8365, 0x5D23, 0xDCA3, 0xAD96),
bn_pack4(0x6916, 0x3FA8, 0xFD24, 0xCF5F),
bn_pack4(0x98DA, 0x4836, 0x1C55, 0xD39A),
bn_pack4(0xC200, 0x7CB8, 0xA163, 0xBF05),
bn_pack4(0x4928, 0x6651, 0xECE4, 0x5B3D),
bn_pack4(0xAE9F, 0x2411, 0x7C4B, 0x1FE6),
bn_pack4(0xEE38, 0x6BFB, 0x5A89, 0x9FA5),
bn_pack4(0x0BFF, 0x5CB6, 0xF406, 0xB7ED),
bn_pack4(0xF44C, 0x42E9, 0xA637, 0xED6B),
bn_pack4(0xE485, 0xB576, 0x625E, 0x7EC6),
bn_pack4(0x4FE1, 0x356D, 0x6D51, 0xC245),
bn_pack4(0x302B, 0x0A6D, 0xF25F, 0x1437),
bn_pack4(0xEF95, 0x19B3, 0xCD3A, 0x431B),
bn_pack4(0x514A, 0x0879, 0x8E34, 0x04DD),
bn_pack4(0x020B, 0xBEA6, 0x3B13, 0x9B22),
bn_pack4(0x2902, 0x4E08, 0x8A67, 0xCC74),
bn_pack4(0xC4C6, 0x628B, 0x80DC, 0x1CD1),
bn_pack4(0xC90F, 0xDAA2, 0x2168, 0xC234),
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF)
};
const BIGNUM bn_group_3072 = {
(BN_ULONG *)bn_group_3072_value,
OSSL_NELEM(bn_group_3072_value),
OSSL_NELEM(bn_group_3072_value),
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_group_4096_value[] = {
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF),
bn_pack4(0x4DF4, 0x35C9, 0x3406, 0x3199),
bn_pack4(0x86FF, 0xB7DC, 0x90A6, 0xC08F),
bn_pack4(0x93B4, 0xEA98, 0x8D8F, 0xDDC1),
bn_pack4(0xD006, 0x9127, 0xD5B0, 0x5AA9),
bn_pack4(0xB81B, 0xDD76, 0x2170, 0x481C),
bn_pack4(0x1F61, 0x2970, 0xCEE2, 0xD7AF),
bn_pack4(0x233B, 0xA186, 0x515B, 0xE7ED),
bn_pack4(0x99B2, 0x964F, 0xA090, 0xC3A2),
bn_pack4(0x287C, 0x5947, 0x4E6B, 0xC05D),
bn_pack4(0x2E8E, 0xFC14, 0x1FBE, 0xCAA6),
bn_pack4(0xDBBB, 0xC2DB, 0x04DE, 0x8EF9),
bn_pack4(0x2583, 0xE9CA, 0x2AD4, 0x4CE8),
bn_pack4(0x1A94, 0x6834, 0xB615, 0x0BDA),
bn_pack4(0x99C3, 0x2718, 0x6AF4, 0xE23C),
bn_pack4(0x8871, 0x9A10, 0xBDBA, 0x5B26),
bn_pack4(0x1A72, 0x3C12, 0xA787, 0xE6D7),
bn_pack4(0x4B82, 0xD120, 0xA921, 0x0801),
bn_pack4(0x43DB, 0x5BFC, 0xE0FD, 0x108E),
bn_pack4(0x08E2, 0x4FA0, 0x74E5, 0xAB31),
bn_pack4(0x7709, 0x88C0, 0xBAD9, 0x46E2),
bn_pack4(0xBBE1, 0x1757, 0x7A61, 0x5D6C),
bn_pack4(0x521F, 0x2B18, 0x177B, 0x200C),
bn_pack4(0xD876, 0x0273, 0x3EC8, 0x6A64),
bn_pack4(0xF12F, 0xFA06, 0xD98A, 0x0864),
bn_pack4(0xCEE3, 0xD226, 0x1AD2, 0xEE6B),
bn_pack4(0x1E8C, 0x94E0, 0x4A25, 0x619D),
bn_pack4(0xABF5, 0xAE8C, 0xDB09, 0x33D7),
bn_pack4(0xB397, 0x0F85, 0xA6E1, 0xE4C7),
bn_pack4(0x8AEA, 0x7157, 0x5D06, 0x0C7D),
bn_pack4(0xECFB, 0x8504, 0x58DB, 0xEF0A),
bn_pack4(0xA855, 0x21AB, 0xDF1C, 0xBA64),
bn_pack4(0xAD33, 0x170D, 0x0450, 0x7A33),
bn_pack4(0x1572, 0x8E5A, 0x8AAA, 0xC42D),
bn_pack4(0x15D2, 0x2618, 0x98FA, 0x0510),
bn_pack4(0x3995, 0x497C, 0xEA95, 0x6AE5),
bn_pack4(0xDE2B, 0xCBF6, 0x9558, 0x1718),
bn_pack4(0xB5C5, 0x5DF0, 0x6F4C, 0x52C9),
bn_pack4(0x9B27, 0x83A2, 0xEC07, 0xA28F),
bn_pack4(0xE39E, 0x772C, 0x180E, 0x8603),
bn_pack4(0x3290, 0x5E46, 0x2E36, 0xCE3B),
bn_pack4(0xF174, 0x6C08, 0xCA18, 0x217C),
bn_pack4(0x670C, 0x354E, 0x4ABC, 0x9804),
bn_pack4(0x9ED5, 0x2907, 0x7096, 0x966D),
bn_pack4(0x1C62, 0xF356, 0x2085, 0x52BB),
bn_pack4(0x8365, 0x5D23, 0xDCA3, 0xAD96),
bn_pack4(0x6916, 0x3FA8, 0xFD24, 0xCF5F),
bn_pack4(0x98DA, 0x4836, 0x1C55, 0xD39A),
bn_pack4(0xC200, 0x7CB8, 0xA163, 0xBF05),
bn_pack4(0x4928, 0x6651, 0xECE4, 0x5B3D),
bn_pack4(0xAE9F, 0x2411, 0x7C4B, 0x1FE6),
bn_pack4(0xEE38, 0x6BFB, 0x5A89, 0x9FA5),
bn_pack4(0x0BFF, 0x5CB6, 0xF406, 0xB7ED),
bn_pack4(0xF44C, 0x42E9, 0xA637, 0xED6B),
bn_pack4(0xE485, 0xB576, 0x625E, 0x7EC6),
bn_pack4(0x4FE1, 0x356D, 0x6D51, 0xC245),
bn_pack4(0x302B, 0x0A6D, 0xF25F, 0x1437),
bn_pack4(0xEF95, 0x19B3, 0xCD3A, 0x431B),
bn_pack4(0x514A, 0x0879, 0x8E34, 0x04DD),
bn_pack4(0x020B, 0xBEA6, 0x3B13, 0x9B22),
bn_pack4(0x2902, 0x4E08, 0x8A67, 0xCC74),
bn_pack4(0xC4C6, 0x628B, 0x80DC, 0x1CD1),
bn_pack4(0xC90F, 0xDAA2, 0x2168, 0xC234),
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF)
};
const BIGNUM bn_group_4096 = {
(BN_ULONG *)bn_group_4096_value,
OSSL_NELEM(bn_group_4096_value),
OSSL_NELEM(bn_group_4096_value),
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_group_6144_value[] = {
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF),
bn_pack4(0xE694, 0xF91E, 0x6DCC, 0x4024),
bn_pack4(0x12BF, 0x2D5B, 0x0B74, 0x74D6),
bn_pack4(0x043E, 0x8F66, 0x3F48, 0x60EE),
bn_pack4(0x387F, 0xE8D7, 0x6E3C, 0x0468),
bn_pack4(0xDA56, 0xC9EC, 0x2EF2, 0x9632),
bn_pack4(0xEB19, 0xCCB1, 0xA313, 0xD55C),
bn_pack4(0xF550, 0xAA3D, 0x8A1F, 0xBFF0),
bn_pack4(0x06A1, 0xD58B, 0xB7C5, 0xDA76),
bn_pack4(0xA797, 0x15EE, 0xF29B, 0xE328),
bn_pack4(0x14CC, 0x5ED2, 0x0F80, 0x37E0),
bn_pack4(0xCC8F, 0x6D7E, 0xBF48, 0xE1D8),
bn_pack4(0x4BD4, 0x07B2, 0x2B41, 0x54AA),
bn_pack4(0x0F1D, 0x45B7, 0xFF58, 0x5AC5),
bn_pack4(0x23A9, 0x7A7E, 0x36CC, 0x88BE),
bn_pack4(0x59E7, 0xC97F, 0xBEC7, 0xE8F3),
bn_pack4(0xB5A8, 0x4031, 0x900B, 0x1C9E),
bn_pack4(0xD55E, 0x702F, 0x4698, 0x0C82),
bn_pack4(0xF482, 0xD7CE, 0x6E74, 0xFEF6),
bn_pack4(0xF032, 0xEA15, 0xD172, 0x1D03),
bn_pack4(0x5983, 0xCA01, 0xC64B, 0x92EC),
bn_pack4(0x6FB8, 0xF401, 0x378C, 0xD2BF),
bn_pack4(0x3320, 0x5151, 0x2BD7, 0xAF42),
bn_pack4(0xDB7F, 0x1447, 0xE6CC, 0x254B),
bn_pack4(0x44CE, 0x6CBA, 0xCED4, 0xBB1B),
bn_pack4(0xDA3E, 0xDBEB, 0xCF9B, 0x14ED),
bn_pack4(0x1797, 0x27B0, 0x865A, 0x8918),
bn_pack4(0xB06A, 0x53ED, 0x9027, 0xD831),
bn_pack4(0xE5DB, 0x382F, 0x4130, 0x01AE),
bn_pack4(0xF8FF, 0x9406, 0xAD9E, 0x530E),
bn_pack4(0xC975, 0x1E76, 0x3DBA, 0x37BD),
bn_pack4(0xC1D4, 0xDCB2, 0x6026, 0x46DE),
bn_pack4(0x36C3, 0xFAB4, 0xD27C, 0x7026),
bn_pack4(0x4DF4, 0x35C9, 0x3402, 0x8492),
bn_pack4(0x86FF, 0xB7DC, 0x90A6, 0xC08F),
bn_pack4(0x93B4, 0xEA98, 0x8D8F, 0xDDC1),
bn_pack4(0xD006, 0x9127, 0xD5B0, 0x5AA9),
bn_pack4(0xB81B, 0xDD76, 0x2170, 0x481C),
bn_pack4(0x1F61, 0x2970, 0xCEE2, 0xD7AF),
bn_pack4(0x233B, 0xA186, 0x515B, 0xE7ED),
bn_pack4(0x99B2, 0x964F, 0xA090, 0xC3A2),
bn_pack4(0x287C, 0x5947, 0x4E6B, 0xC05D),
bn_pack4(0x2E8E, 0xFC14, 0x1FBE, 0xCAA6),
bn_pack4(0xDBBB, 0xC2DB, 0x04DE, 0x8EF9),
bn_pack4(0x2583, 0xE9CA, 0x2AD4, 0x4CE8),
bn_pack4(0x1A94, 0x6834, 0xB615, 0x0BDA),
bn_pack4(0x99C3, 0x2718, 0x6AF4, 0xE23C),
bn_pack4(0x8871, 0x9A10, 0xBDBA, 0x5B26),
bn_pack4(0x1A72, 0x3C12, 0xA787, 0xE6D7),
bn_pack4(0x4B82, 0xD120, 0xA921, 0x0801),
bn_pack4(0x43DB, 0x5BFC, 0xE0FD, 0x108E),
bn_pack4(0x08E2, 0x4FA0, 0x74E5, 0xAB31),
bn_pack4(0x7709, 0x88C0, 0xBAD9, 0x46E2),
bn_pack4(0xBBE1, 0x1757, 0x7A61, 0x5D6C),
bn_pack4(0x521F, 0x2B18, 0x177B, 0x200C),
bn_pack4(0xD876, 0x0273, 0x3EC8, 0x6A64),
bn_pack4(0xF12F, 0xFA06, 0xD98A, 0x0864),
bn_pack4(0xCEE3, 0xD226, 0x1AD2, 0xEE6B),
bn_pack4(0x1E8C, 0x94E0, 0x4A25, 0x619D),
bn_pack4(0xABF5, 0xAE8C, 0xDB09, 0x33D7),
bn_pack4(0xB397, 0x0F85, 0xA6E1, 0xE4C7),
bn_pack4(0x8AEA, 0x7157, 0x5D06, 0x0C7D),
bn_pack4(0xECFB, 0x8504, 0x58DB, 0xEF0A),
bn_pack4(0xA855, 0x21AB, 0xDF1C, 0xBA64),
bn_pack4(0xAD33, 0x170D, 0x0450, 0x7A33),
bn_pack4(0x1572, 0x8E5A, 0x8AAA, 0xC42D),
bn_pack4(0x15D2, 0x2618, 0x98FA, 0x0510),
bn_pack4(0x3995, 0x497C, 0xEA95, 0x6AE5),
bn_pack4(0xDE2B, 0xCBF6, 0x9558, 0x1718),
bn_pack4(0xB5C5, 0x5DF0, 0x6F4C, 0x52C9),
bn_pack4(0x9B27, 0x83A2, 0xEC07, 0xA28F),
bn_pack4(0xE39E, 0x772C, 0x180E, 0x8603),
bn_pack4(0x3290, 0x5E46, 0x2E36, 0xCE3B),
bn_pack4(0xF174, 0x6C08, 0xCA18, 0x217C),
bn_pack4(0x670C, 0x354E, 0x4ABC, 0x9804),
bn_pack4(0x9ED5, 0x2907, 0x7096, 0x966D),
bn_pack4(0x1C62, 0xF356, 0x2085, 0x52BB),
bn_pack4(0x8365, 0x5D23, 0xDCA3, 0xAD96),
bn_pack4(0x6916, 0x3FA8, 0xFD24, 0xCF5F),
bn_pack4(0x98DA, 0x4836, 0x1C55, 0xD39A),
bn_pack4(0xC200, 0x7CB8, 0xA163, 0xBF05),
bn_pack4(0x4928, 0x6651, 0xECE4, 0x5B3D),
bn_pack4(0xAE9F, 0x2411, 0x7C4B, 0x1FE6),
bn_pack4(0xEE38, 0x6BFB, 0x5A89, 0x9FA5),
bn_pack4(0x0BFF, 0x5CB6, 0xF406, 0xB7ED),
bn_pack4(0xF44C, 0x42E9, 0xA637, 0xED6B),
bn_pack4(0xE485, 0xB576, 0x625E, 0x7EC6),
bn_pack4(0x4FE1, 0x356D, 0x6D51, 0xC245),
bn_pack4(0x302B, 0x0A6D, 0xF25F, 0x1437),
bn_pack4(0xEF95, 0x19B3, 0xCD3A, 0x431B),
bn_pack4(0x514A, 0x0879, 0x8E34, 0x04DD),
bn_pack4(0x020B, 0xBEA6, 0x3B13, 0x9B22),
bn_pack4(0x2902, 0x4E08, 0x8A67, 0xCC74),
bn_pack4(0xC4C6, 0x628B, 0x80DC, 0x1CD1),
bn_pack4(0xC90F, 0xDAA2, 0x2168, 0xC234),
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF)
};
const BIGNUM bn_group_6144 = {
(BN_ULONG *)bn_group_6144_value,
OSSL_NELEM(bn_group_6144_value),
OSSL_NELEM(bn_group_6144_value),
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_group_8192_value[] = {
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF),
bn_pack4(0x60C9, 0x80DD, 0x98ED, 0xD3DF),
bn_pack4(0xC81F, 0x56E8, 0x80B9, 0x6E71),
bn_pack4(0x9E30, 0x50E2, 0x7656, 0x94DF),
bn_pack4(0x9558, 0xE447, 0x5677, 0xE9AA),
bn_pack4(0xC919, 0x0DA6, 0xFC02, 0x6E47),
bn_pack4(0x889A, 0x002E, 0xD5EE, 0x382B),
bn_pack4(0x4009, 0x438B, 0x481C, 0x6CD7),
bn_pack4(0x3590, 0x46F4, 0xEB87, 0x9F92),
bn_pack4(0xFAF3, 0x6BC3, 0x1ECF, 0xA268),
bn_pack4(0xB1D5, 0x10BD, 0x7EE7, 0x4D73),
bn_pack4(0xF9AB, 0x4819, 0x5DED, 0x7EA1),
bn_pack4(0x64F3, 0x1CC5, 0x0846, 0x851D),
bn_pack4(0x4597, 0xE899, 0xA025, 0x5DC1),
bn_pack4(0xDF31, 0x0EE0, 0x74AB, 0x6A36),
bn_pack4(0x6D2A, 0x13F8, 0x3F44, 0xF82D),
bn_pack4(0x062B, 0x3CF5, 0xB3A2, 0x78A6),
bn_pack4(0x7968, 0x3303, 0xED5B, 0xDD3A),
bn_pack4(0xFA9D, 0x4B7F, 0xA2C0, 0x87E8),
bn_pack4(0x4BCB, 0xC886, 0x2F83, 0x85DD),
bn_pack4(0x3473, 0xFC64, 0x6CEA, 0x306B),
bn_pack4(0x13EB, 0x57A8, 0x1A23, 0xF0C7),
bn_pack4(0x2222, 0x2E04, 0xA403, 0x7C07),
bn_pack4(0xE3FD, 0xB8BE, 0xFC84, 0x8AD9),
bn_pack4(0x238F, 0x16CB, 0xE39D, 0x652D),
bn_pack4(0x3423, 0xB474, 0x2BF1, 0xC978),
bn_pack4(0x3AAB, 0x639C, 0x5AE4, 0xF568),
bn_pack4(0x2576, 0xF693, 0x6BA4, 0x2466),
bn_pack4(0x741F, 0xA7BF, 0x8AFC, 0x47ED),
bn_pack4(0x3BC8, 0x32B6, 0x8D9D, 0xD300),
bn_pack4(0xD8BE, 0xC4D0, 0x73B9, 0x31BA),
bn_pack4(0x3877, 0x7CB6, 0xA932, 0xDF8C),
bn_pack4(0x74A3, 0x926F, 0x12FE, 0xE5E4),
bn_pack4(0xE694, 0xF91E, 0x6DBE, 0x1159),
bn_pack4(0x12BF, 0x2D5B, 0x0B74, 0x74D6),
bn_pack4(0x043E, 0x8F66, 0x3F48, 0x60EE),
bn_pack4(0x387F, 0xE8D7, 0x6E3C, 0x0468),
bn_pack4(0xDA56, 0xC9EC, 0x2EF2, 0x9632),
bn_pack4(0xEB19, 0xCCB1, 0xA313, 0xD55C),
bn_pack4(0xF550, 0xAA3D, 0x8A1F, 0xBFF0),
bn_pack4(0x06A1, 0xD58B, 0xB7C5, 0xDA76),
bn_pack4(0xA797, 0x15EE, 0xF29B, 0xE328),
bn_pack4(0x14CC, 0x5ED2, 0x0F80, 0x37E0),
bn_pack4(0xCC8F, 0x6D7E, 0xBF48, 0xE1D8),
bn_pack4(0x4BD4, 0x07B2, 0x2B41, 0x54AA),
bn_pack4(0x0F1D, 0x45B7, 0xFF58, 0x5AC5),
bn_pack4(0x23A9, 0x7A7E, 0x36CC, 0x88BE),
bn_pack4(0x59E7, 0xC97F, 0xBEC7, 0xE8F3),
bn_pack4(0xB5A8, 0x4031, 0x900B, 0x1C9E),
bn_pack4(0xD55E, 0x702F, 0x4698, 0x0C82),
bn_pack4(0xF482, 0xD7CE, 0x6E74, 0xFEF6),
bn_pack4(0xF032, 0xEA15, 0xD172, 0x1D03),
bn_pack4(0x5983, 0xCA01, 0xC64B, 0x92EC),
bn_pack4(0x6FB8, 0xF401, 0x378C, 0xD2BF),
bn_pack4(0x3320, 0x5151, 0x2BD7, 0xAF42),
bn_pack4(0xDB7F, 0x1447, 0xE6CC, 0x254B),
bn_pack4(0x44CE, 0x6CBA, 0xCED4, 0xBB1B),
bn_pack4(0xDA3E, 0xDBEB, 0xCF9B, 0x14ED),
bn_pack4(0x1797, 0x27B0, 0x865A, 0x8918),
bn_pack4(0xB06A, 0x53ED, 0x9027, 0xD831),
bn_pack4(0xE5DB, 0x382F, 0x4130, 0x01AE),
bn_pack4(0xF8FF, 0x9406, 0xAD9E, 0x530E),
bn_pack4(0xC975, 0x1E76, 0x3DBA, 0x37BD),
bn_pack4(0xC1D4, 0xDCB2, 0x6026, 0x46DE),
bn_pack4(0x36C3, 0xFAB4, 0xD27C, 0x7026),
bn_pack4(0x4DF4, 0x35C9, 0x3402, 0x8492),
bn_pack4(0x86FF, 0xB7DC, 0x90A6, 0xC08F),
bn_pack4(0x93B4, 0xEA98, 0x8D8F, 0xDDC1),
bn_pack4(0xD006, 0x9127, 0xD5B0, 0x5AA9),
bn_pack4(0xB81B, 0xDD76, 0x2170, 0x481C),
bn_pack4(0x1F61, 0x2970, 0xCEE2, 0xD7AF),
bn_pack4(0x233B, 0xA186, 0x515B, 0xE7ED),
bn_pack4(0x99B2, 0x964F, 0xA090, 0xC3A2),
bn_pack4(0x287C, 0x5947, 0x4E6B, 0xC05D),
bn_pack4(0x2E8E, 0xFC14, 0x1FBE, 0xCAA6),
bn_pack4(0xDBBB, 0xC2DB, 0x04DE, 0x8EF9),
bn_pack4(0x2583, 0xE9CA, 0x2AD4, 0x4CE8),
bn_pack4(0x1A94, 0x6834, 0xB615, 0x0BDA),
bn_pack4(0x99C3, 0x2718, 0x6AF4, 0xE23C),
bn_pack4(0x8871, 0x9A10, 0xBDBA, 0x5B26),
bn_pack4(0x1A72, 0x3C12, 0xA787, 0xE6D7),
bn_pack4(0x4B82, 0xD120, 0xA921, 0x0801),
bn_pack4(0x43DB, 0x5BFC, 0xE0FD, 0x108E),
bn_pack4(0x08E2, 0x4FA0, 0x74E5, 0xAB31),
bn_pack4(0x7709, 0x88C0, 0xBAD9, 0x46E2),
bn_pack4(0xBBE1, 0x1757, 0x7A61, 0x5D6C),
bn_pack4(0x521F, 0x2B18, 0x177B, 0x200C),
bn_pack4(0xD876, 0x0273, 0x3EC8, 0x6A64),
bn_pack4(0xF12F, 0xFA06, 0xD98A, 0x0864),
bn_pack4(0xCEE3, 0xD226, 0x1AD2, 0xEE6B),
bn_pack4(0x1E8C, 0x94E0, 0x4A25, 0x619D),
bn_pack4(0xABF5, 0xAE8C, 0xDB09, 0x33D7),
bn_pack4(0xB397, 0x0F85, 0xA6E1, 0xE4C7),
bn_pack4(0x8AEA, 0x7157, 0x5D06, 0x0C7D),
bn_pack4(0xECFB, 0x8504, 0x58DB, 0xEF0A),
bn_pack4(0xA855, 0x21AB, 0xDF1C, 0xBA64),
bn_pack4(0xAD33, 0x170D, 0x0450, 0x7A33),
bn_pack4(0x1572, 0x8E5A, 0x8AAA, 0xC42D),
bn_pack4(0x15D2, 0x2618, 0x98FA, 0x0510),
bn_pack4(0x3995, 0x497C, 0xEA95, 0x6AE5),
bn_pack4(0xDE2B, 0xCBF6, 0x9558, 0x1718),
bn_pack4(0xB5C5, 0x5DF0, 0x6F4C, 0x52C9),
bn_pack4(0x9B27, 0x83A2, 0xEC07, 0xA28F),
bn_pack4(0xE39E, 0x772C, 0x180E, 0x8603),
bn_pack4(0x3290, 0x5E46, 0x2E36, 0xCE3B),
bn_pack4(0xF174, 0x6C08, 0xCA18, 0x217C),
bn_pack4(0x670C, 0x354E, 0x4ABC, 0x9804),
bn_pack4(0x9ED5, 0x2907, 0x7096, 0x966D),
bn_pack4(0x1C62, 0xF356, 0x2085, 0x52BB),
bn_pack4(0x8365, 0x5D23, 0xDCA3, 0xAD96),
bn_pack4(0x6916, 0x3FA8, 0xFD24, 0xCF5F),
bn_pack4(0x98DA, 0x4836, 0x1C55, 0xD39A),
bn_pack4(0xC200, 0x7CB8, 0xA163, 0xBF05),
bn_pack4(0x4928, 0x6651, 0xECE4, 0x5B3D),
bn_pack4(0xAE9F, 0x2411, 0x7C4B, 0x1FE6),
bn_pack4(0xEE38, 0x6BFB, 0x5A89, 0x9FA5),
bn_pack4(0x0BFF, 0x5CB6, 0xF406, 0xB7ED),
bn_pack4(0xF44C, 0x42E9, 0xA637, 0xED6B),
bn_pack4(0xE485, 0xB576, 0x625E, 0x7EC6),
bn_pack4(0x4FE1, 0x356D, 0x6D51, 0xC245),
bn_pack4(0x302B, 0x0A6D, 0xF25F, 0x1437),
bn_pack4(0xEF95, 0x19B3, 0xCD3A, 0x431B),
bn_pack4(0x514A, 0x0879, 0x8E34, 0x04DD),
bn_pack4(0x020B, 0xBEA6, 0x3B13, 0x9B22),
bn_pack4(0x2902, 0x4E08, 0x8A67, 0xCC74),
bn_pack4(0xC4C6, 0x628B, 0x80DC, 0x1CD1),
bn_pack4(0xC90F, 0xDAA2, 0x2168, 0xC234),
bn_pack4(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF)
};
const BIGNUM bn_group_8192 = {
(BN_ULONG *)bn_group_8192_value,
OSSL_NELEM(bn_group_8192_value),
OSSL_NELEM(bn_group_8192_value),
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_generator_19_value[] = { 19 };
const BIGNUM bn_generator_19 = {
(BN_ULONG *)bn_generator_19_value,
1,
1,
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_generator_5_value[] = { 5 };
const BIGNUM bn_generator_5 = {
(BN_ULONG *)bn_generator_5_value,
1,
1,
0,
BN_FLG_STATIC_DATA
};
static const BN_ULONG bn_generator_2_value[] = { 2 };
const BIGNUM bn_generator_2 = {
(BN_ULONG *)bn_generator_2_value,
1,
1,
0,
BN_FLG_STATIC_DATA
};
#endif

View file

@ -0,0 +1,201 @@
/*
* Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include "internal/cryptlib.h"
#include "bn_lcl.h"
BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w)
{
#ifndef BN_LLONG
BN_ULONG ret = 0;
#else
BN_ULLONG ret = 0;
#endif
int i;
if (w == 0)
return (BN_ULONG)-1;
#ifndef BN_LLONG
/*
* If |w| is too long and we don't have BN_ULLONG then we need to fall
* back to using BN_div_word
*/
if (w > ((BN_ULONG)1 << BN_BITS4)) {
BIGNUM *tmp = BN_dup(a);
if (tmp == NULL)
return (BN_ULONG)-1;
ret = BN_div_word(tmp, w);
BN_free(tmp);
return ret;
}
#endif
bn_check_top(a);
w &= BN_MASK2;
for (i = a->top - 1; i >= 0; i--) {
#ifndef BN_LLONG
/*
* We can assume here that | w <= ((BN_ULONG)1 << BN_BITS4) | and so
* | ret < ((BN_ULONG)1 << BN_BITS4) | and therefore the shifts here are
* safe and will not overflow
*/
ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
#else
ret = (BN_ULLONG) (((ret << (BN_ULLONG) BN_BITS2) | a->d[i]) %
(BN_ULLONG) w);
#endif
}
return (BN_ULONG)ret;
}
BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w)
{
BN_ULONG ret = 0;
int i, j;
bn_check_top(a);
w &= BN_MASK2;
if (!w)
/* actually this an error (division by zero) */
return (BN_ULONG)-1;
if (a->top == 0)
return 0;
/* normalize input (so bn_div_words doesn't complain) */
j = BN_BITS2 - BN_num_bits_word(w);
w <<= j;
if (!BN_lshift(a, a, j))
return (BN_ULONG)-1;
for (i = a->top - 1; i >= 0; i--) {
BN_ULONG l, d;
l = a->d[i];
d = bn_div_words(ret, l, w);
ret = (l - ((d * w) & BN_MASK2)) & BN_MASK2;
a->d[i] = d;
}
if ((a->top > 0) && (a->d[a->top - 1] == 0))
a->top--;
ret >>= j;
if (!a->top)
a->neg = 0; /* don't allow negative zero */
bn_check_top(a);
return ret;
}
int BN_add_word(BIGNUM *a, BN_ULONG w)
{
BN_ULONG l;
int i;
bn_check_top(a);
w &= BN_MASK2;
/* degenerate case: w is zero */
if (!w)
return 1;
/* degenerate case: a is zero */
if (BN_is_zero(a))
return BN_set_word(a, w);
/* handle 'a' when negative */
if (a->neg) {
a->neg = 0;
i = BN_sub_word(a, w);
if (!BN_is_zero(a))
a->neg = !(a->neg);
return i;
}
for (i = 0; w != 0 && i < a->top; i++) {
a->d[i] = l = (a->d[i] + w) & BN_MASK2;
w = (w > l) ? 1 : 0;
}
if (w && i == a->top) {
if (bn_wexpand(a, a->top + 1) == NULL)
return 0;
a->top++;
a->d[i] = w;
}
bn_check_top(a);
return 1;
}
int BN_sub_word(BIGNUM *a, BN_ULONG w)
{
int i;
bn_check_top(a);
w &= BN_MASK2;
/* degenerate case: w is zero */
if (!w)
return 1;
/* degenerate case: a is zero */
if (BN_is_zero(a)) {
i = BN_set_word(a, w);
if (i != 0)
BN_set_negative(a, 1);
return i;
}
/* handle 'a' when negative */
if (a->neg) {
a->neg = 0;
i = BN_add_word(a, w);
a->neg = 1;
return i;
}
if ((a->top == 1) && (a->d[0] < w)) {
a->d[0] = w - a->d[0];
a->neg = 1;
return 1;
}
i = 0;
for (;;) {
if (a->d[i] >= w) {
a->d[i] -= w;
break;
} else {
a->d[i] = (a->d[i] - w) & BN_MASK2;
i++;
w = 1;
}
}
if ((a->d[i] == 0) && (i == (a->top - 1)))
a->top--;
bn_check_top(a);
return 1;
}
int BN_mul_word(BIGNUM *a, BN_ULONG w)
{
BN_ULONG ll;
bn_check_top(a);
w &= BN_MASK2;
if (a->top) {
if (w == 0)
BN_zero(a);
else {
ll = bn_mul_words(a->d, a->d, a->top, w);
if (ll) {
if (bn_wexpand(a, a->top + 1) == NULL)
return 0;
a->d[a->top++] = ll;
}
}
}
bn_check_top(a);
return 1;
}

View file

@ -0,0 +1,244 @@
/*
* Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*/
#include <stdio.h>
#include <openssl/bn.h>
#include "bn_lcl.h"
/* X9.31 routines for prime derivation */
/*
* X9.31 prime derivation. This is used to generate the primes pi (p1, p2,
* q1, q2) from a parameter Xpi by checking successive odd integers.
*/
static int bn_x931_derive_pi(BIGNUM *pi, const BIGNUM *Xpi, BN_CTX *ctx,
BN_GENCB *cb)
{
int i = 0, is_prime;
if (!BN_copy(pi, Xpi))
return 0;
if (!BN_is_odd(pi) && !BN_add_word(pi, 1))
return 0;
for (;;) {
i++;
BN_GENCB_call(cb, 0, i);
/* NB 27 MR is specified in X9.31 */
is_prime = BN_is_prime_fasttest_ex(pi, 27, ctx, 1, cb);
if (is_prime < 0)
return 0;
if (is_prime)
break;
if (!BN_add_word(pi, 2))
return 0;
}
BN_GENCB_call(cb, 2, i);
return 1;
}
/*
* This is the main X9.31 prime derivation function. From parameters Xp1, Xp2
* and Xp derive the prime p. If the parameters p1 or p2 are not NULL they
* will be returned too: this is needed for testing.
*/
int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
const BIGNUM *Xp, const BIGNUM *Xp1,
const BIGNUM *Xp2, const BIGNUM *e, BN_CTX *ctx,
BN_GENCB *cb)
{
int ret = 0;
BIGNUM *t, *p1p2, *pm1;
/* Only even e supported */
if (!BN_is_odd(e))
return 0;
BN_CTX_start(ctx);
if (p1 == NULL)
p1 = BN_CTX_get(ctx);
if (p2 == NULL)
p2 = BN_CTX_get(ctx);
t = BN_CTX_get(ctx);
p1p2 = BN_CTX_get(ctx);
pm1 = BN_CTX_get(ctx);
if (pm1 == NULL)
goto err;
if (!bn_x931_derive_pi(p1, Xp1, ctx, cb))
goto err;
if (!bn_x931_derive_pi(p2, Xp2, ctx, cb))
goto err;
if (!BN_mul(p1p2, p1, p2, ctx))
goto err;
/* First set p to value of Rp */
if (!BN_mod_inverse(p, p2, p1, ctx))
goto err;
if (!BN_mul(p, p, p2, ctx))
goto err;
if (!BN_mod_inverse(t, p1, p2, ctx))
goto err;
if (!BN_mul(t, t, p1, ctx))
goto err;
if (!BN_sub(p, p, t))
goto err;
if (p->neg && !BN_add(p, p, p1p2))
goto err;
/* p now equals Rp */
if (!BN_mod_sub(p, p, Xp, p1p2, ctx))
goto err;
if (!BN_add(p, p, Xp))
goto err;
/* p now equals Yp0 */
for (;;) {
int i = 1;
BN_GENCB_call(cb, 0, i++);
if (!BN_copy(pm1, p))
goto err;
if (!BN_sub_word(pm1, 1))
goto err;
if (!BN_gcd(t, pm1, e, ctx))
goto err;
if (BN_is_one(t)) {
/*
* X9.31 specifies 8 MR and 1 Lucas test or any prime test
* offering similar or better guarantees 50 MR is considerably
* better.
*/
int r = BN_is_prime_fasttest_ex(p, 50, ctx, 1, cb);
if (r < 0)
goto err;
if (r)
break;
}
if (!BN_add(p, p, p1p2))
goto err;
}
BN_GENCB_call(cb, 3, 0);
ret = 1;
err:
BN_CTX_end(ctx);
return ret;
}
/*
* Generate pair of parameters Xp, Xq for X9.31 prime generation. Note: nbits
* parameter is sum of number of bits in both.
*/
int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx)
{
BIGNUM *t;
int i;
/*
* Number of bits for each prime is of the form 512+128s for s = 0, 1,
* ...
*/
if ((nbits < 1024) || (nbits & 0xff))
return 0;
nbits >>= 1;
/*
* The random value Xp must be between sqrt(2) * 2^(nbits-1) and 2^nbits
* - 1. By setting the top two bits we ensure that the lower bound is
* exceeded.
*/
if (!BN_priv_rand(Xp, nbits, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ANY))
goto err;
BN_CTX_start(ctx);
t = BN_CTX_get(ctx);
if (t == NULL)
goto err;
for (i = 0; i < 1000; i++) {
if (!BN_priv_rand(Xq, nbits, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ANY))
goto err;
/* Check that |Xp - Xq| > 2^(nbits - 100) */
if (!BN_sub(t, Xp, Xq))
goto err;
if (BN_num_bits(t) > (nbits - 100))
break;
}
BN_CTX_end(ctx);
if (i < 1000)
return 1;
return 0;
err:
BN_CTX_end(ctx);
return 0;
}
/*
* Generate primes using X9.31 algorithm. Of the values p, p1, p2, Xp1 and
* Xp2 only 'p' needs to be non-NULL. If any of the others are not NULL the
* relevant parameter will be stored in it. Due to the fact that |Xp - Xq| >
* 2^(nbits - 100) must be satisfied Xp and Xq are generated using the
* previous function and supplied as input.
*/
int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
BIGNUM *Xp1, BIGNUM *Xp2,
const BIGNUM *Xp,
const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb)
{
int ret = 0;
BN_CTX_start(ctx);
if (Xp1 == NULL)
Xp1 = BN_CTX_get(ctx);
if (Xp2 == NULL)
Xp2 = BN_CTX_get(ctx);
if (Xp1 == NULL || Xp2 == NULL)
goto error;
if (!BN_priv_rand(Xp1, 101, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY))
goto error;
if (!BN_priv_rand(Xp2, 101, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY))
goto error;
if (!BN_X931_derive_prime_ex(p, p1, p2, Xp, Xp1, Xp2, e, ctx, cb))
goto error;
ret = 1;
error:
BN_CTX_end(ctx);
return ret;
}

View file

@ -0,0 +1,67 @@
LIBS=../../libcrypto
SOURCE[../../libcrypto]=\
bn_add.c bn_div.c bn_exp.c bn_lib.c bn_ctx.c bn_mul.c bn_mod.c \
bn_print.c bn_rand.c bn_shift.c bn_word.c bn_blind.c \
bn_kron.c bn_sqrt.c bn_gcd.c bn_prime.c bn_err.c bn_sqr.c \
{- $target{bn_asm_src} -} \
bn_recp.c bn_mont.c bn_mpi.c bn_exp2.c bn_gf2m.c bn_nist.c \
bn_depr.c bn_const.c bn_x931p.c bn_intern.c bn_dh.c bn_srp.c
INCLUDE[../../libcrypto]=../../crypto/include
INCLUDE[bn_exp.o]=..
GENERATE[bn-586.s]=asm/bn-586.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[bn-586.s]=../perlasm/x86asm.pl
GENERATE[co-586.s]=asm/co-586.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[co-586.s]=../perlasm/x86asm.pl
GENERATE[x86-mont.s]=asm/x86-mont.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[x86-mont.s]=../perlasm/x86asm.pl
GENERATE[x86-gf2m.s]=asm/x86-gf2m.pl \
$(PERLASM_SCHEME) $(LIB_CFLAGS) $(LIB_CPPFLAGS) $(PROCESSOR)
DEPEND[x86-gf2m.s]=../perlasm/x86asm.pl
GENERATE[sparcv9a-mont.S]=asm/sparcv9a-mont.pl $(PERLASM_SCHEME)
INCLUDE[sparcv9a-mont.o]=..
GENERATE[sparcv9-mont.S]=asm/sparcv9-mont.pl $(PERLASM_SCHEME)
INCLUDE[sparcv9-mont.o]=..
GENERATE[vis3-mont.S]=asm/vis3-mont.pl $(PERLASM_SCHEME)
INCLUDE[vis3-mont.o]=..
GENERATE[sparct4-mont.S]=asm/sparct4-mont.pl $(PERLASM_SCHEME)
INCLUDE[sparct4-mont.o]=..
GENERATE[sparcv9-gf2m.S]=asm/sparcv9-gf2m.pl $(PERLASM_SCHEME)
INCLUDE[sparcv9-gf2m.o]=..
GENERATE[bn-mips.S]=asm/mips.pl $(PERLASM_SCHEME)
INCLUDE[bn-mips.o]=..
GENERATE[mips-mont.S]=asm/mips-mont.pl $(PERLASM_SCHEME)
INCLUDE[mips-mont.o]=..
GENERATE[s390x-mont.S]=asm/s390x-mont.pl $(PERLASM_SCHEME)
GENERATE[s390x-gf2m.s]=asm/s390x-gf2m.pl $(PERLASM_SCHEME)
GENERATE[x86_64-mont.s]=asm/x86_64-mont.pl $(PERLASM_SCHEME)
GENERATE[x86_64-mont5.s]=asm/x86_64-mont5.pl $(PERLASM_SCHEME)
GENERATE[x86_64-gf2m.s]=asm/x86_64-gf2m.pl $(PERLASM_SCHEME)
GENERATE[rsaz-x86_64.s]=asm/rsaz-x86_64.pl $(PERLASM_SCHEME)
GENERATE[rsaz-avx2.s]=asm/rsaz-avx2.pl $(PERLASM_SCHEME)
GENERATE[bn-ia64.s]=asm/ia64.S
GENERATE[ia64-mont.s]=asm/ia64-mont.pl $(LIB_CFLAGS) $(LIB_CPPFLAGS)
GENERATE[parisc-mont.s]=asm/parisc-mont.pl $(PERLASM_SCHEME)
# ppc - AIX, Linux, MacOS X...
GENERATE[bn-ppc.s]=asm/ppc.pl $(PERLASM_SCHEME)
GENERATE[ppc-mont.s]=asm/ppc-mont.pl $(PERLASM_SCHEME)
GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl $(PERLASM_SCHEME)
GENERATE[alpha-mont.S]=asm/alpha-mont.pl $(PERLASM_SCHEME)
GENERATE[armv4-mont.S]=asm/armv4-mont.pl $(PERLASM_SCHEME)
INCLUDE[armv4-mont.o]=..
GENERATE[armv4-gf2m.S]=asm/armv4-gf2m.pl $(PERLASM_SCHEME)
INCLUDE[armv4-gf2m.o]=..
GENERATE[armv8-mont.S]=asm/armv8-mont.pl $(PERLASM_SCHEME)

View file

@ -0,0 +1,315 @@
/*
* Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2012, Intel Corporation. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
* (2) University of Haifa, Israel
*/
#include <openssl/opensslconf.h>
#include "rsaz_exp.h"
#ifndef RSAZ_ENABLED
NON_EMPTY_TRANSLATION_UNIT
#else
/*
* See crypto/bn/asm/rsaz-avx2.pl for further details.
*/
void rsaz_1024_norm2red_avx2(void *red, const void *norm);
void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b,
const void *n, BN_ULONG k);
void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k,
int cnt);
void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i);
void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i);
void rsaz_1024_red2norm_avx2(void *norm, const void *red);
#if defined(__GNUC__)
# define ALIGN64 __attribute__((aligned(64)))
#elif defined(_MSC_VER)
# define ALIGN64 __declspec(align(64))
#elif defined(__SUNPRO_C)
# define ALIGN64
# pragma align 64(one,two80)
#else
/* not fatal, might hurt performance a little */
# define ALIGN64
#endif
ALIGN64 static const BN_ULONG one[40] = {
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
ALIGN64 static const BN_ULONG two80[40] = {
0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
const BN_ULONG base_norm[16],
const BN_ULONG exponent[16],
const BN_ULONG m_norm[16], const BN_ULONG RR[16],
BN_ULONG k0)
{
unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */
unsigned char *p_str = storage + (64 - ((size_t)storage % 64));
unsigned char *a_inv, *m, *result;
unsigned char *table_s = p_str + 320 * 3;
unsigned char *R2 = table_s; /* borrow */
int index;
int wvalue;
if ((((size_t)p_str & 4095) + 320) >> 12) {
result = p_str;
a_inv = p_str + 320;
m = p_str + 320 * 2; /* should not cross page */
} else {
m = p_str; /* should not cross page */
result = p_str + 320;
a_inv = p_str + 320 * 2;
}
rsaz_1024_norm2red_avx2(m, m_norm);
rsaz_1024_norm2red_avx2(a_inv, base_norm);
rsaz_1024_norm2red_avx2(R2, RR);
rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
/* table[0] = 1 */
rsaz_1024_mul_avx2(result, R2, one, m, k0);
/* table[1] = a_inv^1 */
rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 0);
rsaz_1024_scatter5_avx2(table_s, a_inv, 1);
/* table[2] = a_inv^2 */
rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 2);
#if 0
/* this is almost 2x smaller and less than 1% slower */
for (index = 3; index < 32; index++) {
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, index);
}
#else
/* table[4] = a_inv^4 */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 4);
/* table[8] = a_inv^8 */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 8);
/* table[16] = a_inv^16 */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 16);
/* table[17] = a_inv^17 */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 17);
/* table[3] */
rsaz_1024_gather5_avx2(result, table_s, 2);
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 3);
/* table[6] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 6);
/* table[12] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 12);
/* table[24] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 24);
/* table[25] */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 25);
/* table[5] */
rsaz_1024_gather5_avx2(result, table_s, 4);
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 5);
/* table[10] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 10);
/* table[20] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 20);
/* table[21] */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 21);
/* table[7] */
rsaz_1024_gather5_avx2(result, table_s, 6);
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 7);
/* table[14] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 14);
/* table[28] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 28);
/* table[29] */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 29);
/* table[9] */
rsaz_1024_gather5_avx2(result, table_s, 8);
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 9);
/* table[18] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 18);
/* table[19] */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 19);
/* table[11] */
rsaz_1024_gather5_avx2(result, table_s, 10);
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 11);
/* table[22] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 22);
/* table[23] */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 23);
/* table[13] */
rsaz_1024_gather5_avx2(result, table_s, 12);
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 13);
/* table[26] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 26);
/* table[27] */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 27);
/* table[15] */
rsaz_1024_gather5_avx2(result, table_s, 14);
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 15);
/* table[30] */
rsaz_1024_sqr_avx2(result, result, m, k0, 1);
rsaz_1024_scatter5_avx2(table_s, result, 30);
/* table[31] */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
rsaz_1024_scatter5_avx2(table_s, result, 31);
#endif
/* load first window */
p_str = (unsigned char *)exponent;
wvalue = p_str[127] >> 3;
rsaz_1024_gather5_avx2(result, table_s, wvalue);
index = 1014;
while (index > -1) { /* loop for the remaining 127 windows */
rsaz_1024_sqr_avx2(result, result, m, k0, 5);
wvalue = (p_str[(index / 8) + 1] << 8) | p_str[index / 8];
wvalue = (wvalue >> (index % 8)) & 31;
index -= 5;
rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
}
/* square four times */
rsaz_1024_sqr_avx2(result, result, m, k0, 4);
wvalue = p_str[0] & 15;
rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */
rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
/* from Montgomery */
rsaz_1024_mul_avx2(result, result, one, m, k0);
rsaz_1024_red2norm_avx2(result_norm, result);
OPENSSL_cleanse(storage, sizeof(storage));
}
/*
* See crypto/bn/rsaz-x86_64.pl for further details.
*/
void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n,
BN_ULONG k);
void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n,
BN_ULONG k, const void *tbl, unsigned int power);
void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl,
const void *n, BN_ULONG k, unsigned int power);
void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k);
void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k,
int cnt);
void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
void RSAZ_512_mod_exp(BN_ULONG result[8],
const BN_ULONG base[8], const BN_ULONG exponent[8],
const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
{
unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */
unsigned char *table = storage + (64 - ((size_t)storage % 64));
BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8);
BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8);
unsigned char *p_str = (unsigned char *)exponent;
int index;
unsigned int wvalue;
/* table[0] = 1_inv */
temp[0] = 0 - m[0];
temp[1] = ~m[1];
temp[2] = ~m[2];
temp[3] = ~m[3];
temp[4] = ~m[4];
temp[5] = ~m[5];
temp[6] = ~m[6];
temp[7] = ~m[7];
rsaz_512_scatter4(table, temp, 0);
/* table [1] = a_inv^1 */
rsaz_512_mul(a_inv, base, RR, m, k0);
rsaz_512_scatter4(table, a_inv, 1);
/* table [2] = a_inv^2 */
rsaz_512_sqr(temp, a_inv, m, k0, 1);
rsaz_512_scatter4(table, temp, 2);
for (index = 3; index < 16; index++)
rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
/* load first window */
wvalue = p_str[63];
rsaz_512_gather4(temp, table, wvalue >> 4);
rsaz_512_sqr(temp, temp, m, k0, 4);
rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf);
for (index = 62; index >= 0; index--) {
wvalue = p_str[index];
rsaz_512_sqr(temp, temp, m, k0, 4);
rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4);
rsaz_512_sqr(temp, temp, m, k0, 4);
rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f);
}
/* from Montgomery */
rsaz_512_mul_by_one(result, temp, m, k0);
OPENSSL_cleanse(storage, sizeof(storage));
}
#endif

View file

@ -0,0 +1,40 @@
/*
* Copyright 2013-2018 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2012, Intel Corporation. All Rights Reserved.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
* Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
* (2) University of Haifa, Israel
*/
#ifndef RSAZ_EXP_H
# define RSAZ_EXP_H
# undef RSAZ_ENABLED
# if defined(OPENSSL_BN_ASM_MONT) && \
(defined(__x86_64) || defined(__x86_64__) || \
defined(_M_AMD64) || defined(_M_X64))
# define RSAZ_ENABLED
# include <openssl/bn.h>
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16],
const BN_ULONG base_norm[16],
const BN_ULONG exponent[16],
const BN_ULONG m_norm[16], const BN_ULONG RR[16],
BN_ULONG k0);
int rsaz_avx2_eligible(void);
void RSAZ_512_mod_exp(BN_ULONG result[8],
const BN_ULONG base_norm[8], const BN_ULONG exponent[8],
const BN_ULONG m_norm[8], BN_ULONG k0,
const BN_ULONG RR[8]);
# endif
#endif