mirror of
				https://github.com/ossrs/srs.git
				synced 2025-03-09 15:49:59 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			467 lines
		
	
	
	
		
			7.8 KiB
		
	
	
	
		
			Raku
		
	
	
	
	
	
			
		
		
	
	
			467 lines
		
	
	
	
		
			7.8 KiB
		
	
	
	
		
			Raku
		
	
	
	
	
	
| #! /usr/bin/env perl
 | |
| # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
 | |
| #
 | |
| # Licensed under the OpenSSL license (the "License").  You may not use
 | |
| # this file except in compliance with the License.  You can obtain a copy
 | |
| # in the file LICENSE in the source distribution or at
 | |
| # https://www.openssl.org/source/license.html
 | |
| 
 | |
| #
 | |
| # ====================================================================
 | |
| # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 | |
| # project. The module is, however, dual licensed under OpenSSL and
 | |
| # CRYPTOGAMS licenses depending on where you obtain it. For further
 | |
| # details see http://www.openssl.org/~appro/cryptogams/.
 | |
| # ====================================================================
 | |
| #
 | |
| # March 2010
 | |
| #
 | |
| # The module implements "4-bit" GCM GHASH function and underlying
 | |
| # single multiplication operation in GF(2^128). "4-bit" means that it
 | |
| # uses 256 bytes per-key table [+128 bytes shared table]. Even though
 | |
| # loops are aggressively modulo-scheduled in respect to references to
 | |
| # Htbl and Z.hi updates for 8 cycles per byte, measured performance is
 | |
| # ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
 | |
| # scheduling "glitch," because uprofile(1) indicates uniform sample
 | |
| # distribution, as if all instruction bundles execute in 1.5 cycles.
 | |
| # Meaning that it could have been even faster, yet 12 cycles is ~60%
 | |
| # better than gcc-generated code and ~80% than code generated by vendor
 | |
| # compiler.
 | |
| 
 | |
# Symbolic names for the Alpha registers referenced by the assembly
# templates below.  A "$n" note gives the hardware register number of
# the first register named on that line.

($cnt, $t0, $t1, $t2) = ("v0", "t0", "t1", "t2");		# v0 is $0
($Thi0, $Tlo0, $Thi1, $Tlo1) = ("t3", "t4", "t5", "t6");	# t3 is $4
$rem = "t7";							# $8
#################
($Xi, $Htbl, $inp, $len) = ("a0", "a1", "a2", "a3");	# a0 is $16, input argument block
($nlo, $nhi) = ("a4", "a5");				# a4 is $20
($Zhi, $Zlo) = ("t8", "t9");
($Xhi, $Xlo, $remp) = ("t10", "t11", "t12");		# t10 is $24
$rem_4bit = "AT";					# $28
 | |
| 
 | |
# loop() appends one copy of the multiplication body to $code.
#
# The emitted code walks the bytes of $Xlo (byte 7 down to byte 0, in
# .Looplo) and then the bytes of $Xhi (byte 7 down to byte 0, in
# .Loophi).  Each byte is split into two nibbles that index the table at
# $Htbl, and the low four bits shifted out of $Zlo index the table at
# $rem_4bit.  The running value is kept in $Zhi:$Zlo, which is where the
# result is left for the code that follows the expansion.
#
# Takes no arguments.  The template is expanded more than once per
# output file, so the private counter $N is bumped on every call and
# appended to the local labels to keep them unique.
{
    my $N;	# count of expansions so far; becomes the label suffix

    sub loop() {
	++$N;
	$code .= <<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
    }
}
 | |
| 
 | |
# File preamble: pick up the register-name header for the platform and
# set the assembler modes used by everything emitted afterwards.  This
# assignment (re)starts the $code buffer.
$code = <<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
___

# gcm_gmult_4bit entry: load both halves of the block at $Xi, then call
# picmeup so that $rem_4bit holds the address of the rem_4bit table.
$code .= <<___;
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

# Multiplication body; leaves the product in $Zhi:$Zlo.
loop();

# Byte-swap $Zlo and $Zhi into $Xlo and $Xhi, store them back to the
# block at $Xi and return.
$code .= <<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___
 | |
| 
 | |
# Callee-saved registers that hold the current 16-byte input block.
($inhi, $inlo) = ("s0", "s1");

# gcm_ghash_4bit entry: open a 32-byte frame, save ra/s0/s1, start the
# unaligned loads (ldq_u) of the first input block, load the block at
# $Xi and call picmeup to set up $rem_4bit.  At .Louter the unaligned
# quadwords are merged with extql/extqh, $inp advances by 16, $len drops
# by 16 and the input block is XORed into $Xhi:$Xlo.
$code .= <<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

# Multiplication body; leaves the product in $Zhi:$Zlo.
loop();

# Byte swap of the product back into $Xhi:$Xlo.  While $len is non-zero
# the swap is interleaved with the unaligned loads of the next block and
# control returns to .Louter; otherwise .Ldone finishes the swap, stores
# the block back to $Xi, restores s0/s1, releases the frame and returns.
$code .= <<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit
___

# picmeup: position-independent lookup of the rem_4bit table.  The br
# captures the address of .Lpic in $rem_4bit and the lda adds 12, i.e.
# it skips the three instructions (lda, ret, nop) that sit between .Lpic
# and the table.  The return address is taken from $t0, matching the
# "bsr $t0,picmeup" call sites.  The table and the identification string
# follow.
$code .= <<___;

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
 | |
# Output stage.  The last command-line argument, when present and true,
# names the file that receives the generated assembly; otherwise the
# text goes to the inherited STDOUT.
#
# The redirect uses three-argument open so the file name is never parsed
# for mode characters, and a failed open aborts immediately with the
# offending path instead of only surfacing later as a generic close error.
$output = pop;
if ($output) {
    open STDOUT, '>', $output
	or die "can't open $output: $!";
}
print $code;
# Buffered write errors only show up at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";
 | |
| 
 |