mirror of
git://git.code.sf.net/p/cdesktopenv/code
synced 2025-02-13 03:32:24 +00:00
Port cksum builtin performance improvements from illumos (#391)
This commit ports performance optimizations from illumos for the libsum
code (used by the cksum and sum builtins):
98bea71f0d
The new codepath in libsum uses prefetching and loop unrolling to
improve performance (prefetching is done with __builtin_prefetch()
or sun_prefetch_read_many() if either is available).
Script for testing (note that cksum must be enabled in
src/cmd/ksh93/data/builtins.c):
#!/bin/ksh
builtin cksum || exit 1
for ((i=0; i!=50000; i++)) do
cksum -x att /etc/hosts
done >/dev/null
Results on Linux x86_64 (using CCFLAGS=-O2):
$ echo 'UNPATCHED:'; time arch/linux.i386-64/bin/ksh /tmp/foo; echo 'PATCHED'; time /tmp/ksh /tmp/foo
UNPATCHED:
real 0m09.989s
user 0m07.582s
sys 0m02.406s
PATCHED:
real 0m06.536s
user 0m04.331s
sys 0m02.204s
src/lib/libsum/{sum-att.c,sum-crc.c,Mamfile}:
- Port the performance optimizations from illumos to 93u+m libsum. To
prevent problems with older versions of GCC, avoid the new codepath
if GCC is older than the 3.1 release series. Additionally, the ast.h
header must be included to handle tcc defining __GNUC__ on FreeBSD.
- Apply some build fixes to allow the new codepath to build with Clang
3.6 and newer (my own testing indicates an even better performance
improvement with Clang than with GCC).
This commit is contained in:
parent
8f9d1bec97
commit
4032050249
4 changed files with 236 additions and 9 deletions
4
NEWS
4
NEWS
|
@ -14,6 +14,10 @@ Any uppercase BUG_* names are modernish shell bug IDs.
|
|||
Note that to use these options the operating system must support the
|
||||
corresponding resource limit.
|
||||
|
||||
- Ported performance optimizations from illumos to improve the performance
|
||||
of the cksum builtin. (Note that the cksum builtin is not enabled by
|
||||
default.)
|
||||
|
||||
2021-12-27:
|
||||
|
||||
- Two bash-like flags for 'whence' were backported from ksh 93v-:
|
||||
|
|
|
@ -88,14 +88,10 @@ make install
|
|||
prev ${PACKAGE_ast_INCLUDE}/ast_common.h implicit
|
||||
done ${PACKAGE_ast_INCLUDE}/fnv.h
|
||||
done sum-prng.c
|
||||
make sum-crc.c implicit
|
||||
done sum-crc.c
|
||||
make sum-bsd.c implicit
|
||||
done sum-bsd.c
|
||||
make sum-ast4.c implicit
|
||||
done sum-ast4.c
|
||||
make sum-att.c implicit
|
||||
done sum-att.c
|
||||
make FEATURE/sum implicit
|
||||
meta FEATURE/sum features/%>FEATURE/% features/sum sum
|
||||
make features/sum
|
||||
|
@ -171,6 +167,12 @@ make install
|
|||
prev ${PACKAGE_ast_INCLUDE}/ast_std.h implicit
|
||||
done ${PACKAGE_ast_INCLUDE}/ast.h dontcare
|
||||
done sum.h
|
||||
make sum-att.c implicit
|
||||
prev ${PACKAGE_ast_INCLUDE}/ast.h implicit
|
||||
done sum-att.c
|
||||
make sum-crc.c implicit
|
||||
prev ${PACKAGE_ast_INCLUDE}/ast.h implicit
|
||||
done sum-crc.c
|
||||
done sumlib.c
|
||||
meta sumlib.o %.c>%.o sumlib.c sumlib
|
||||
prev sumlib.c
|
||||
|
|
|
@ -23,6 +23,8 @@
|
|||
* att
|
||||
*/
|
||||
|
||||
#include <ast.h>
|
||||
|
||||
#define att_description \
|
||||
"The system 5 release 4 checksum. This is the default for \bsum\b \
|
||||
when \bgetconf UNIVERSE\b is \batt\b. This is the only true sum; \
|
||||
|
@ -35,6 +37,73 @@
|
|||
#define att_data long_data
|
||||
#define att_scale 512
|
||||
|
||||
#if !(defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 6))) && \
|
||||
(defined(__SUNPRO_C) || (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))))
|
||||
|
||||
#if defined(__SUNPRO_C)
|
||||
# include <sun_prefetch.h>
|
||||
# define sum_prefetch(addr) sun_prefetch_read_many((void *)(addr))
|
||||
#elif defined(__GNUC__)
|
||||
# define sum_prefetch(addr) __builtin_prefetch((addr), 0, 3)
|
||||
#endif
|
||||
|
||||
#define CBLOCK_SIZE (64)
|
||||
#if !defined(__clang__)
|
||||
#pragma unroll(16)
|
||||
#endif
|
||||
|
||||
/* Inmos transputer would love this algorithm */
|
||||
static int
|
||||
att_block(register Sum_t* p, const void* s, size_t n)
|
||||
{
|
||||
register uint32_t c = ((Integral_t*)p)->sum;
|
||||
register const unsigned char* b = (const unsigned char*)s;
|
||||
register const unsigned char* e = b + n;
|
||||
register uint32_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
register unsigned int i;
|
||||
|
||||
s0=s1=s2=s3=s4=s5=s6=s7=0U;
|
||||
|
||||
sum_prefetch((void *)b);
|
||||
|
||||
while (n > CBLOCK_SIZE)
|
||||
{
|
||||
sum_prefetch((b+CBLOCK_SIZE));
|
||||
|
||||
/* Compiler will unroll for() loops per #pragma unroll */
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop unroll_count(16)
|
||||
#endif
|
||||
for (i=0 ; i < (CBLOCK_SIZE/8) ; i++)
|
||||
{
|
||||
/*
|
||||
* use s0-s7 to decouple calculations (this improves pipelining)
|
||||
* because each operation is completely independent from its
|
||||
* siblings
|
||||
*/
|
||||
s0+=b[0];
|
||||
s1+=b[1];
|
||||
s2+=b[2];
|
||||
s3+=b[3];
|
||||
s4+=b[4];
|
||||
s5+=b[5];
|
||||
s6+=b[6];
|
||||
s7+=b[7];
|
||||
|
||||
b+=8;
|
||||
n-=8;
|
||||
}
|
||||
}
|
||||
|
||||
c+=s0+s1+s2+s3+s4+s5+s6+s7;
|
||||
|
||||
while (b < e)
|
||||
c += *b++;
|
||||
((Integral_t*)p)->sum = c;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else
|
||||
static int
|
||||
att_block(register Sum_t* p, const void* s, size_t n)
|
||||
{
|
||||
|
@ -47,6 +116,7 @@ att_block(register Sum_t* p, const void* s, size_t n)
|
|||
((Integral_t*)p)->sum = c;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
att_done(Sum_t* p)
|
||||
|
|
|
@ -23,6 +23,8 @@
|
|||
* crc
|
||||
*/
|
||||
|
||||
#include <ast.h>
|
||||
|
||||
#define crc_description \
|
||||
"32 bit CRC (cyclic redundancy check)."
|
||||
#define crc_options "\
|
||||
|
@ -48,7 +50,8 @@ typedef struct Crc_s
|
|||
Crcnum_t init;
|
||||
Crcnum_t done;
|
||||
Crcnum_t xorsize;
|
||||
Crcnum_t tab[256];
|
||||
const Crcnum_t *tab; /* use |const| to give the compiler a hint that the data won't change */
|
||||
Crcnum_t tabdata[256];
|
||||
unsigned int addsize;
|
||||
unsigned int rotate;
|
||||
} Crc_t;
|
||||
|
@ -56,6 +59,62 @@ typedef struct Crc_s
|
|||
#define CRC(p,s,c) (s = (s >> 8) ^ (p)->tab[(s ^ (c)) & 0xff])
|
||||
#define CRCROTATE(p,s,c) (s = (s << 8) ^ (p)->tab[((s >> 24) ^ (c)) & 0xff])
|
||||
|
||||
static const
|
||||
Crcnum_t posix_cksum_tab[256] = {
|
||||
0x00000000U,
|
||||
0x04c11db7U, 0x09823b6eU, 0x0d4326d9U, 0x130476dcU, 0x17c56b6bU,
|
||||
0x1a864db2U, 0x1e475005U, 0x2608edb8U, 0x22c9f00fU, 0x2f8ad6d6U,
|
||||
0x2b4bcb61U, 0x350c9b64U, 0x31cd86d3U, 0x3c8ea00aU, 0x384fbdbdU,
|
||||
0x4c11db70U, 0x48d0c6c7U, 0x4593e01eU, 0x4152fda9U, 0x5f15adacU,
|
||||
0x5bd4b01bU, 0x569796c2U, 0x52568b75U, 0x6a1936c8U, 0x6ed82b7fU,
|
||||
0x639b0da6U, 0x675a1011U, 0x791d4014U, 0x7ddc5da3U, 0x709f7b7aU,
|
||||
0x745e66cdU, 0x9823b6e0U, 0x9ce2ab57U, 0x91a18d8eU, 0x95609039U,
|
||||
0x8b27c03cU, 0x8fe6dd8bU, 0x82a5fb52U, 0x8664e6e5U, 0xbe2b5b58U,
|
||||
0xbaea46efU, 0xb7a96036U, 0xb3687d81U, 0xad2f2d84U, 0xa9ee3033U,
|
||||
0xa4ad16eaU, 0xa06c0b5dU, 0xd4326d90U, 0xd0f37027U, 0xddb056feU,
|
||||
0xd9714b49U, 0xc7361b4cU, 0xc3f706fbU, 0xceb42022U, 0xca753d95U,
|
||||
0xf23a8028U, 0xf6fb9d9fU, 0xfbb8bb46U, 0xff79a6f1U, 0xe13ef6f4U,
|
||||
0xe5ffeb43U, 0xe8bccd9aU, 0xec7dd02dU, 0x34867077U, 0x30476dc0U,
|
||||
0x3d044b19U, 0x39c556aeU, 0x278206abU, 0x23431b1cU, 0x2e003dc5U,
|
||||
0x2ac12072U, 0x128e9dcfU, 0x164f8078U, 0x1b0ca6a1U, 0x1fcdbb16U,
|
||||
0x018aeb13U, 0x054bf6a4U, 0x0808d07dU, 0x0cc9cdcaU, 0x7897ab07U,
|
||||
0x7c56b6b0U, 0x71159069U, 0x75d48ddeU, 0x6b93dddbU, 0x6f52c06cU,
|
||||
0x6211e6b5U, 0x66d0fb02U, 0x5e9f46bfU, 0x5a5e5b08U, 0x571d7dd1U,
|
||||
0x53dc6066U, 0x4d9b3063U, 0x495a2dd4U, 0x44190b0dU, 0x40d816baU,
|
||||
0xaca5c697U, 0xa864db20U, 0xa527fdf9U, 0xa1e6e04eU, 0xbfa1b04bU,
|
||||
0xbb60adfcU, 0xb6238b25U, 0xb2e29692U, 0x8aad2b2fU, 0x8e6c3698U,
|
||||
0x832f1041U, 0x87ee0df6U, 0x99a95df3U, 0x9d684044U, 0x902b669dU,
|
||||
0x94ea7b2aU, 0xe0b41de7U, 0xe4750050U, 0xe9362689U, 0xedf73b3eU,
|
||||
0xf3b06b3bU, 0xf771768cU, 0xfa325055U, 0xfef34de2U, 0xc6bcf05fU,
|
||||
0xc27dede8U, 0xcf3ecb31U, 0xcbffd686U, 0xd5b88683U, 0xd1799b34U,
|
||||
0xdc3abdedU, 0xd8fba05aU, 0x690ce0eeU, 0x6dcdfd59U, 0x608edb80U,
|
||||
0x644fc637U, 0x7a089632U, 0x7ec98b85U, 0x738aad5cU, 0x774bb0ebU,
|
||||
0x4f040d56U, 0x4bc510e1U, 0x46863638U, 0x42472b8fU, 0x5c007b8aU,
|
||||
0x58c1663dU, 0x558240e4U, 0x51435d53U, 0x251d3b9eU, 0x21dc2629U,
|
||||
0x2c9f00f0U, 0x285e1d47U, 0x36194d42U, 0x32d850f5U, 0x3f9b762cU,
|
||||
0x3b5a6b9bU, 0x0315d626U, 0x07d4cb91U, 0x0a97ed48U, 0x0e56f0ffU,
|
||||
0x1011a0faU, 0x14d0bd4dU, 0x19939b94U, 0x1d528623U, 0xf12f560eU,
|
||||
0xf5ee4bb9U, 0xf8ad6d60U, 0xfc6c70d7U, 0xe22b20d2U, 0xe6ea3d65U,
|
||||
0xeba91bbcU, 0xef68060bU, 0xd727bbb6U, 0xd3e6a601U, 0xdea580d8U,
|
||||
0xda649d6fU, 0xc423cd6aU, 0xc0e2d0ddU, 0xcda1f604U, 0xc960ebb3U,
|
||||
0xbd3e8d7eU, 0xb9ff90c9U, 0xb4bcb610U, 0xb07daba7U, 0xae3afba2U,
|
||||
0xaafbe615U, 0xa7b8c0ccU, 0xa379dd7bU, 0x9b3660c6U, 0x9ff77d71U,
|
||||
0x92b45ba8U, 0x9675461fU, 0x8832161aU, 0x8cf30badU, 0x81b02d74U,
|
||||
0x857130c3U, 0x5d8a9099U, 0x594b8d2eU, 0x5408abf7U, 0x50c9b640U,
|
||||
0x4e8ee645U, 0x4a4ffbf2U, 0x470cdd2bU, 0x43cdc09cU, 0x7b827d21U,
|
||||
0x7f436096U, 0x7200464fU, 0x76c15bf8U, 0x68860bfdU, 0x6c47164aU,
|
||||
0x61043093U, 0x65c52d24U, 0x119b4be9U, 0x155a565eU, 0x18197087U,
|
||||
0x1cd86d30U, 0x029f3d35U, 0x065e2082U, 0x0b1d065bU, 0x0fdc1becU,
|
||||
0x3793a651U, 0x3352bbe6U, 0x3e119d3fU, 0x3ad08088U, 0x2497d08dU,
|
||||
0x2056cd3aU, 0x2d15ebe3U, 0x29d4f654U, 0xc5a92679U, 0xc1683bceU,
|
||||
0xcc2b1d17U, 0xc8ea00a0U, 0xd6ad50a5U, 0xd26c4d12U, 0xdf2f6bcbU,
|
||||
0xdbee767cU, 0xe3a1cbc1U, 0xe760d676U, 0xea23f0afU, 0xeee2ed18U,
|
||||
0xf0a5bd1dU, 0xf464a0aaU, 0xf9278673U, 0xfde69bc4U, 0x89b8fd09U,
|
||||
0x8d79e0beU, 0x803ac667U, 0x84fbdbd0U, 0x9abc8bd5U, 0x9e7d9662U,
|
||||
0x933eb0bbU, 0x97ffad0cU, 0xafb010b1U, 0xab710d06U, 0xa6322bdfU,
|
||||
0xa2f33668U, 0xbcb4666dU, 0xb8757bdaU, 0xb5365d03U, 0xb1f740b4U
|
||||
};
|
||||
|
||||
static Sum_t*
|
||||
crc_open(const Method_t* method, const char* name)
|
||||
{
|
||||
|
@ -73,6 +132,20 @@ crc_open(const Method_t* method, const char* name)
|
|||
sum->method = (Method_t*)method;
|
||||
sum->name = name;
|
||||
}
|
||||
|
||||
if(!strcmp(name, "crc-0x04c11db7-rotate-done-size"))
|
||||
{
|
||||
sum->init=0;
|
||||
sum->done=0xffffffff;
|
||||
sum->xorsize=0x0;
|
||||
sum->addsize=0x1;
|
||||
sum->rotate=1;
|
||||
|
||||
/* Optimized codepath for POSIX cksum to save startup time */
|
||||
sum->tab=posix_cksum_tab;
|
||||
}
|
||||
else
|
||||
{
|
||||
polynomial = 0xedb88320;
|
||||
s = name;
|
||||
while (*(t = s))
|
||||
|
@ -106,7 +179,7 @@ crc_open(const Method_t* method, const char* name)
|
|||
p[0] = polynomial;
|
||||
for (i = 1; i < 8; i++)
|
||||
p[i] = (p[i-1] << 1) ^ ((p[i-1] & 0x80000000) ? polynomial : 0);
|
||||
for (i = 0; i < elementsof(sum->tab); i++)
|
||||
for (i = 0; i < elementsof(sum->tabdata); i++)
|
||||
{
|
||||
t = 0;
|
||||
x = i;
|
||||
|
@ -116,19 +189,23 @@ crc_open(const Method_t* method, const char* name)
|
|||
t ^= p[j];
|
||||
x >>= 1;
|
||||
}
|
||||
sum->tab[i] = t;
|
||||
sum->tabdata[i] = t;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = 0; i < elementsof(sum->tab); i++)
|
||||
for (i = 0; i < elementsof(sum->tabdata); i++)
|
||||
{
|
||||
x = i;
|
||||
for (j = 0; j < 8; j++)
|
||||
x = (x>>1) ^ ((x & 1) ? polynomial : 0);
|
||||
sum->tab[i] = x;
|
||||
sum->tabdata[i] = x;
|
||||
}
|
||||
|
||||
sum->tab=sum->tabdata;
|
||||
}
|
||||
}
|
||||
|
||||
return (Sum_t*)sum;
|
||||
}
|
||||
|
||||
|
@ -141,6 +218,79 @@ crc_init(Sum_t* p)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#if !(defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 6))) && \
|
||||
(defined(__SUNPRO_C) || (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))))
|
||||
|
||||
#if defined(__SUNPRO_C)
|
||||
# include <sun_prefetch.h>
|
||||
# define sum_prefetch(addr) sun_prefetch_read_many((void *)(addr))
|
||||
#elif defined(__GNUC__)
|
||||
# define sum_prefetch(addr) __builtin_prefetch((addr), 0, 3)
|
||||
#endif
|
||||
|
||||
#define CBLOCK_SIZE (64)
|
||||
#if !defined(__clang__)
|
||||
#pragma unroll(16)
|
||||
#endif
|
||||
|
||||
static int
|
||||
crc_block(Sum_t* p, const void* s, size_t n)
|
||||
{
|
||||
Crc_t* sum = (Crc_t*)p;
|
||||
register Crcnum_t c = sum->sum;
|
||||
register const unsigned char* b = (const unsigned char*)s;
|
||||
register const unsigned char* e = b + n;
|
||||
unsigned short i;
|
||||
|
||||
sum_prefetch(b);
|
||||
|
||||
if (sum->rotate)
|
||||
{
|
||||
while (n > CBLOCK_SIZE)
|
||||
{
|
||||
sum_prefetch(b+CBLOCK_SIZE);
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop unroll_count(16)
|
||||
#endif
|
||||
for(i=0 ; i < CBLOCK_SIZE ; i++)
|
||||
{
|
||||
CRCROTATE(sum, c, *b++);
|
||||
}
|
||||
|
||||
n-=CBLOCK_SIZE;
|
||||
}
|
||||
|
||||
while (b < e)
|
||||
{
|
||||
CRCROTATE(sum, c, *b++);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (n > CBLOCK_SIZE)
|
||||
{
|
||||
sum_prefetch(b+CBLOCK_SIZE);
|
||||
#if defined(__clang__)
|
||||
#pragma clang loop unroll_count(16)
|
||||
#endif
|
||||
for(i=0 ; i < CBLOCK_SIZE ; i++)
|
||||
{
|
||||
CRC(sum, c, *b++);
|
||||
}
|
||||
|
||||
n-=CBLOCK_SIZE;
|
||||
}
|
||||
|
||||
while (b < e)
|
||||
{
|
||||
CRC(sum, c, *b++);
|
||||
}
|
||||
}
|
||||
|
||||
sum->sum = c;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static int
|
||||
crc_block(Sum_t* p, const void* s, size_t n)
|
||||
{
|
||||
|
@ -158,6 +308,7 @@ crc_block(Sum_t* p, const void* s, size_t n)
|
|||
sum->sum = c;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
crc_done(Sum_t* p)
|
||||
|
|
Loading…
Reference in a new issue