mirror of
git://git.code.sf.net/p/cdesktopenv/code
synced 2025-03-09 15:50:02 +00:00
This commit ports performance optimizations from illumos for the libsum
code (used by the cksum and sum builtins):
98bea71f0d
The new codepath in libsum uses prefetching and loop unrolling to
improve performance (prefetching is done with __builtin_prefetch()
or sun_prefetch_read_many() if either is available).
Script for testing (note that cksum must be enabled in
src/cmd/ksh93/data/builtins.c):
#!/bin/ksh
builtin cksum || exit 1
for ((i=0; i!=50000; i++)) do
cksum -x att /etc/hosts
done >/dev/null
Results on Linux x86_64 (using CCFLAGS=-O2):
$ echo 'UNPATCHED:'; time arch/linux.i386-64/bin/ksh /tmp/foo; echo 'PATCHED:'; time /tmp/ksh /tmp/foo
UNPATCHED:
real 0m09.989s
user 0m07.582s
sys 0m02.406s
PATCHED:
real 0m06.536s
user 0m04.331s
sys 0m02.204s
src/lib/libsum/{sum-att.c,sum-crc.c,Mamfile}:
- Port the performance optimizations from illumos to 93u+m libsum. To
prevent problems with older versions of GCC, avoid the new codepath
if GCC is older than the 3.1 release series. Additionally, the ast.h
header must be included to handle tcc defining __GNUC__ on FreeBSD.
- Apply some build fixes to allow the new codepath to build with Clang
3.6 and newer (my own testing indicates an even better performance
improvement with Clang than with GCC).
130 lines
3.8 KiB
C
130 lines
3.8 KiB
C
/***********************************************************************
|
|
* *
|
|
* This software is part of the ast package *
|
|
* Copyright (c) 1996-2011 AT&T Intellectual Property *
|
|
* Copyright (c) 2020-2021 Contributors to ksh 93u+m *
|
|
* and is licensed under the *
|
|
* Eclipse Public License, Version 1.0 *
|
|
* by AT&T Intellectual Property *
|
|
* *
|
|
* A copy of the License is available at *
|
|
* http://www.eclipse.org/org/documents/epl-v10.html *
|
|
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
|
|
* *
|
|
* Information and Software Systems Research *
|
|
* AT&T Research *
|
|
* Florham Park NJ *
|
|
* *
|
|
* Glenn Fowler <gsf@research.att.com> *
|
|
* *
|
|
***********************************************************************/
|
|
|
|
/*
|
|
* att
|
|
*/
|
|
|
|
#include <ast.h>
|
|
|
|
/*
 * Settings for the att method.
 *
 * att_description is one string literal written across several source
 * lines with backslash-newline splices, so nothing (not even a blank line)
 * may be placed between those lines.  The \b sequences are part of the
 * string data and are kept exactly as written.
 *
 * The open, init, print and data hooks are aliases for the long_* routines,
 * which are not defined in this section.
 * NOTE(review): att_scale is presumably the block size used when sizes are
 * reported -- confirm where it is consumed.
 */
#define att_description \
"The system 5 release 4 checksum. This is the default for \bsum\b \
when \bgetconf UNIVERSE\b is \batt\b. This is the only true sum; \
all of the other methods are order dependent."
#define att_options	0
#define att_match	"att|sys5|s5|default"
#define att_open	long_open
#define att_init	long_init
#define att_print	long_print
#define att_data	long_data
#define att_scale	512
|
|
|
|
/*
 * Choose the prefetching att_block.  It is used with Sun Studio, or with a
 * compiler that defines __GNUC__ as version 3.1 or later; clang releases
 * older than 3.6 are excluded.  The condition is one logical line joined
 * by a backslash-newline, so the two lines must stay adjacent.
 *
 * This #if is intentionally left open here: its #else and #endif come
 * after the function definitions that follow.
 */
#if !(defined(__clang__) && (__clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 6))) && \
	(defined(__SUNPRO_C) || (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 0))))

/*
 * sum_prefetch(addr): read-prefetch hint for addr, mapped onto whichever
 * primitive the compiler offers.  For __builtin_prefetch the arguments
 * 0 and 3 select "read" access and the highest locality level.
 */
#if defined(__SUNPRO_C)
# include <sun_prefetch.h>
# define sum_prefetch(addr) sun_prefetch_read_many((void *)(addr))
#elif defined(__GNUC__)
# define sum_prefetch(addr) __builtin_prefetch((addr), 0, 3)
#endif

/* number of bytes att_block sums per pass of its main loop */
#define CBLOCK_SIZE (64)

/* clang is given a loop-level unroll hint inside att_block instead */
#if !defined(__clang__)
#pragma unroll(16)
#endif
|
|
|
|
/* Inmos transputer would love this algorithm */
|
|
static int
|
|
att_block(register Sum_t* p, const void* s, size_t n)
|
|
{
|
|
register uint32_t c = ((Integral_t*)p)->sum;
|
|
register const unsigned char* b = (const unsigned char*)s;
|
|
register const unsigned char* e = b + n;
|
|
register uint32_t s0, s1, s2, s3, s4, s5, s6, s7;
|
|
register unsigned int i;
|
|
|
|
s0=s1=s2=s3=s4=s5=s6=s7=0U;
|
|
|
|
sum_prefetch((void *)b);
|
|
|
|
while (n > CBLOCK_SIZE)
|
|
{
|
|
sum_prefetch((b+CBLOCK_SIZE));
|
|
|
|
/* Compiler will unroll for() loops per #pragma unroll */
|
|
#if defined(__clang__)
|
|
#pragma clang loop unroll_count(16)
|
|
#endif
|
|
for (i=0 ; i < (CBLOCK_SIZE/8) ; i++)
|
|
{
|
|
/*
|
|
* use s0-s7 to decouple calculations (this improves pipelining)
|
|
* because each operation is completely independent from its
|
|
* siblings
|
|
*/
|
|
s0+=b[0];
|
|
s1+=b[1];
|
|
s2+=b[2];
|
|
s3+=b[3];
|
|
s4+=b[4];
|
|
s5+=b[5];
|
|
s6+=b[6];
|
|
s7+=b[7];
|
|
|
|
b+=8;
|
|
n-=8;
|
|
}
|
|
}
|
|
|
|
c+=s0+s1+s2+s3+s4+s5+s6+s7;
|
|
|
|
while (b < e)
|
|
c += *b++;
|
|
((Integral_t*)p)->sum = c;
|
|
return 0;
|
|
}
|
|
|
|
#else
|
|
static int
|
|
att_block(register Sum_t* p, const void* s, size_t n)
|
|
{
|
|
register uint32_t c = ((Integral_t*)p)->sum;
|
|
register unsigned char* b = (unsigned char*)s;
|
|
register unsigned char* e = b + n;
|
|
|
|
while (b < e)
|
|
c += *b++;
|
|
((Integral_t*)p)->sum = c;
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
static int
|
|
att_done(Sum_t* p)
|
|
{
|
|
register uint32_t c = ((Integral_t*)p)->sum;
|
|
|
|
c = (c & 0xffff) + ((c >> 16) & 0xffff);
|
|
c = (c & 0xffff) + (c >> 16);
|
|
((Integral_t*)p)->sum = c & 0xffff;
|
|
return short_done(p);
|
|
}
|