mirror of
git://git.code.sf.net/p/cdesktopenv/code
synced 2025-03-09 15:50:02 +00:00
This change is pulled from here: https://github.com/oracle/solaris-userland/blob/master/components/ksh93/patches/055-CR7026179.patch -- it is a fix for wide-character handling in the wc (word count) library.
515 lines
11 KiB
C
515 lines
11 KiB
C
/***********************************************************************
|
|
* *
|
|
* This software is part of the ast package *
|
|
* Copyright (c) 1992-2011 AT&T Intellectual Property *
|
|
* and is licensed under the *
|
|
* Eclipse Public License, Version 1.0 *
|
|
* by AT&T Intellectual Property *
|
|
* *
|
|
* A copy of the License is available at *
|
|
* http://www.eclipse.org/org/documents/epl-v10.html *
|
|
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
|
|
* *
|
|
* Information and Software Systems Research *
|
|
* AT&T Research *
|
|
* Florham Park NJ *
|
|
* *
|
|
* Glenn Fowler <gsf@research.att.com> *
|
|
* David Korn <dgk@research.att.com> *
|
|
* *
|
|
***********************************************************************/
|
|
#pragma prototyped
|
|
/*
|
|
* David Korn
|
|
* AT&T Bell Laboratories
|
|
*
|
|
* library interface for word count
|
|
*/
|
|
|
|
#include <cmd.h>
|
|
#include <wc.h>
|
|
#include <ctype.h>
|
|
|
|
#if _hdr_wchar && _hdr_wctype && _lib_iswctype
|
|
|
|
#include <wchar.h>
|
|
#include <wctype.h>
|
|
#include <lc.h>
|
|
|
|
#else
|
|
|
|
#ifndef iswspace
|
|
#define iswspace(x) isspace(x)
|
|
#endif
|
|
|
|
#endif
|
|
|
|
/*
 * per-byte classification bits kept in the Wc_t type[] table
 *
 * for UTF-8 lead bytes wc_init() also stores the number of continuation
 * bytes that must follow in the low three bits (WC_MB+1 ... WC_MB+5),
 * which wc_count() reads back with (c&7)
 */
#define WC_SP		0x08	/* byte is white space */
#define WC_NL		0x10	/* byte is a new-line */
#define WC_MB		0x20	/* byte is part of a multibyte sequence */
#define WC_ERR		0x40	/* byte is rejected as invalid multibyte input */

/* predicates on a type[] entry */
#define eol(c)		((c)&WC_NL)
#define mbc(c)		((c)&WC_MB)
#define spc(c)		((c)&WC_SP)

/*
 * convert the multibyte sequence at p (at most n bytes) into the wide
 * character w through the ast.mb_towc hook; every argument is
 * parenthesized so that expression arguments expand as one operand
 */
#define mb2wc(w,p,n)	(*ast.mb_towc)(&(w),(char*)(p),(n))
|
|
|
|
Wc_t* wc_init(int mode)
|
|
{
|
|
register int n;
|
|
register int w;
|
|
Wc_t* wp;
|
|
|
|
if (!(wp = (Wc_t*)stakalloc(sizeof(Wc_t))))
|
|
return 0;
|
|
if (!mbwide())
|
|
wp->mb = 0;
|
|
#if _hdr_wchar && _hdr_wctype && _lib_iswctype
|
|
else if (!(mode & WC_NOUTF8) && (lcinfo(LC_CTYPE)->lc->flags & LC_utf8))
|
|
wp->mb = 1;
|
|
#endif
|
|
else
|
|
wp->mb = -1;
|
|
w = mode & WC_WORDS;
|
|
for (n = (1<<CHAR_BIT); --n >= 0;)
|
|
wp->type[n] = (w && isspace(n)) ? WC_SP : 0;
|
|
wp->type['\n'] = WC_SP|WC_NL;
|
|
if ((mode & (WC_MBYTE|WC_WORDS)) && wp->mb > 0)
|
|
{
|
|
for (n = 0; n < 64; n++)
|
|
{
|
|
wp->type[0x80+n] |= WC_MB;
|
|
if (n<32)
|
|
wp->type[0xc0+n] |= WC_MB+1;
|
|
else if (n<48)
|
|
wp->type[0xc0+n] |= WC_MB+2;
|
|
else if (n<56)
|
|
wp->type[0xc0+n] |= WC_MB+3;
|
|
else if (n<60)
|
|
wp->type[0xc0+n] |= WC_MB+4;
|
|
else if (n<62)
|
|
wp->type[0xc0+n] |= WC_MB+5;
|
|
}
|
|
wp->type[0xc0] = WC_MB|WC_ERR;
|
|
wp->type[0xc1] = WC_MB|WC_ERR;
|
|
wp->type[0xfe] = WC_MB|WC_ERR;
|
|
wp->type[0xff] = WC_MB|WC_ERR;
|
|
}
|
|
wp->mode = mode;
|
|
return wp;
|
|
}
|
|
|
|
static int invalid(const char *file, int nlines)
|
|
{
|
|
error_info.file = (char*)file;
|
|
error_info.line = nlines;
|
|
error(ERROR_SYSTEM|1, "invalid multibyte character");
|
|
error_info.file = 0;
|
|
error_info.line = 0;
|
|
return nlines;
|
|
}
|
|
|
|
/*
 * handle utf space characters
 *
 * one step of the state machine that recognizes UTF-8 encoded white
 * space; <state> names the bytes matched so far and <c> is the next
 * byte of the sequence
 *
 *	1	after 0xe1		4	after 0xe1 0x9a
 *	2	after 0xe2		6	after 0xe2 0x80
 *	3	after 0xe3		7	after 0xe2 0x81
 *	8	after 0xc2		5	after 0xe3 0x80
 *
 * returns the next state, 10 when <c> completes a white space
 * character, or 0 when the sequence is not white space; a <state>
 * not listed above is returned unchanged
 */

static int chkstate(int state, register unsigned int c)
{
	switch(state)
	{
	case 1:
		state = (c==0x9a?4:0);
		break;
	case 2:
		state = ((c==0x80||c==0x81)?6+(c&1):0);
		break;
	case 3:
		state = (c==0x80?5:0);
		break;
	case 4:
		/* 0xe1 0x9a 0x80 is U+1680 */
		state = (c==0x80?10:0);
		break;
	case 5:
		/* 0xe3 0x80 0x80 is U+3000 */
		state = (c==0x80?10:0);
		break;
	case 6:
		state = 0;
		/*
		 * U+2028 and U+2029 (line and paragraph separator) encode as
		 * 0xe2 0x80 0xa8 and 0xe2 0x80 0xa9; the bytes 0xa0 and 0xa1
		 * tested here previously are U+2020 and U+2021, the daggers
		 */
		if(c==0xa8 || c==0xa9)
			return(10);
		else if((c&0xf0)== 0x80)
		{
			/* U+2007 counts only if the locale says it is a space */
			if((c&=0xf)==7)
				return(iswspace(0x2007)?10:0);
			/*
			 * U+2000 ... U+200B
			 * NOTE(review): U+200B is ZERO WIDTH SPACE, which not
			 * every libc classifies as white space -- confirm intent
			 */
			if(c<=0xb)
				return(10);
		}
		/* U+202F counts only if the locale says it is a space */
		else if(c==0xaf && iswspace(0x202f))
			return(10);
		break;
	case 7:
		/* 0xe2 0x81 0x9f is U+205F */
		state = (c==0x9f?10:0);
		break;
	case 8:
		/* after 0xc2 the second byte value equals the code point */
		return (iswspace(c)?10:0);
	default:
		break;
	}
	return state;
}
|
|
|
|
/*
|
|
* compute the line, word, and character count for file <fd>
|
|
*/
|
|
|
|
int wc_count(Wc_t *wp, Sfio_t *fd, const char* file)
|
|
{
|
|
register char* type = wp->type;
|
|
register unsigned char* cp;
|
|
register Sfoff_t nbytes;
|
|
register Sfoff_t nchars;
|
|
register Sfoff_t nwords;
|
|
register Sfoff_t nlines;
|
|
register Sfoff_t eline = -1;
|
|
register Sfoff_t longest = 0;
|
|
register ssize_t c;
|
|
register unsigned char* endbuff;
|
|
register int lasttype = WC_SP;
|
|
unsigned int lastchar;
|
|
ssize_t n;
|
|
ssize_t o;
|
|
unsigned char* buff;
|
|
wchar_t x;
|
|
unsigned char side[32];
|
|
|
|
sfset(fd,SF_WRITE,1);
|
|
nlines = nwords = nchars = nbytes = 0;
|
|
wp->longest = 0;
|
|
if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
|
|
{
|
|
cp = buff = endbuff = 0;
|
|
for (;;)
|
|
{
|
|
if (cp >= endbuff || (n = mb2wc(x, cp, endbuff-cp)) < 0)
|
|
{
|
|
if ((o = endbuff-cp) < sizeof(side))
|
|
{
|
|
if (buff)
|
|
{
|
|
if (o)
|
|
memcpy(side, cp, o);
|
|
mbinit();
|
|
}
|
|
else
|
|
o = 0;
|
|
cp = side + o;
|
|
if (!(buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) || (n = sfvalue(fd)) <= 0)
|
|
{
|
|
if ((nchars - longest) > wp->longest)
|
|
wp->longest = nchars - longest;
|
|
break;
|
|
}
|
|
nbytes += n;
|
|
if ((c = sizeof(side) - o) > n)
|
|
c = n;
|
|
if (c)
|
|
memcpy(cp, buff, c);
|
|
endbuff = buff + n;
|
|
cp = side;
|
|
x = mbchar(cp);
|
|
if ((cp-side) < o)
|
|
{
|
|
cp = buff;
|
|
nchars += (cp-side) - 1;
|
|
}
|
|
else
|
|
cp = buff + (cp-side) - o;
|
|
}
|
|
else
|
|
{
|
|
cp++;
|
|
x = -1;
|
|
}
|
|
if (x == -1 && eline != nlines && !(wp->mode & WC_QUIET))
|
|
eline = invalid(file, nlines);
|
|
}
|
|
else
|
|
cp += n ? n : 1;
|
|
if (x == '\n')
|
|
{
|
|
if ((nchars - longest) > wp->longest)
|
|
wp->longest = nchars - longest;
|
|
longest = nchars + 1;
|
|
nlines++;
|
|
lasttype = 1;
|
|
}
|
|
else if (iswspace(x))
|
|
lasttype = 1;
|
|
else if (lasttype)
|
|
{
|
|
lasttype = 0;
|
|
nwords++;
|
|
}
|
|
nchars++;
|
|
}
|
|
if (!(wp->mode & WC_MBYTE))
|
|
nchars = nbytes;
|
|
}
|
|
else if (!wp->mb && !(wp->mode & WC_LONGEST) || wp->mb > 0 && !(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
|
|
{
|
|
if (!(wp->mode & (WC_MBYTE|WC_WORDS|WC_LONGEST)))
|
|
{
|
|
while ((cp = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
|
|
{
|
|
nchars += c;
|
|
endbuff = cp + c;
|
|
if (*--endbuff == '\n')
|
|
nlines++;
|
|
else
|
|
*endbuff = '\n';
|
|
for (;;)
|
|
if (*cp++ == '\n')
|
|
{
|
|
if (cp > endbuff)
|
|
break;
|
|
nlines++;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
|
|
{
|
|
nchars += c;
|
|
/* check to see whether first character terminates word */
|
|
if (c==1)
|
|
{
|
|
if (eol(lasttype))
|
|
nlines++;
|
|
if ((c = type[*cp]) && !lasttype)
|
|
nwords++;
|
|
lasttype = c;
|
|
continue;
|
|
}
|
|
if (!lasttype && type[*cp])
|
|
nwords++;
|
|
lastchar = cp[--c];
|
|
*(endbuff = cp+c) = '\n';
|
|
c = lasttype;
|
|
/* process each buffer */
|
|
for (;;)
|
|
{
|
|
/* process spaces and new-lines */
|
|
do
|
|
{
|
|
if (eol(c))
|
|
for (;;)
|
|
{
|
|
/* check for end of buffer */
|
|
if (cp > endbuff)
|
|
goto beob;
|
|
nlines++;
|
|
if (*cp != '\n')
|
|
break;
|
|
cp++;
|
|
}
|
|
} while (c = type[*cp++]);
|
|
/* skip over word characters */
|
|
while (!(c = type[*cp++]));
|
|
nwords++;
|
|
}
|
|
beob:
|
|
if ((cp -= 2) >= buff)
|
|
c = type[*cp];
|
|
else
|
|
c = lasttype;
|
|
lasttype = type[lastchar];
|
|
/* see if was in word */
|
|
if (!c && !lasttype)
|
|
nwords--;
|
|
}
|
|
if (eol(lasttype))
|
|
nlines++;
|
|
else if (!lasttype)
|
|
nwords++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int lineoff=0;
|
|
int skip=0;
|
|
int adjust=0;
|
|
int state=0;
|
|
int oldc;
|
|
int xspace;
|
|
int wasspace = 1;
|
|
unsigned char* start;
|
|
int flagm = 0;
|
|
|
|
lastchar = 0;
|
|
start = (endbuff = side) + 1;
|
|
xspace = iswspace(0xa0) || iswspace(0x85);
|
|
while ((cp = buff = (unsigned char*)sfreserve(fd, SF_UNBOUND, 0)) && (c = sfvalue(fd)) > 0)
|
|
{
|
|
nbytes += c;
|
|
nchars += c;
|
|
start = cp-lineoff;
|
|
/* check to see whether first character terminates word */
|
|
if(c==1)
|
|
{
|
|
if(eol(lasttype))
|
|
nlines++;
|
|
if((c = type[*cp]) && !lasttype)
|
|
nwords++;
|
|
lasttype = c;
|
|
endbuff = start;
|
|
continue;
|
|
}
|
|
lastchar = cp[--c];
|
|
endbuff = cp+c;
|
|
cp[c] = '\n';
|
|
if(mbc(lasttype))
|
|
{
|
|
c = lasttype;
|
|
flagm = 1;
|
|
goto mbyte;
|
|
}
|
|
if(!lasttype && spc(type[*cp]))
|
|
nwords++;
|
|
c = lasttype;
|
|
/* process each buffer */
|
|
for (;;)
|
|
{
|
|
/* process spaces and new-lines */
|
|
spaces:
|
|
do
|
|
{
|
|
if (eol(c))
|
|
{
|
|
/* check for end of buffer */
|
|
if (cp > endbuff)
|
|
goto eob;
|
|
if(wp->mode&WC_LONGEST)
|
|
{
|
|
if((cp-start)-adjust > longest)
|
|
longest = (cp-start)-adjust-1;
|
|
start = cp;
|
|
}
|
|
nlines++;
|
|
nchars -= adjust;
|
|
adjust = 0;
|
|
}
|
|
} while (spc(c = type[*cp++]));
|
|
wasspace=1;
|
|
if(mbc(c))
|
|
{
|
|
mbyte:
|
|
do
|
|
{
|
|
if(c&WC_ERR)
|
|
goto err;
|
|
if(skip && (c&7))
|
|
break;
|
|
if(!skip)
|
|
{
|
|
if(!(c&7))
|
|
{
|
|
skip=1;
|
|
break;
|
|
}
|
|
skip = (c&7);
|
|
adjust += skip;
|
|
state = 0;
|
|
if(flagm==1)
|
|
{
|
|
flagm = 0;
|
|
oldc = *cp;
|
|
if(xspace && (iswspace(*cp)==1))
|
|
state = 8;
|
|
continue;
|
|
}
|
|
if(skip==2 && (cp[-1]&0xc)==0 && (state=(cp[-1]&0x3)))
|
|
oldc = *cp;
|
|
else if(xspace && cp[-1]==0xc2)
|
|
{
|
|
state = 8;
|
|
oldc = *cp;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
skip--;
|
|
if(state && (state=chkstate(state,oldc)))
|
|
{
|
|
if(state==10)
|
|
{
|
|
if(!wasspace)
|
|
nwords++;
|
|
wasspace = 1;
|
|
state=0;
|
|
goto spaces;
|
|
}
|
|
oldc = *cp;
|
|
}
|
|
}
|
|
} while (mbc(c = type[*cp++]));
|
|
wasspace = 0;
|
|
if(skip)
|
|
{
|
|
if(eol(c) && (cp > endbuff))
|
|
goto eob;
|
|
err:
|
|
skip = 0;
|
|
state = 0;
|
|
if(eline!=nlines && !(wp->mode & WC_QUIET))
|
|
eline = invalid(file, nlines);
|
|
while(mbc(c) && ((c|WC_ERR) || (c&7)==0))
|
|
c=type[*cp++];
|
|
if(eol(c) && (cp > endbuff))
|
|
{
|
|
c = WC_MB|WC_ERR;
|
|
goto eob;
|
|
}
|
|
if(mbc(c))
|
|
goto mbyte;
|
|
else if(c&WC_SP)
|
|
goto spaces;
|
|
}
|
|
if(spc(c))
|
|
{
|
|
nwords++;
|
|
continue;
|
|
}
|
|
}
|
|
/* skip over word characters */
|
|
while(!(c = type[*cp++]));
|
|
if(mbc(c))
|
|
goto mbyte;
|
|
nwords++;
|
|
}
|
|
eob:
|
|
lineoff = cp-start;
|
|
if((cp -= 2) >= buff)
|
|
c = type[*cp];
|
|
else
|
|
c = lasttype;
|
|
lasttype = type[lastchar];
|
|
/* see if was in word */
|
|
if(!c && !lasttype)
|
|
nwords--;
|
|
}
|
|
if ((wp->mode&WC_LONGEST) && ((endbuff + 1 - start) - adjust - (lastchar == '\n')) > longest)
|
|
longest = (endbuff + 1 - start) - adjust - (lastchar == '\n');
|
|
wp->longest = longest;
|
|
if (eol(lasttype))
|
|
nlines++;
|
|
else if (!lasttype)
|
|
nwords++;
|
|
if (wp->mode & WC_MBYTE)
|
|
nchars -= adjust;
|
|
else
|
|
nchars = nbytes;
|
|
}
|
|
wp->chars = nchars;
|
|
wp->words = nwords;
|
|
wp->lines = nlines;
|
|
return 0;
|
|
}
|
|
|