1
0
Fork 0
mirror of git://git.code.sf.net/p/cdesktopenv/code synced 2025-03-09 15:50:02 +00:00
cde/src/cmd/ksh93/sh/string.c
Martijn Dekker 7c4418ccdc Multibyte character handling overhaul; allow global disable
The SHOPT_MULTIBYTE compile-time option did not make much sense as
disabling it only disabled multibyte support for ksh/libshell, not
libast or libcmd built-in commands. This commit allows disabling
multibyte support for the entire codebase by defining the macro
AST_NOMULTIBYTE (e.g. via CCFLAGS). This slightly speeds up the
code and makes an optimised binary about 5% smaller.

src/lib/libast/include/ast.h:
- Add non-multibyte fallback versions of the multibyte macros that
  are used if AST_NOMULTIBYTE is defined. This should cause most
  multibyte handling to be automatically optimised out everywhere.
- Reformat the multibyte macros for legibility.
- Simplify mbchar() and mbsize() macros by defining them in
  terms of mbnchar() and mbnsize(), eliminating code duplication.
- Correct non-multibyte fallback of mbwidth(). For consistent
  behaviour, control characters and out-of-range values should
  return -1 as they do for UTF-8. The fallback is now the same as
  default_wcwidth() in src/lib/libast/comp/setlocale.c.

src/lib/libast/comp/setlocale.c:
- If AST_NOMULTIBYTE is defined, do not compile in the debug and
  UTF-8 locale conversion functions, including several large
  conversion tables. Define their fallback macros as 0 as these are
  used as function pointers.

src/cmd/ksh93/SHOPT.sh,
src/cmd/ksh93/Mamfile:
- Change the SHOPT_MULTIBYTE default to empty, indicating "probe".
- Synchronise SHOPT_MULTIBYTE with !AST_NOMULTIBYTE by default.

src/cmd/ksh93/include/defs.h:
- When SHOPT_MULTIBYTE is zero but AST_NOMULTIBYTE is not non-zero,
  then enable AST_NOMULTIBYTE here to use the ast.h non-multibyte
  fallbacks for ksh. When this is done, the effect is that
  multibyte is optimized out for ksh only, as before.
- Remove previous fallback for disabling multibyte (re: c2cb0eae).

src/cmd/ksh93/include/lexstates.h,
src/cmd/ksh93/sh/lex.c:
- Define SETLEN() macro to assign to LEN (i.e. _Fcin.fclen) for
  multibyte only and do not assign to it directly. With no
  SHOPT_MULTIBYTE, define that macro as empty. This allows removing
  multiple '#if SHOPT_MULTIBYTE' directives from lex.c, as that
  code will all be optimised out automatically if it's disabled.

src/cmd/ksh93/include/national.h,
src/cmd/ksh93/sh/string.c:
- Fix flagrantly incorrect non-multibyte fallback for sh_strchr().
  The latter returns an integer offset (-1 if not found), whereas
  strchr(3) returns a char pointer (NULL if not found). Incorporate
  the fallback into the function for correct handling instead of
  falling back to strchr(3) directly.

src/cmd/ksh93/sh/macro.c:
- lastchar() optimisation: avoid function call if SHOPT_MULTIBYTE
  is enabled but we're not actually in a multibyte locale.

src/cmd/ksh93/sh/name.c:
- Use ja_size() even with SHOPT_MULTIBYTE disabled (re: 2182ecfa).
  Though no regression tests failed, the non-multibyte fallback for
  typeset -L/-R/-Z length calculation was probably not quite
  correct as ja_size() does more. The ast.h change to mbwidth()
  ensures correct behaviour for non-multibyte locales.

src/cmd/ksh93/tests/shtests:
- Since its value in SHOPT.sh is now empty by default, add a quick
  feature test (for the length of the UTF-8 character 'é') to check
  if SHOPT_MULTIBYTE needs to be enabled for the regression tests.
2022-07-09 00:32:27 +02:00

728 lines
14 KiB
C

/***********************************************************************
* *
* This software is part of the ast package *
* Copyright (c) 1982-2012 AT&T Intellectual Property *
* Copyright (c) 2020-2022 Contributors to ksh 93u+m *
* and is licensed under the *
* Eclipse Public License, Version 1.0 *
* by AT&T Intellectual Property *
* *
* A copy of the License is available at *
* http://www.eclipse.org/org/documents/epl-v10.html *
* (with md5 checksum b35adb5213ca9657e911e9befb180842) *
* *
* Information and Software Systems Research *
* AT&T Research *
* Florham Park NJ *
* *
* David Korn <dgk@research.att.com> *
* *
***********************************************************************/
/*
* string processing routines for Korn shell
*
*/
#include "shopt.h"
#include <ast.h>
#include <ast_wchar.h>
#include "defs.h"
#include <stak.h>
#include <ccode.h>
#include "shtable.h"
#include "lexstates.h"
#include "national.h"
#if _hdr_wctype
# include <wctype.h>
#endif
#if !_lib_iswprint && !defined(iswprint)
# define iswprint(c) (((c)&~0377) || isprint(c))
#endif
/*
 * Small-table lookup.
 *
 * Walks <table>, whose entries lie <size> bytes apart, until an entry with an
 * empty name is reached, and returns the entry whose sh_name equals <sp>.
 * A null or empty <sp>, or a name that is not present, yields a pointer to a
 * static all-zero entry instead of a null pointer.
 * This is only used for small tables and is used to save non-shareable memory.
 */
const Shtable_t *sh_locate(register const char *sp,const Shtable_t *table,int size)
{
	static const Shtable_t	empty = {0,0};
	const Shtable_t		*entry;
	int			lead;	/* first character of the name being searched for */
	int			head;	/* first character of the current table name */

	if(!sp)
		return(&empty);
	lead = *sp;
	if(!lead)
		return(&empty);
	for(entry = table; (head = *entry->sh_name) != 0; entry = (Shtable_t*)((char*)entry + size))
	{
		/* with a native ASCII code set, give up once table names start with a later character */
		if(CC_NATIVE==CC_ASCII && head > lead)
			break;
		if(head == lead && strcmp(sp, entry->sh_name)==0)
			return(entry);
	}
	return(&empty);
}
/*
* shtab_options lookup routine
*
* Long-form option names are case-sensitive but insensitive to '-' and '_', and may be abbreviated to a
* non-arbitrary string. A no- prefix is skipped and inverts the meaning (special handling for 'notify').
* The table must be sorted in ASCII order after skipping the no- prefix.
*
* <sp> is the option name as given; it may carry a no- prefix and may end in "=value".
* <invert> points to a flag that is flipped once for each negation encountered: a no- prefix on <sp>,
* a no- prefix on the matching table name, and an "=value" suffix whose value strtol() reads as zero.
*
* Returns 0 if not found (also for a null or empty <sp>), -1 if multiple found (ambiguous), or the
* number of the option found.
*/
/* true if <c> is one of the separators that are skipped when comparing option names */
#define sep(c) ((c)=='-'||(c)=='_')
int sh_lookopt(register const char *sp, int *invert)
{
register int first;
register const Shtable_t *tp;
register int c;
/* s/t: cursors in the given name and the table name; sw/tw: start of the current word in each */
register const char *s, *t, *sw, *tw;
/* amb: set once more than one table name matched <sp> as an abbreviation */
int amb;
/* hit: option number of the only abbreviation match so far, else 0 */
int hit;
/* inv: the value of 'no' that belongs to 'hit' */
int inv;
/* no: whether the current table name counts as negated (no- prefix, possibly flipped by "=0") */
int no;
if(sp==0)
return(0);
/* strip a no- prefix from <sp>, but leave names that continue with "ti" (as in 'notify') alone */
if(*sp=='n' && *(sp+1)=='o' && (*(sp+2)!='t' || *(sp+3)!='i'))
{
sp+=2;
if(sep(*sp))
sp++;
*invert = !*invert;
}
if((first= *sp)==0)
return(0);
tp=shtab_options;
amb=hit=0;
/* walk the table until an entry whose name (after any no- prefix) is empty */
for(;;)
{
t=tp->sh_name;
/* a table name starting with "no" (but not "not") is stored negated; compare without the prefix */
if(no = *t=='n' && *(t+1)=='o' && *(t+2)!='t')
t+=2;
if(!(c= *t))
break;
/* only names with the same first character can match */
if(first == c)
{
/* exact match wins immediately */
if(strcmp(sp,t)==0)
{
*invert ^= no;
return(tp->sh_number);
}
s=sw=sp;
tw=t;
/* separator-insensitive, word-wise abbreviation match of <sp> against this table name */
for(;;)
{
/* reached the end of the given name, or the start of its "=value" part */
if(!*s || *s=='=')
{
/* "=value" with a value that strtol() reads as zero negates the option */
if (*s == '=' && !strtol(s+1, NiL, 0))
no = !no;
/* the table name is exhausted as well: complete match */
if (!*t)
{
*invert ^= no;
return(tp->sh_number);
}
/* otherwise <sp> abbreviates this name; a second such match makes the result ambiguous */
if (hit || amb)
{
hit = 0;
amb = 1;
}
else
{
hit = tp->sh_number;
inv = no;
}
break;
}
/* the given name is longer than the table name: no match */
else if(!*t)
break;
/* skip a separator in the given name; a new word starts after it */
else if(sep(*s))
sw = ++s;
/* skip a separator in the table name; a new word starts after it */
else if(sep(*t))
tw = ++t;
else if(*s==*t)
{
s++;
t++;
}
/* mismatch at the start of a word on both sides: this name does not match */
else if(s==sw && t==tw)
break;
else
{
/* mismatch inside a word: move on to the next word of the table name, if there is one */
if(t!=tw)
{
while(*t && !sep(*t))
t++;
if(!*t)
break;
tw = ++t;
}
/* back up in the given name, no further than the start of its current word, to a character equal to *t */
while (s>sw && *s!=*t)
s--;
}
}
}
tp = (Shtable_t*)((char*)tp+sizeof(*shtab_options));
}
/* no complete match was found: report the single abbreviation match, if any */
if(hit)
*invert ^= inv;
return(amb ? -1 : hit);
}
/*
 * Look for the first occurrence of the substring <oldsp> in <string> and
 * replace it with <newsp>.
 * The new string is built on the stack starting at offset 0 (stakseek(0))
 * and is frozen before it is returned.
 *
 * Returns the frozen stack string, or a null pointer if <string> is empty or
 * does not contain <oldsp>. An empty <oldsp> matches at the very start, so
 * the result is <newsp> followed by <string>.
 */
char *sh_substitute(const char *string,const char *oldsp,char *newsp)
/*@
	assume string!=NULL && oldsp!=NULL && newsp!=NULL;
	return x satisfying x==NULL ||
		strlen(x)==(strlen(in string)+strlen(in newsp)-strlen(in oldsp));
@*/
{
	register const char *sp = string;
	register const char *cp;
	const char *savesp = 0;	/* where the most recent failed match attempt started */
	stakseek(0);
	if(*sp==0)
		return((char*)0);
	if(*(cp=oldsp) == 0)
		goto found;
	mbinit();
	do
	{
		/*
		 * Copy up to the first byte which matches the start of oldsp.
		 * savesp==sp forces one character to be copied after a failed
		 * attempt, so that the same position is not tried again.
		 */
		while(*sp && (savesp==sp || *sp != *cp))
		{
#if SHOPT_MULTIBYTE
			/* copy a whole character at a time */
			int len = mbsize(sp);
			/*
			 * An invalid or incomplete sequence has no usable length.
			 * Copy that one byte verbatim rather than dropping it, so
			 * that the result keeps every byte of <string> outside the
			 * replaced part (and a zero length cannot stall the loop).
			 */
			if(len < 1)
				len = 1;
			while(len-- > 0)
				stakputc(*sp++);
#else
			stakputc(*sp++);
#endif /* SHOPT_MULTIBYTE */
		}
		if(*sp == 0)
			return((char*)0);
		/* try to match all of oldsp at this position */
		savesp = sp;
		for(;*cp;cp++)
		{
			if(*cp != *sp++)
				break;
		}
		if(*cp==0)
			/* match found */
			goto found;
		/* no match: rewind and resume the search */
		sp = savesp;
		cp = oldsp;
	}
	while(*sp);
	return((char*)0);
found:
	/* copy new */
	stakputs(newsp);
	/* copy rest of string */
	stakputs(sp);
	return(stakfreeze(1));
}
/*
 * TRIM(sp)
 * Strip escape characters in place: each backslash in <sp> is removed and
 * the character after it is kept, unless that character is a null byte
 * (a quoted null), in which case both are removed and scanning resumes
 * after it.
 * In a multibyte locale, characters longer than one byte are copied unchanged.
 * A null <sp> is ignored.
 */
void sh_trim(register char *sp)
/*@
	assume sp!=NULL;
	promise strlen(in sp) <= in strlen(sp);
@*/
{
	char	*out;	/* write position; never ahead of the read position sp */
	int	ch;
	int	width;

	if(!sp)
		return;
	for(out = sp; (ch = *sp) != 0; )
	{
		if(mbwide() && (width = mbsize(sp)) > 1)
		{
			/* source and destination may overlap, hence memmove */
			memmove(out, sp, width);
			out += width;
			sp += width;
			continue;
		}
		if(ch != '\\')
		{
			sp++;
			*out++ = ch;
			continue;
		}
		/* drop the backslash and take whatever follows it */
		ch = sp[1];
		sp += 2;
		if(ch)
			*out++ = ch;
	}
	*out = 0;
}
/*
 * Format <string> as a CSV field.
 *
 * Returns a null pointer for a null <string>, and <string> itself when every
 * character satisfies isaname(). Otherwise the field is written to the stack
 * enclosed in double quotes, with every embedded double quote doubled, and
 * null-terminated; the return value points at the start of that stack text,
 * which is not frozen.
 */
static char *sh_fmtcsv(const char *string)
{
	const char	*scan = string;	/* read cursor; mbchar() advances it */
	const char	*pending;	/* start of the input not yet written out */
	int		ch;
	int		base;

	if(!string)
		return((char*)0);
	base = staktell();
	/* find the first character that is not a name character */
	do
	{
		ch = mbchar(scan);
	}
	while(isaname(ch));
	if(ch==0)
		return((char*)string);
	stakputc('"');
	/* scan is already past the character found, so it is included here */
	stakwrite(string,scan-string);
	if(ch=='"')
		stakputc('"');
	pending = scan;
	while((ch = mbchar(scan)) != 0)
	{
		if(ch != '"')
			continue;
		/* write everything up to and including the quote, then double it */
		stakwrite(pending,scan-pending);
		pending = scan;
		stakputc('"');
	}
	/* step back onto the terminating null and write what is left */
	if(--scan > pending)
		stakwrite(pending,scan-pending);
	stakputc('"');
	stakputc(0);
	return(stakptr(base));
}
/*
 * Returns false if c is an invisible Unicode character, excluding ASCII space.
 * Outside a multibyte locale this is simply isprint(3).
 * In a multibyte locale iswgraph(3) is preferred, but in the ksh-specific
 * C.UTF-8 locale the OS-provided iswgraph(3) generally does not work. So two
 * known code points are probed first; if iswgraph(3) gets them wrong, a
 * built-in list of invisible characters is used instead.
 */
static int sh_isprint(int c)
{
	/* not in a multibyte locale: use plain isprint(3) */
	if(!mbwide())
		return(isprint(c));
	/* iswgraph(3) looks usable if it accepts U+5E38 and rejects U+FEFF */
	if(iswgraph(0x5E38) && !iswgraph(0xFEFF))
		return(c == ' ' || iswgraph(c));
	/* fallback */
	if(c <= 0x001F)				/* control characters */
		return(0);
	if(c >= 0x007F && c <= 0x009F)		/* control characters */
		return(0);
	if(c >= 0x2000 && c <= 0x200F)		/* spaces and format characters */
		return(0);
	if(c >= 0x2028 && c <= 0x202F)		/* separators and format characters */
		return(0);
	if(c >= 0x205F && c <= 0x206F)		/* various format characters */
		return(0);
	switch(c)
	{
	    case 0x00A0:	/* non-breaking space */
	    case 0x061C:	/* arabic letter mark */
	    case 0x1680:	/* ogham space mark */
	    case 0x180E:	/* mongolian vowel separator */
	    case 0x3000:	/* ideographic space */
	    case 0xFEFF:	/* zero-width non-breaking space */
		return(0);
	}
	return(1);
}
/*
* print <str> quoting chars so that it can be read by the shell
* puts null-terminated result on stack, but doesn't freeze it
*
* Returns a null pointer if <string> is null. Returns <string> itself, with nothing written to the
* stack, when it is a letter followed only by name characters, optionally ending in a single '='.
* Otherwise returns a pointer to the text written to the stack.
*
* A leading "name=" (or "name==") is copied verbatim and only the remainder is quoted.
* The remainder is written in one of three forms, chosen by scanning it once:
*   state 0: copied as is
*   state 1: enclosed in single quotes
*   state 2 or more: written as $'...' with backslash escapes
*/
char *sh_fmtq(const char *string)
{
/* cp: read cursor, advanced by mbchar(); op: start of the character most recently read */
register const char *cp = string, *op;
register int c, state;
int offset;
if(!cp)
return((char*)0);
mbinit();
offset = staktell();
/* an empty string needs quotes */
state = ((c= mbchar(cp))==0);
if(isaletter(c))
{
/* skip over a leading name */
while((c=mbchar(cp)),isaname(c));
/* nothing but a name: no quoting needed */
if(c==0)
return((char*)string);
if(c=='=')
{
/* "name=" with nothing after it: no quoting needed */
if(*cp==0)
return((char*)string);
if(*cp=='=')
cp++;
/* copy the assignment prefix unquoted; only what follows is examined and quoted */
c = cp - string;
stakwrite(string,c);
string = cp;
c = mbchar(cp);
}
}
/* an empty remainder, or one that starts with '#' or '~', needs single quotes */
if(c==0 || c=='#' || c=='~')
state = 1;
/* scan the rest to pick the quoting form */
for(;c;c= mbchar(cp))
{
/* a single quote or a non-printable character requires the $'...' form */
if(c=='\'' || !sh_isprint(c))
state = 2;
/* ']', '=', and ASCII characters other than ':' with a non-zero ST_NORM lexer state other than S_EPAT require quotes */
else if(c==']' || c=='=' || (c!=':' && c<=0x7f && (c=sh_lexstates[ST_NORM][c]) && c!=S_EPAT))
state |=1;
}
if(state<2)
{
if(state==1)
stakputc('\'');
/* cp is one past the terminating null here; step back onto it to get the length */
if(c = --cp - string)
stakwrite(string,c);
if(state==1)
stakputc('\'');
}
else
{
stakwrite("$'",2);
cp = string;
/* second pass: emit each character, escaped where necessary */
while(op = cp, c= mbchar(cp))
{
/* from here on, state is reused as a flag: 1 = write a backslash followed by c, 0 = copy the original bytes */
state=1;
switch(c)
{
/* the escape character: 033 where 'a' is 97, otherwise 39 */
case ('a'==97?'\033':39):
c = 'E';
break;
case '\n':
c = 'n';
break;
case '\r':
c = 'r';
break;
case '\t':
c = 't';
break;
case '\f':
c = 'f';
break;
case '\b':
c = 'b';
break;
case '\a':
c = 'a';
break;
/* backslash and single quote are written as themselves after a backslash */
case '\\': case '\'':
break;
default:
if(mbwide())
{
/* We're in a multibyte locale */
if(c<0 || c<128 && !isprint(c))
{
/* Invalid multibyte char, or unprintable ASCII char: quote as hex byte */
/* take just the first byte and resume scanning right after it */
c = *((unsigned char *)op);
cp = op+1;
goto quote_one_byte;
}
if(!sh_isprint(c))
{
/* Unicode hex code */
sfprintf(staksp,"\\u[%x]",c);
continue;
}
}
else if(!isprint(c))
{
quote_one_byte:
/* use the bracketed form when the next byte is a hex digit */
sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c);
continue;
}
/* printable: copy the original bytes */
state=0;
break;
}
if(state)
{
stakputc('\\');
stakputc(c);
}
else
stakwrite(op, cp-op);
}
stakputc('\'');
}
stakputc(0);
return(stakptr(offset));
}
/*
* print <str> quoting chars so that it can be read by the shell
* puts null-terminated result on stack, but doesn't freeze it
* single!=0 limits quoting to '...'
* fold>0 prints raw newlines and inserts appropriately
* escaped newlines every (fold-x) chars
*
* NOTE(review): as written, single!=0 hands the string to sh_fmtcsv() (double-quoted CSV field
* formatting) instead of limiting quoting to '...' -- confirm which is intended.
*
* A null or empty <string>, a <fold> of 8 or less, or a <string> shorter than fold-1 bytes is
* handed to sh_fmtq(). Otherwise the result is a pointer to the text written to the stack.
*/
char *sh_fmtqf(const char *string, int single, int fold)
{
/* cp: read cursor, advanced by mbchar() */
register const char *cp = string;
/* bp: start of the input that has not been written out yet */
register const char *bp;
/* vp: position just past the first character of the string (or of the value after "name=") */
register const char *vp;
register int c;
/* n: countdown used for folding */
register int n;
/* q: quoting needed for the current chunk: 0 none, bit 0 set = '...', bit 1 set = $'...' */
register int q;
/* a: '=' while a leading name is still being scanned, else 0 */
register int a;
int offset;
/* fold values of 8 or less disable folding */
if (--fold < 8)
fold = 0;
if(single)
return sh_fmtcsv(cp);
if (!cp || !*cp || !fold || fold && strlen(string) < fold)
return sh_fmtq(cp);
/* from here on fold is non-zero and single is zero */
offset = staktell();
/* single is zero here, so this selects 3 (both quoting bits) */
single = single ? 1 : 3;
/* look at the first character only; this advances the parameter <string>, which is not used again */
c = mbchar(string);
a = isaletter(c) ? '=' : 0;
vp = cp + 1;
/* each pass through this loop writes one chunk; chunks are joined by backslash-newline */
do
{
q = 0;
n = fold;
bp = cp;
/* scan ahead to decide which quoting form this chunk needs */
while ((!n || n-- > 0) && (c = mbchar(cp)))
{
/* the leading name ends at the first character that is not a name character */
if (a && !isaname(c))
a = 0;
#if SHOPT_MULTIBYTE
/* character codes of 0x200 and above are not examined further */
if (c >= 0x200)
continue;
if (c == '\'' || !iswprint(c))
#else
if (c == '\'' || !isprint(c))
#endif /* SHOPT_MULTIBYTE */
{
/* a single quote or non-printable character needs the form selected by 'single' */
q = single;
break;
}
if (c == '\n')
q = 1;
/*
* NOTE(review): sh_fmtq() relies on '=' not satisfying isaname(), in which case 'a' has
* just been cleared above and this comparison cannot succeed -- confirm reachability.
*/
else if (c == a)
{
/* write out the text through the '=' unquoted and restart the value after it */
stakwrite(bp, cp - bp);
bp = cp;
vp = cp + 1;
a = 0;
}
/* '#' or '~' in first position, ']', or a character with a non-zero ST_NORM lexer state other than S_EPAT (':' excepted) */
else if ((c == '#' || c == '~') && cp == vp || c == ']' || c != ':' && (c = sh_lexstates[ST_NORM][c]) && c != S_EPAT)
q = 1;
}
if (q & 2)
{
/* $'...' form */
stakputc('$');
stakputc('\'');
cp = bp;
n = fold - 3;
/* within this loop q is non-zero when the character is to be written after a backslash */
q = 1;
while (c = mbchar(cp))
{
switch (c)
{
/* the escape character: 033 where 'a' is 97, otherwise 39 */
case ('a'==97?'\033':39):
c = 'E';
break;
case '\n':
/* newlines are written raw and restart the countdown */
q = 0;
n = fold - 1;
break;
case '\r':
c = 'r';
break;
case '\t':
c = 't';
break;
case '\f':
c = 'f';
break;
case '\b':
c = 'b';
break;
case '\a':
c = 'a';
break;
case '\\':
/* a backslash followed by 'n' is written as a raw newline; the 'n' itself is not consumed here */
if (*cp == 'n')
{
c = '\n';
q = 0;
n = fold - 1;
break;
}
/* otherwise fall through: the backslash is written after a backslash */
case '\'':
break;
default:
#if SHOPT_MULTIBYTE
if(!iswprint(c))
#else
if(!isprint(c))
#endif
{
/* non-printable: write a three-digit octal escape, breaking the line first if the countdown has run out */
if ((n -= 4) <= 0)
{
stakwrite("'\\\n$'", 5);
n = fold - 7;
}
sfprintf(staksp, "\\%03o", c);
continue;
}
/* printable: written as is */
q = 0;
break;
}
/* charge the countdown for this character (plus one for a backslash) */
if ((n -= q + 1) <= 0)
{
if (!q)
{
/* countdown ran out on an unescaped character: close the quote and let the outer loop start a new chunk at bp */
stakputc('\'');
cp = bp;
break;
}
/* countdown ran out on an escaped character: break the line inside the quoted text */
stakwrite("'\\\n$'", 5);
n = fold - 5;
}
if (q)
stakputc('\\');
else
q = 1;
stakputc(c);
bp = cp;
}
/* end of string reached: close the quote */
if (!c)
stakputc('\'');
}
else if (q & 1)
{
/* '...' form */
stakputc('\'');
cp = bp;
n = fold ? (fold - 2) : 0;
while (c = mbchar(cp))
{
if (c == '\n')
n = fold - 1;
else if (n && --n <= 0)
{
/* countdown ran out: write the pending text up to one byte before cp, then close, break the line, reopen */
n = fold - 2;
stakwrite(bp, --cp - bp);
bp = cp;
stakwrite("'\\\n'", 4);
}
else if (n == 1 && *cp == '\'')
{
/* one left on the countdown and a single quote comes next: break the line now and write an escaped quote */
n = fold - 5;
stakwrite(bp, --cp - bp);
bp = cp;
stakwrite("'\\\n\\''", 6);
}
else if (c == '\'')
{
/* write the text before the quote (cp - bp - 1 leaves the quote itself out) */
stakwrite(bp, cp - bp - 1);
bp = cp;
/* then the quote as close-quote, backslash-quote, open-quote, with a line break if the countdown has run out */
if (n && (n -= 4) <= 0)
{
n = fold - 5;
stakwrite("'\\\n\\''", 6);
}
else
stakwrite("'\\''", 4);
}
}
/* cp is one past the terminating null; write what is left without it */
stakwrite(bp, cp - bp - 1);
stakputc('\'');
}
else if (n = fold)
{
/* no quoting needed: copy, inserting backslash-newline whenever the countdown runs out */
cp = bp;
while (c = mbchar(cp))
{
if (--n <= 0)
{
n = fold;
stakwrite(bp, --cp - bp);
bp = cp;
stakwrite("\\\n", 2);
}
}
stakwrite(bp, cp - bp - 1);
}
else
stakwrite(bp, cp - bp);
/* more input remains: join the next chunk with backslash-newline */
if (c)
{
stakputc('\\');
stakputc('\n');
}
} while (c);
stakputc(0);
return(stakptr(offset));
}
/*
 * Find a (possibly multibyte) character in a string.
 *
 * <string> is the string to search. <dp> points to the character to look
 * for; only the first character at <dp> is used.
 *
 * NOTE: Unlike strchr(3), the return value is an integer: the byte offset of
 * the start of the first occurrence within <string>, or -1 if not found.
 * As with strchr(3), searching for the null character matches the string
 * terminator, i.e. the offset returned is the length of <string>.
 * Both the multibyte and the single-byte code path follow this rule.
 */
int sh_strchr(const char *string, register const char *dp)
{
	const char *cp;
	if(mbwide())
	{
		const char *start;	/* first byte of the character just decoded */
		wchar_t c, d;
		mbinit();
		d = mbchar(dp);
		mbinit();
		cp = string;
		do
		{
			/* mbchar() advances cp past the character, so note where it began */
			start = cp;
			c = mbchar(cp);
			if(c==d)
				return(start-string);
		}
		while(c);
		return(-1);
	}
	cp = strchr(string,*dp);
	return(cp ? cp-string : -1);
}
/*
 * Pass <message> to ERROR_translate() together with the e_dict dictionary
 * and return its result.
 */
const char *_sh_translate(const char *message)
{
	const char *translated = ERROR_translate(0,0,e_dict,message);

	return(translated);
}
/*
 * change '['identifier']' to identifier
 * character before <str> must be a '['
 *
 * <str> points just past the '['. Nothing is changed unless <str> starts
 * with a letter, continues with name characters only, and is then followed
 * by ']' (which, when <last> is non-null, must end exactly at <last>).
 *
 * On a rewrite the identifier is moved one byte to the left, over the '['.
 * With a non-null <last>, <last>-2 is returned. With a null <last>, the text
 * after the ']' is moved two bytes to the left, the string is re-terminated,
 * and the address of the old terminating null byte is returned.
 * When nothing is changed, <last> is returned as is.
 */
char *sh_checkid(char *str, char *last)
{
	unsigned char	*scan = (unsigned char*)str;	/* read cursor, advanced by mbchar() */
	unsigned char	*mv = (unsigned char*)str;	/* byte currently being moved */
	int		ch;

	ch = mbchar(scan);
	if(isaletter(ch))
	{
		do
		{
			ch = mbchar(scan);
		}
		while(isaname(ch));
	}
	/* not of the form identifier']', or the ']' is not where the caller expects it */
	if(ch != ']' || (last && (char*)scan != last))
		return(last);
	/* eliminate [ : shift everything up to and including the ']' one byte left */
	for(; mv < scan; mv++)
		mv[-1] = *mv;
	if(last)
		return(last - 2);
	/* eliminate ] : shift the remainder two bytes left, over the moved ']' */
	for(; *mv; mv++)
		mv[-2] = *mv;
	mv[-2] = 0;
	return((char*)mv);
}