1
0
Fork 0
mirror of git://git.code.sf.net/p/cdesktopenv/code synced 2025-03-09 15:50:02 +00:00

Multibyte character handling overhaul; allow global disable

The SHOPT_MULTIBYTE compile-time option did not make much sense as
disabling it only disabled multibyte support for ksh/libshell, not
libast or libcmd built-in commands. This commit allows disabling
multibyte support for the entire codebase by defining the macro
AST_NOMULTIBYTE (e.g. via CCFLAGS). This slightly speeds up the
code and makes an optimised binary about 5% smaller.

src/lib/libast/include/ast.h:
- Add non-multibyte fallback versions of the multibyte macros that
  are used if AST_NOMULTIBYTE is defined. This should cause most
  multibyte handling to be automatically optimised out everywhere.
- Reformat the multibyte macros for legibility.
- Simplify mbchar() and mbsize() macros by defining them in
  terms of mbnchar() and mbnsize(), eliminating code duplication.
- Correct non-multibyte fallback of mbwidth(). For consistent
  behaviour, control characters and out-of-range values should
  return -1 as they do for UTF-8. The fallback is now the same as
  default_wcwidth() in src/lib/libast/comp/setlocale.c.

src/lib/libast/comp/setlocale.c:
- If AST_NOMULTIBYTE is defined, do not compile in the debug and
  UTF-8 locale conversion functions, including several large
  conversion tables. Define their fallback macros as 0 as these are
  used as function pointers.

src/cmd/ksh93/SHOPT.sh,
src/cmd/ksh93/Mamfile:
- Change the SHOPT_MULTIBYTE default to empty, indicating "probe".
- Synchronise SHOPT_MULTIBYTE with !AST_NOMULTIBYTE by default.

src/cmd/ksh93/include/defs.h:
- When SHOPT_MULTIBYTE is zero but AST_NOMULTIBYTE is undefined or
  zero, enable AST_NOMULTIBYTE here to use the ast.h non-multibyte
  fallbacks for ksh. When this is done, the effect is that
  multibyte is optimised out for ksh only, as before.
- Remove previous fallback for disabling multibyte (re: c2cb0eae).

src/cmd/ksh93/include/lexstates.h,
src/cmd/ksh93/sh/lex.c:
- Define SETLEN() macro to assign to LEN (i.e. _Fcin.fclen) for
  multibyte only and do not assign to it directly. With no
  SHOPT_MULTIBYTE, define that macro as empty. This allows removing
  multiple '#if SHOPT_MULTIBYTE' directives from lex.c, as that
  code will all be optimised out automatically if it's disabled.

src/cmd/ksh93/include/national.h,
src/cmd/ksh93/sh/string.c:
- Fix flagrantly incorrect non-multibyte fallback for sh_strchr().
  The latter returns an integer offset (-1 if not found), whereas
  strchr(3) returns a char pointer (NULL if not found). Incorporate
  the fallback into the function for correct handling instead of
  falling back to strchr(3) directly.

src/cmd/ksh93/sh/macro.c:
- lastchar() optimisation: avoid function call if SHOPT_MULTIBYTE
  is enabled but we're not actually in a multibyte locale.

src/cmd/ksh93/sh/name.c:
- Use ja_size() even with SHOPT_MULTIBYTE disabled (re: 2182ecfa).
  Though no regression tests failed, the non-multibyte fallback for
  typeset -L/-R/-Z length calculation was probably not quite
  correct as ja_size() does more. The ast.h change to mbwidth()
  ensures correct behaviour for non-multibyte locales.

src/cmd/ksh93/tests/shtests:
- Since its value in SHOPT.sh is now empty by default, add a quick
  feature test (for the length of the UTF-8 character 'é') to check
  if SHOPT_MULTIBYTE needs to be enabled for the regression tests.
This commit is contained in:
Martijn Dekker 2022-07-07 21:58:23 +02:00
parent 59e79dc026
commit 7c4418ccdc
16 changed files with 147 additions and 101 deletions

View file

@ -232,6 +232,8 @@ native_setlocale(int category, const char* locale)
#define DZ (DB-DX*DC+1) /* wchar_t embedded size bits */
#define DD 3 /* # mb delimiter chars <n...> */
#if !AST_NOMULTIBYTE
static unsigned char debug_order[] =
{
0, 1, 2, 3, 4, 5, 6, 7,
@ -490,6 +492,18 @@ debug_strcoll(const char* a, const char* b)
return strcmp(ab, bb);
}
#else
#define debug_mbtowc 0
#define debug_wctomb 0
#define debug_mblen 0
#define debug_wcwidth 0
#define debug_alpha 0
#define debug_strxfrm 0
#define debug_strcoll 0
#endif /* !AST_NOMULTIBYTE */
/*
* default locale
*/
@ -529,7 +543,7 @@ set_collate(Lc_category_t* cp)
* workaround the interesting SJIS that translates unshifted 7 bit ASCII!
*/
#if _hdr_wchar && _typ_mbstate_t && _lib_mbrtowc
#if _hdr_wchar && _typ_mbstate_t && _lib_mbrtowc && !AST_NOMULTIBYTE
#define mb_state_zero ((mbstate_t*)&ast.pad[sizeof(ast.pad)-2*sizeof(mbstate_t)])
#define mb_state ((mbstate_t*)&ast.pad[sizeof(ast.pad)-sizeof(mbstate_t)])
@ -547,6 +561,8 @@ sjis_mbtowc(register wchar_t* p, register const char* s, size_t n)
#endif
#if !AST_NOMULTIBYTE
static int
utf8_wctomb(char* u, wchar_t w)
{
@ -593,8 +609,6 @@ utf8_mbtowc(wchar_t* wp, const char* str, size_t n)
register int c;
register wchar_t w = 0;
if (!wp && !sp)
ast.mb_sync = 0; /* assume call from mbinit() macro: reset global multibyte sync state */
if (!sp || !n)
return 0;
if ((m = utf8tab[*sp]) > 0)
@ -2196,6 +2210,16 @@ utf8_alpha(wchar_t c)
return !!(utf8_wam[(c >> 3) & 0x1fff] & (1 << (c & 0x7)));
}
#else
#define utf8_wctomb 0
#define utf8_mbtowc 0
#define utf8_mblen 0
#define utf8_wcwidth 0
#define utf8_alpha 0
#endif /* !AST_NOMULTIBYTE */
#if !_hdr_wchar || !_lib_wctype || !_lib_iswctype
#undef iswalpha
#define iswalpha default_iswalpha

View file

@ -208,22 +208,46 @@ typedef struct
* multibyte macros
*/
#define mbmax() (ast.mb_cur_max)
#define mberr() (ast.tmp_int<0)
#if !AST_NOMULTIBYTE
#define mbcoll() (ast.mb_xfrm!=0)
#define mbwide() (mbmax()>1)
#define mbmax() ( ast.mb_cur_max )
#define mberr() ( ast.tmp_int < 0 )
#define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n)
#define mbchar(p) (mbwide()?((ast.tmp_int=(*ast.mb_towc)(&ast.tmp_wchar,(char*)(p),mbmax()))>0?((p+=ast.tmp_int),ast.tmp_wchar):(p+=ast.mb_sync+1,ast.tmp_int)):(*(unsigned char*)(p++)))
#define mbnchar(p,n) (mbwide()?((ast.tmp_int=(*ast.mb_towc)(&ast.tmp_wchar,(char*)(p),n))>0?((p+=ast.tmp_int),ast.tmp_wchar):(p+=ast.mb_sync+1,ast.tmp_int)):(*(unsigned char*)(p++)))
#define mbinit() (mbwide()?(*ast.mb_towc)((wchar_t*)0,(char*)0,mbmax()):0)
#define mbsize(p) (mbwide()?(*ast.mb_len)((char*)(p),mbmax()):((p),1))
#define mbnsize(p,n) (mbwide()?(*ast.mb_len)((char*)(p),n):((p),1))
#define mbconv(s,w) (ast.mb_conv?(*ast.mb_conv)(s,w):((*(s)=(w)),1))
#define mbwidth(w) (ast.mb_width?(*ast.mb_width)(w):1)
#define mbxfrm(t,f,n) (mbcoll()?(*ast.mb_xfrm)((char*)(t),(char*)(f),n):0)
#define mbalpha(w) (ast.mb_alpha?(*ast.mb_alpha)(w):isalpha((w)&0xff))
#define mbcoll() ( ast.mb_xfrm != 0 )
#define mbwide() ( mbmax() > 1 )
#define mb2wc(w,p,n) ( *ast.mb_towc)(&w, (char*)p, n )
#define mbchar(p) mbnchar(p, mbmax())
#define mbnchar(p,n) ( mbwide() ? ( (ast.tmp_int = (*ast.mb_towc)(&ast.tmp_wchar, (char*)(p), n)) > 0 ? \
( (p+=ast.tmp_int),ast.tmp_wchar) : (p+=ast.mb_sync+1,ast.tmp_int) ) : (*(unsigned char*)(p++)) )
#define mbinit() ( ast.mb_sync = 0 )
#define mbsize(p) mbnsize(p, mbmax())
#define mbnsize(p,n) ( mbwide() ? (*ast.mb_len)((char*)(p), n) : ((p), 1) )
#define mbconv(s,w) ( ast.mb_conv ? (*ast.mb_conv)(s,w) : ((*(s)=(w)), 1) )
#define mbwidth(w) ( ast.mb_width ? (*ast.mb_width)(w) : (w >= 0 && w <= 255 && !iscntrl(w) ? 1 : -1) )
#define mbxfrm(t,f,n) ( mbcoll() ? (*ast.mb_xfrm)((char*)(t), (char*)(f), n) : 0 )
#define mbalpha(w) ( ast.mb_alpha ? (*ast.mb_alpha)(w) : isalpha((w) & 0xff) )
#else
#define mbmax() 1
#define mberr() 0
#define mbcoll() 0
#define mbwide() 0
#define mb2wc(w,p,n) ( (w) = *(unsigned char*)(p), 1 )
#define mbchar(p) ( *(unsigned char*)(p++) )
#define mbnchar(p,n) mbchar(p)
#define mbinit() 0
#define mbsize(p) 1
#define mbnsize(p,n) 1
#define mbconv(s,w) ( (*(s)=(w)), 1 )
#define mbwidth(w) ( w >= 0 && w <= 255 && !iscntrl(w) ? 1 : -1 )
#define mbxfrm(t,f,n) 0
#define mbalpha(w) ( isalpha((w) & 0xff) )
#endif /* !AST_NOMULTIBYTE */
/*
* common macros