From c2cb0eae198f2c550147704abc5fe56b77b82473 Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Thu, 18 Feb 2021 14:46:10 +0000 Subject: [PATCH] Make 'read' compatible with Shift-JIS This commit fixes a bug in the 'read' built-in: it did not properly skip over multibyte characters. The bug never affects UTF-8 locales because all UTF-8 bytes have the high-order bit set. But Shift-JIS characters may include a byte corresponding to the ASCII backslash character, which cauased buggy behaviour when using 'read' without the '-r' option that disables backslash escape processing. It also makes the regression tests compatible with Shift-JIS locales. They failed with syntax errors. src/cmd/ksh93/bltins/read.c: - Use the multibyte macros when skipping over word characters. Based on a patch from the old ast-developers mailing list: https://www.mail-archive.com/ast-developers@lists.research.att.com/msg01848.html src/cmd/ksh93/include/defs.h: - Be a bit smarter about causing the compiler to optimise out multibyte code when SHOPT_MULTIBYTE is disabled. See the updated comment for details. src/cmd/ksh93/tests/locale.sh: - Put all the supported locales in an array for future tests. - Add test for the 'read' bug. Include it in a loop that tests 64 SHIFT-JIS character combinations. Only one fails on old ksh: the one where the final byte corresponds to the ASCII backslash. It doesn't hurt to test all the others anyway. src/cmd/ksh93/tests/basic.sh, src/cmd/ksh93/tests/builtins.sh, src/cmd/ksh93/tests/quoting2.sh: - Fix syntax errors that occurred in SHIFT-JIS locales as the parser was processing literal UTF-8 characters. Not executing that code is not enough; we need to make sure it never gets parsed as well. This is done by wrapping the commands containing literal UTF-8 strings in an 'eval' command as a single-quoted operand. .github/workflows/ci.yml: - Run the tests in the ja_JP.SJIS locale instead of ja_JP.UTF-8. UTF-8 is already covered by the nl_NL.UTF-8 test run; that should be good enough. --- .github/workflows/ci.yml | 4 ++-- NEWS | 5 +++++ src/cmd/ksh93/bltins/read.c | 11 ++++++++++- src/cmd/ksh93/include/defs.h | 11 ++++++++--- src/cmd/ksh93/include/version.h | 2 +- src/cmd/ksh93/tests/basic.sh | 19 +++++++++---------- src/cmd/ksh93/tests/builtins.sh | 7 ++++--- src/cmd/ksh93/tests/locale.sh | 24 +++++++++++++++++++----- src/cmd/ksh93/tests/quoting2.sh | 13 +++++++------ 9 files changed, 65 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f8b55d64..b053600e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,9 +20,9 @@ jobs: ulimit -n 1024 : default regression tests && script -q -e -c "bin/shtests" && - : regression tests with OS-provided UTF-8 locales && + : regression tests with OS-provided multibyte locales && LANG=nl_NL.UTF-8 script -q -e -c "bin/shtests --locale --nocompile" && - LANG=ja_JP.UTF-8 script -q -e -c "bin/shtests --locale --nocompile" && + LANG=ja_JP.SJIS script -q -e -c "bin/shtests --locale --nocompile" && : disable most SHOPTs, rebuild ksh && sed --regexp-extended --in-place=.orig \ '/^SHOPT (2DMATCH|AUDIT|BGX|BRACEPAT|DYNAMIC|EDPREDICT|ESH|FIXEDARRAY|HISTEXPAND|MULTIBYTE|NAMESPACE|OPTIMIZE|SUID_EXEC|STATS|VSH)=/ s/=1/=0/' \ diff --git a/NEWS b/NEWS index 0d9e57ad4..a6071529e 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,11 @@ For full details, see the git log at: https://github.com/ksh93/ksh Any uppercase BUG_* names are modernish shell bug IDs. +2021-02-18: + +- A bug was fixed in the 'read' builtin that caused it to fail to process + multibyte characters properly in Shift-JIS locales. + 2021-02-17: - Emacs mode fixes: diff --git a/src/cmd/ksh93/bltins/read.c b/src/cmd/ksh93/bltins/read.c index 76e7aff60..a2428c432 100644 --- a/src/cmd/ksh93/bltins/read.c +++ b/src/cmd/ksh93/bltins/read.c @@ -538,11 +538,13 @@ int sh_readline(register Shell_t *shp,char **names, volatile int fd, int flags,s c = S_NL; shp->nextprompt = 2; rel= staktell(); + mbinit(); /* val==0 at the start of a field */ val = 0; del = 0; while(1) { + ssize_t mbsz; switch(c) { #if SHOPT_MULTIBYTE @@ -679,11 +681,18 @@ int sh_readline(register Shell_t *shp,char **names, volatile int fd, int flags,s } /* skip over word characters */ wrd = -1; + /* skip a preceding multibyte character, if any */ + if(c == 0 && (mbsz = mbsize(cp-1)) > 1) + cp += mbsz - 1; while(1) { - while((c=shp->ifstable[*cp++])==0) + while((c = shp->ifstable[*cp]) == 0) + { + cp += (mbsz = mbsize(cp)) > 1 ? mbsz : 1; /* treat invalid char as 1 byte */ if(!wrd) wrd = 1; + } + cp++; if(inquote) { if(c==S_QUOTE) diff --git a/src/cmd/ksh93/include/defs.h b/src/cmd/ksh93/include/defs.h index ea4479d56..2a16e6a52 100644 --- a/src/cmd/ksh93/include/defs.h +++ b/src/cmd/ksh93/include/defs.h @@ -33,9 +33,14 @@ #error libast version 20111111 or later is required #endif #if !SHOPT_MULTIBYTE - /* disable multibyte without need for further '#if SHOPT_MULTIBYTE' */ -# undef mbwide -# define mbwide() 0 + /* + * Disable multibyte without need for excessive '#if SHOPT_MULTIBYTE' peprocessor conditionals. + * If we redefine the maximum character size mbmax() as 1 byte, the mbwide() macro will always + * evaluate to 0. All the other multibyte macros have multibtye code conditional upon mbwide(), + * so the compiler should optimize all of that code away. See src/lib/libast/include/ast.h + */ +# undef mbmax +# define mbmax() 1 #endif /* diff --git a/src/cmd/ksh93/include/version.h b/src/cmd/ksh93/include/version.h index 14afcfb78..d653eab5a 100644 --- a/src/cmd/ksh93/include/version.h +++ b/src/cmd/ksh93/include/version.h @@ -20,7 +20,7 @@ #define SH_RELEASE_FORK "93u+m" /* only change if you develop a new ksh93 fork */ #define SH_RELEASE_SVER "1.0.0-alpha" /* semantic version number: https://semver.org */ -#define SH_RELEASE_DATE "2021-02-17" /* must be in this format for $((.sh.version)) */ +#define SH_RELEASE_DATE "2021-02-18" /* must be in this format for $((.sh.version)) */ #define SH_RELEASE_CPYR "(c) 2020-2021 Contributors to ksh " SH_RELEASE_FORK /* Scripts sometimes field-split ${.sh.version}, so don't change amount of whitespace. */ diff --git a/src/cmd/ksh93/tests/basic.sh b/src/cmd/ksh93/tests/basic.sh index 67e064b37..caccf3d04 100755 --- a/src/cmd/ksh93/tests/basic.sh +++ b/src/cmd/ksh93/tests/basic.sh @@ -603,16 +603,15 @@ eu=$( # ====== # Expansion of multibyte characters after expansion of single-character names $1..$9, $?, $!, $-, etc. -function exptest -{ - print -r "$1テスト" - print -r "$?テスト" - print -r "$#テスト" -} -expect=$'fooテスト\n0テスト\n1テスト' -actual=$(exptest foo) -[[ $actual == "$expect" ]] || err_exit 'Corruption of multibyte char following expansion of single-char name' \ - "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" +case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in +( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) + eval 'function exptest { print -r "$1テスト"; print -r "$?テスト" ; print -r "$#テスト"; }' + eval 'expect=$'\''fooテスト\n0テスト\n1テスト'\' + actual=$(exptest foo) + [[ $actual == "$expect" ]] || err_exit 'Corruption of multibyte char following expansion of single-char name' \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" + ;; +esac # ====== # ksh didn't rewrite argv correctly (rhbz#1047506) diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh index 3e357696a..911253178 100755 --- a/src/cmd/ksh93/tests/builtins.sh +++ b/src/cmd/ksh93/tests/builtins.sh @@ -295,8 +295,9 @@ actual=$(printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc') case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in ( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) # HTML encoding UTF-8 characters - expect='正常終了 正常終了' - actual=$(printf %H '正常終了 正常終了') + # (UTF-8 literal characters wrapped in 'eval' to avoid syntax error on ja_JP.SJIS) + eval 'expect='\''正常終了 正常終了'\' + eval 'actual=$(printf %H '\''正常終了 正常終了'\'')' [[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \ "(expected $expect; got $actual)" expect='w?h?á?t??' @@ -313,7 +314,7 @@ case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in [[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \ "(expected $expect; got $actual)" expect='%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86%20%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86' - actual=$(printf %#H '正常終了 正常終了') + eval 'actual=$(printf %#H '\''正常終了 正常終了'\'')' [[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \ "(expected $expect; got $actual)" expect='%C2%AB%20l%E2%80%99ab%C3%AEme%20de%20mon%C2%A0m%C3%A9tier%E2%80%A6%20%C2%BB' diff --git a/src/cmd/ksh93/tests/locale.sh b/src/cmd/ksh93/tests/locale.sh index 85b4cc374..3310e118e 100755 --- a/src/cmd/ksh93/tests/locale.sh +++ b/src/cmd/ksh93/tests/locale.sh @@ -30,7 +30,11 @@ integer Errors=0 [[ -d $tmp && -w $tmp && $tmp == "$PWD" ]] || { err\_exit "$LINENO" '$tmp not set; run this from shtests. Aborting.'; exit 1; } -unset LANG ${!LC_*} +unset LANG LANGUAGE "${!LC_@}" + +IFS=$'\n'; set -o noglob +typeset -a locales=( $(command -p locale -a 2>/dev/null) ) +IFS=$' \t\n' a=$($SHELL -c '/' 2>&1 | sed -e "s,.*: *,," -e "s, *\[.*,,") b=$($SHELL -c '(LC_ALL=debug / 2>/dev/null); /' 2>&1 | sed -e "s,.*: *,," -e "s, *\[.*,,") @@ -38,13 +42,13 @@ b=$($SHELL -c '(LC_ALL=debug / 2>/dev/null); /' 2>&1 | sed -e "s,.*: *,," -e "s, b=$($SHELL -c '(LC_ALL=debug; / 2>/dev/null); /' 2>&1 | sed -e "s,.*: *,," -e "s, *\[.*,,") [[ "$b" == "$a" ]] || err_exit "locale not restored after subshell -- expected '$a', got '$b'" -if((SHOPT_MULTIBYTE)); then # test shift-jis \x81\x40 ... \x81\x7E encodings # (shift char followed by 7 bit ascii) typeset -i16 chr -for locale in $(command -p locale -a 2>/dev/null | grep -i jis) -do export LC_ALL=$locale +((SHOPT_MULTIBYTE)) && for locale in "${locales[@]}" +do [[ $locale == *[Jj][Ii][Ss] ]] || continue + export LC_ALL=$locale for ((chr=0x40; chr<=0x7E; chr++)) do c=${chr#16#} for s in \\x81\\x$c \\x$c @@ -55,9 +59,18 @@ do export LC_ALL=$locale q=$(print -- "$b") [[ $u == "$q" ]] || err_exit "LC_ALL=$locale quoted print difference for \"$s\" -- $b => '$u' vs \"$b\" => '$q'" done + + # https://www.mail-archive.com/ast-developers@lists.research.att.com/msg01848.html + # In Shift_JIS (unlike in UTF-8), the trailing bytes of a multibyte character may + # contain a byte without the high-order bit set. If the final byte happened to + # correspond to an ASCII backslash (\x5C), 'read' would incorrectly treat this as a + # dangling final backslash (which is invalid) and return a nonzero exit status. + # Note that the byte sequence '\x95\x5c' represents a multibyte character U+8868, + # whereas '\x5c' is a backslash when interpreted as a single-byte character. + printf "\x95\x$c\n" | read x || err_exit "'read' doesn't skip multibyte input correctly ($LC_ALL, \x95\x$c)" done done -fi # SHOPT_MULTIBYTE +unset LC_ALL # this locale is supported by ast on all platforms # EU for { decimal_point="," thousands_sep="." } @@ -352,5 +365,6 @@ then LC_ALL=en_US.UTF-8 fi +# ====== exit $((Errors<125?Errors:125)) diff --git a/src/cmd/ksh93/tests/quoting2.sh b/src/cmd/ksh93/tests/quoting2.sh index 8f11ea490..4572d786d 100755 --- a/src/cmd/ksh93/tests/quoting2.sh +++ b/src/cmd/ksh93/tests/quoting2.sh @@ -233,16 +233,17 @@ LC_CTYPE=POSIX true # on buggy ksh, a locale re-init via temp assignment res # shell-quoting UTF-8 characters: check for unnecessary encoding case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in ( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) - expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\'' - actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.') + # must wrap literal UTF-8 characters in 'eval' to avoid syntax error in ja_JP.SJIS + eval 'expect=$'\''$\'\''عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\'\'''\' + eval 'actual=$(printf %q '\''عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.'\'')' [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \ "(expected $expect; got $actual)" - expect="'正常終了 正常終了'" - actual=$(printf %q '正常終了 正常終了') + eval 'expect="'\''正常終了 正常終了'\''"' + eval 'actual=$(printf %q '\''正常終了 正常終了'\'')' [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \ "(expected $expect; got $actual)" - expect="'aeu aéu'" - actual=$(printf %q 'aeu aéu') + eval 'expect="'\''aeu aéu'\''"' + eval 'actual=$(printf %q '\''aeu aéu'\'')' [[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \ "(expected $expect; got $actual)" expect=$'$\'\\x86\\u[86]\\xf0\\x96v\\xa7\\xb5\''