From c2cb0eae198f2c550147704abc5fe56b77b82473 Mon Sep 17 00:00:00 2001
From: Martijn Dekker <martijn@inlv.org>
Date: Thu, 18 Feb 2021 14:46:10 +0000
Subject: [PATCH] Make 'read' compatible with Shift-JIS

This commit fixes a bug in the 'read' built-in: it did not properly
skip over multibyte characters. The bug never affects UTF-8 locales
because all UTF-8 bytes have the high-order bit set. But Shift-JIS
characters may include a byte corresponding to the ASCII backslash
character, which cauased buggy behaviour when using 'read' without
the '-r' option that disables backslash escape processing.

It also makes the regression tests compatible with Shift-JIS
locales. They failed with syntax errors.

src/cmd/ksh93/bltins/read.c:
- Use the multibyte macros when skipping over word characters.
  Based on a patch from the old ast-developers mailing list:
  https://www.mail-archive.com/ast-developers@lists.research.att.com/msg01848.html

src/cmd/ksh93/include/defs.h:
- Be a bit smarter about causing the compiler to optimise out
  multibyte code when SHOPT_MULTIBYTE is disabled. See the updated
  comment for details.

src/cmd/ksh93/tests/locale.sh:
- Put all the supported locales in an array for future tests.
- Add test for the 'read' bug. Include it in a loop that tests
  64 SHIFT-JIS character combinations. Only one fails on old ksh:
  the one where the final byte corresponds to the ASCII backslash.
  It doesn't hurt to test all the others anyway.

src/cmd/ksh93/tests/basic.sh,
src/cmd/ksh93/tests/builtins.sh,
src/cmd/ksh93/tests/quoting2.sh:
- Fix syntax errors that occurred in SHIFT-JIS locales as the
  parser was processing literal UTF-8 characters. Not executing
  that code is not enough; we need to make sure it never gets
  parsed as well. This is done by wrapping the commands containing
  literal UTF-8 strings in an 'eval' command as a single-quoted
  operand.

.github/workflows/ci.yml:
- Run the tests in the ja_JP.SJIS locale instead of ja_JP.UTF-8.
  UTF-8 is already covered by the nl_NL.UTF-8 test run; that should
  be good enough.
---
 .github/workflows/ci.yml        |  4 ++--
 NEWS                            |  5 +++++
 src/cmd/ksh93/bltins/read.c     | 11 ++++++++++-
 src/cmd/ksh93/include/defs.h    | 11 ++++++++---
 src/cmd/ksh93/include/version.h |  2 +-
 src/cmd/ksh93/tests/basic.sh    | 19 +++++++++----------
 src/cmd/ksh93/tests/builtins.sh |  7 ++++---
 src/cmd/ksh93/tests/locale.sh   | 24 +++++++++++++++++++-----
 src/cmd/ksh93/tests/quoting2.sh | 13 +++++++------
 9 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 7f8b55d64..b053600e4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -20,9 +20,9 @@ jobs:
         ulimit -n 1024
         : default regression tests &&
         script -q -e -c "bin/shtests" &&
-        : regression tests with OS-provided UTF-8 locales &&
+        : regression tests with OS-provided multibyte locales &&
         LANG=nl_NL.UTF-8 script -q -e -c "bin/shtests --locale --nocompile" &&
-        LANG=ja_JP.UTF-8 script -q -e -c "bin/shtests --locale --nocompile" &&
+        LANG=ja_JP.SJIS  script -q -e -c "bin/shtests --locale --nocompile" &&
         : disable most SHOPTs, rebuild ksh &&
         sed --regexp-extended --in-place=.orig \
           '/^SHOPT (2DMATCH|AUDIT|BGX|BRACEPAT|DYNAMIC|EDPREDICT|ESH|FIXEDARRAY|HISTEXPAND|MULTIBYTE|NAMESPACE|OPTIMIZE|SUID_EXEC|STATS|VSH)=/ s/=1/=0/' \
diff --git a/NEWS b/NEWS
index 0d9e57ad4..a6071529e 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,11 @@ For full details, see the git log at: https://github.com/ksh93/ksh
 
 Any uppercase BUG_* names are modernish shell bug IDs.
 
+2021-02-18:
+
+- A bug was fixed in the 'read' builtin that caused it to fail to process
+  multibyte characters properly in Shift-JIS locales.
+
 2021-02-17:
 
 - Emacs mode fixes:
diff --git a/src/cmd/ksh93/bltins/read.c b/src/cmd/ksh93/bltins/read.c
index 76e7aff60..a2428c432 100644
--- a/src/cmd/ksh93/bltins/read.c
+++ b/src/cmd/ksh93/bltins/read.c
@@ -538,11 +538,13 @@ int sh_readline(register Shell_t *shp,char **names, volatile int fd, int flags,s
 		c = S_NL;
 	shp->nextprompt = 2;
 	rel= staktell();
+	mbinit();
 	/* val==0 at the start of a field */
 	val = 0;
 	del = 0;
 	while(1)
 	{
+		ssize_t mbsz;
 		switch(c)
 		{
 #if SHOPT_MULTIBYTE
@@ -679,11 +681,18 @@ int sh_readline(register Shell_t *shp,char **names, volatile int fd, int flags,s
 			}
 			/* skip over word characters */
 			wrd = -1;
+			/* skip a preceding multibyte character, if any */
+			if(c == 0 && (mbsz = mbsize(cp-1)) > 1)
+				cp += mbsz - 1;
 			while(1)
 			{
-				while((c=shp->ifstable[*cp++])==0)
+				while((c = shp->ifstable[*cp]) == 0)
+				{
+					cp += (mbsz = mbsize(cp)) > 1 ? mbsz : 1;	/* treat invalid char as 1 byte */
 					if(!wrd)
 						wrd = 1;
+				}
+				cp++;
 				if(inquote)
 				{
 					if(c==S_QUOTE)
diff --git a/src/cmd/ksh93/include/defs.h b/src/cmd/ksh93/include/defs.h
index ea4479d56..2a16e6a52 100644
--- a/src/cmd/ksh93/include/defs.h
+++ b/src/cmd/ksh93/include/defs.h
@@ -33,9 +33,14 @@
 #error libast version 20111111 or later is required
 #endif
 #if !SHOPT_MULTIBYTE
-    /* disable multibyte without need for further '#if SHOPT_MULTIBYTE' */
-#   undef mbwide
-#   define mbwide()	0
+    /*
+     * Disable multibyte without need for excessive '#if SHOPT_MULTIBYTE' peprocessor conditionals.
+     * If we redefine the maximum character size mbmax() as 1 byte, the mbwide() macro will always
+     * evaluate to 0. All the other multibyte macros have multibtye code conditional upon mbwide(),
+     * so the compiler should optimize all of that code away. See src/lib/libast/include/ast.h
+     */
+#   undef mbmax
+#   define mbmax()	1
 #endif
 
 /*
diff --git a/src/cmd/ksh93/include/version.h b/src/cmd/ksh93/include/version.h
index 14afcfb78..d653eab5a 100644
--- a/src/cmd/ksh93/include/version.h
+++ b/src/cmd/ksh93/include/version.h
@@ -20,7 +20,7 @@
 
 #define SH_RELEASE_FORK	"93u+m"		/* only change if you develop a new ksh93 fork */
 #define SH_RELEASE_SVER	"1.0.0-alpha"	/* semantic version number: https://semver.org */
-#define SH_RELEASE_DATE	"2021-02-17"	/* must be in this format for $((.sh.version)) */
+#define SH_RELEASE_DATE	"2021-02-18"	/* must be in this format for $((.sh.version)) */
 #define SH_RELEASE_CPYR	"(c) 2020-2021 Contributors to ksh " SH_RELEASE_FORK
 
 /* Scripts sometimes field-split ${.sh.version}, so don't change amount of whitespace. */
diff --git a/src/cmd/ksh93/tests/basic.sh b/src/cmd/ksh93/tests/basic.sh
index 67e064b37..caccf3d04 100755
--- a/src/cmd/ksh93/tests/basic.sh
+++ b/src/cmd/ksh93/tests/basic.sh
@@ -603,16 +603,15 @@ eu=$(
 
 # ======
 # Expansion of multibyte characters after expansion of single-character names $1..$9, $?, $!, $-, etc.
-function exptest
-{
-	print -r "$1テスト"
-	print -r "$?テスト"
-	print -r "$#テスト"
-}
-expect=$'fooテスト\n0テスト\n1テスト'
-actual=$(exptest foo)
-[[ $actual == "$expect" ]] || err_exit 'Corruption of multibyte char following expansion of single-char name' \
-	"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
+case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
+( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
+	eval 'function exptest { print -r "$1テスト"; print -r "$?テスト" ; print -r "$#テスト"; }'
+	eval 'expect=$'\''fooテスト\n0テスト\n1テスト'\'
+	actual=$(exptest foo)
+	[[ $actual == "$expect" ]] || err_exit 'Corruption of multibyte char following expansion of single-char name' \
+		"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
+	;;
+esac
 
 # ======
 # ksh didn't rewrite argv correctly (rhbz#1047506)
diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh
index 3e357696a..911253178 100755
--- a/src/cmd/ksh93/tests/builtins.sh
+++ b/src/cmd/ksh93/tests/builtins.sh
@@ -295,8 +295,9 @@ actual=$(printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc')
 case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
 ( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
 	# HTML encoding UTF-8 characters
-	expect='正常終了 正常終了'
-	actual=$(printf %H '正常終了 正常終了')
+	# (UTF-8 literal characters wrapped in 'eval' to avoid syntax error on ja_JP.SJIS)
+	eval 'expect='\''正常終了 正常終了'\'
+	eval 'actual=$(printf %H '\''正常終了 正常終了'\'')'
 	[[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \
 				"(expected $expect; got $actual)"
 	expect='w?h?á?t??'
@@ -313,7 +314,7 @@ case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
 	[[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \
 				"(expected $expect; got $actual)"
 	expect='%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86%20%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86'
-	actual=$(printf %#H '正常終了 正常終了')
+	eval 'actual=$(printf %#H '\''正常終了 正常終了'\'')'
 	[[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \
 				"(expected $expect; got $actual)"
 	expect='%C2%AB%20l%E2%80%99ab%C3%AEme%20de%20mon%C2%A0m%C3%A9tier%E2%80%A6%20%C2%BB'
diff --git a/src/cmd/ksh93/tests/locale.sh b/src/cmd/ksh93/tests/locale.sh
index 85b4cc374..3310e118e 100755
--- a/src/cmd/ksh93/tests/locale.sh
+++ b/src/cmd/ksh93/tests/locale.sh
@@ -30,7 +30,11 @@ integer Errors=0
 
 [[ -d $tmp && -w $tmp && $tmp == "$PWD" ]] || { err\_exit "$LINENO" '$tmp not set; run this from shtests. Aborting.'; exit 1; }
 
-unset LANG ${!LC_*}
+unset LANG LANGUAGE "${!LC_@}"
+
+IFS=$'\n'; set -o noglob
+typeset -a locales=( $(command -p locale -a 2>/dev/null) )
+IFS=$' \t\n'
 
 a=$($SHELL -c '/' 2>&1 | sed -e "s,.*: *,," -e "s, *\[.*,,")
 b=$($SHELL -c '(LC_ALL=debug / 2>/dev/null); /' 2>&1 | sed -e "s,.*: *,," -e "s, *\[.*,,")
@@ -38,13 +42,13 @@ b=$($SHELL -c '(LC_ALL=debug / 2>/dev/null); /' 2>&1 | sed -e "s,.*: *,," -e "s,
 b=$($SHELL -c '(LC_ALL=debug; / 2>/dev/null); /' 2>&1 | sed -e "s,.*: *,," -e "s, *\[.*,,")
 [[ "$b" == "$a" ]] || err_exit "locale not restored after subshell -- expected '$a', got '$b'"
 
-if((SHOPT_MULTIBYTE)); then
 # test shift-jis \x81\x40 ... \x81\x7E encodings
 # (shift char followed by 7 bit ascii)
 
 typeset -i16 chr
-for locale in $(command -p locale -a 2>/dev/null | grep -i jis)
-do	export LC_ALL=$locale
+((SHOPT_MULTIBYTE)) && for locale in "${locales[@]}"
+do	[[ $locale == *[Jj][Ii][Ss] ]] || continue
+	export LC_ALL=$locale
 	for ((chr=0x40; chr<=0x7E; chr++))
 	do	c=${chr#16#}
 		for s in \\x81\\x$c \\x$c
@@ -55,9 +59,18 @@ do	export LC_ALL=$locale
 			q=$(print -- "$b")
 			[[ $u == "$q" ]] || err_exit "LC_ALL=$locale quoted print difference for \"$s\" -- $b => '$u' vs \"$b\" => '$q'"
 		done
+
+		# https://www.mail-archive.com/ast-developers@lists.research.att.com/msg01848.html
+		# In Shift_JIS (unlike in UTF-8), the trailing bytes of a multibyte character may
+		# contain a byte without the high-order bit set. If the final byte happened to
+		# correspond to an ASCII backslash (\x5C), 'read' would incorrectly treat this as a
+		# dangling final backslash (which is invalid) and return a nonzero exit status.
+		# Note that the byte sequence '\x95\x5c' represents a multibyte character U+8868,
+		# whereas '\x5c' is a backslash when interpreted as a single-byte character.
+		printf "\x95\x$c\n" | read x || err_exit "'read' doesn't skip multibyte input correctly ($LC_ALL, \x95\x$c)"
 	done
 done
-fi # SHOPT_MULTIBYTE
+unset LC_ALL
 
 # this locale is supported by ast on all platforms
 # EU for { decimal_point="," thousands_sep="." }
@@ -352,5 +365,6 @@ then	LC_ALL=en_US.UTF-8
 	
 fi
 
+# ======
 exit $((Errors<125?Errors:125))
 
diff --git a/src/cmd/ksh93/tests/quoting2.sh b/src/cmd/ksh93/tests/quoting2.sh
index 8f11ea490..4572d786d 100755
--- a/src/cmd/ksh93/tests/quoting2.sh
+++ b/src/cmd/ksh93/tests/quoting2.sh
@@ -233,16 +233,17 @@ LC_CTYPE=POSIX true	    # on buggy ksh, a locale re-init via temp assignment res
 # shell-quoting UTF-8 characters: check for unnecessary encoding
 case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
 ( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
-	expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\''
-	actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.')
+	# must wrap literal UTF-8 characters in 'eval' to avoid syntax error in ja_JP.SJIS
+	eval 'expect=$'\''$\'\''عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\'\'''\'
+	eval 'actual=$(printf %q '\''عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.'\'')'
 	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \
 				"(expected $expect; got $actual)"
-	expect="'正常終了 正常終了'"
-	actual=$(printf %q '正常終了 正常終了')
+	eval 'expect="'\''正常終了 正常終了'\''"'
+	eval 'actual=$(printf %q '\''正常終了 正常終了'\'')'
 	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \
 				"(expected $expect; got $actual)"
-	expect="'aeu aéu'"
-	actual=$(printf %q 'aeu aéu')
+	eval 'expect="'\''aeu aéu'\''"'
+	eval 'actual=$(printf %q '\''aeu aéu'\'')'
 	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \
 				"(expected $expect; got $actual)"
 	expect=$'$\'\\x86\\u[86]\\xf0\\x96v\\xa7\\xb5\''