Fix shellquoting of invalid multibyte char (re: f9d28935, 8c7c60ec)

This commit fixes two bugs in the generation of $'...' shellquoted strings:

1. A bug introduced in f9d28935. In UTF-8 locales, a byte that is invalid in UTF-8, e.g. hex byte 86, would be shellquoted as \u[86], which is not the same as the correct quoting, \x86.

2. A bug inherited from 93u+. Single bytes (e.g. hex 11) were always quoted as \x11 and not \x[11], even if a subsequent character was a hexadecimal digit. However, the parser reads past two hexadecimal digits, so we got:

	$ printf '%q\n' $'\x[11]1'
	$'\x111'
	$ printf $'\x111' | od -t x1
	0000000 c4 91
	0000002

   After the bug fix, this works correctly:

	$ printf '%q\n' $'\x[11]1'
	$'\x[11]1'
	$ printf $'\x[11]1' | od -t x1
	0000000 11 31
	0000002

src/cmd/ksh93/sh/string.c: sh_fmtq():
- Make the multibyte code for $'...' more readable, eliminating the 'isbyte' flag.
- When in a multibyte locale, make sure to shellquote both invalid multibyte characters and unprintable ASCII characters as hexadecimal bytes (\xNN). This reinstates 93u+ behaviour.
- When quoting bytes, use isxdigit(3) to determine if the next character is a hex digit, and if so, protect the quoted byte with square brackets.

src/cmd/ksh93/tests/quoting2.sh:
- Move the 'printf %q' shellquoting regression tests here from builtins.sh; they test the shellquoting algorithm, not so much the printf builtin itself.
- Add regression tests for these bugs.
2025-03-09 15:50:02 +00:00 · 2020-08-05 18:22:22 +01:00 · 2020-08-05 18:22:22 +01:00 · ac8991e525
commit ac8991e525
parent e53177abca
4 changed files with 73 additions and 48 deletions
--- a/src/cmd/ksh93/tests/builtins.sh
+++ b/src/cmd/ksh93/tests/builtins.sh
@@ -303,39 +303,6 @@ if	[[ $(printf '%..*s\n' : abc def) != abc:def ]]
 then	err_exit "printf '%..*s' not working"
 fi

-# ======
-# shell-quoting using printf %q (same algorithm used for xtrace and output of 'set', 'trap', ...)
-
-[[ $(printf '%q\n') == '' ]] || err_exit 'printf "%q" with missing arguments'
-
-# the following fails on 2012-08-01 in UTF-8 locales
-expect="'shell-quoted string'"
-actual=$(
-	print -nr $'\303\274' | read -n1 foo  # interrupt processing of 2-byte UTF-8 char after reading 1 byte
-	printf '%q\n' "shell-quoted string"
-)
-LC_CTYPE=POSIX true	    # on buggy ksh, a locale re-init via temp assignment restores correct shellquoting
-[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \
-				"(expected $expect; got $actual)"
-
-# shell-quoting UTF-8 characters: check for unnecessary encoding
-case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
-( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
-	expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\''
-	actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.')
-	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \
-				"(expected $expect; got $actual)"
-	expect="'正常終了 正常終了'"
-	actual=$(printf %q '正常終了 正常終了')
-	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \
-				"(expected $expect; got $actual)"
-	expect="'aeu aéu'"
-	actual=$(printf %q 'aeu aéu')
-	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \
-				"(expected $expect; got $actual)"
-	;;
-esac
-
 # ======
 # we won't get hit by the one second boundary twice, right?
 expect= actual=