diff --git a/NEWS b/NEWS
index d39f632c3..608bc44d1 100644
--- a/NEWS
+++ b/NEWS
@@ -18,6 +18,10 @@ Any uppercase BUG_* names are modernish shell bug IDs.
 - Fixed a bug that corrupted KIA/CQL cross-reference databases created using
   ksh's -R option; shell warnings were wrongly included in the database file.
 
+- The shell's quoting algorithm (used in xtrace, printf %q, and more) has been
+  fixed for UTF-8 (Unicode) locales; it no longer needlessly and inconsistently
+  encodes normal printable UTF-8 characters into hexadecimal \u[xxxx] codes.
+
 2020-07-07:
 
 - Four of the date formats accepted by 'printf %()T' have had their
diff --git a/src/cmd/ksh93/sh/string.c b/src/cmd/ksh93/sh/string.c
index 5eb124b75..fd620a09e 100644
--- a/src/cmd/ksh93/sh/string.c
+++ b/src/cmd/ksh93/sh/string.c
@@ -325,6 +325,42 @@ static char *sh_fmtcsv(const char *string)
 	return(stakptr(offset));
 }
 
+#if SHOPT_MULTIBYTE
+/*
+ * Returns true if c is an invisible Unicode character, excluding ASCII space.
+ * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is
+ * generally not possible as the OS-provided iswgraph(3) doesn't support that
+ * locale. So do a quick test and do our best with a fallback if necessary.
+ *
+ * c is the character code to classify; sh_fmtq() feeds it values read with
+ * mbchar(). A negative c is not a character code: it is always reported as
+ * invisible and is never handed to isgraph(3)/iswgraph(3), which are only
+ * defined for valid character values and EOF/WEOF. The quick test probes
+ * U+5E38 (a CJK ideograph, must be graphic) and U+FEFF (must not be graphic).
+ */
+static int is_invisible(int c)
+{
+	if(c < 0)					/* not a character code at all */
+		return(1);				/* treat as invisible so it gets escaped */
+	if(!mbwide())					/* not in multibyte locale? */
+		return(c != ' ' && !isgraph(c));	/* use plain isgraph(3) */
+	if(iswgraph(0x5E38) && !iswgraph(0xFEFF))	/* can we use iswgraph(3)? */
+		return(c != ' ' && !iswgraph(c));	/* use iswgraph(3) */
+	/* fallback: built-in table of invisible code points */
+	return(	c <= 0x001F ||				/* control characters */
+		(c >= 0x007F && c <= 0x009F) ||		/* control characters */
+		c == 0x00A0 ||				/* non-breaking space */
+		c == 0x061C ||				/* arabic letter mark */
+		c == 0x1680 ||				/* ogham space mark */
+		c == 0x180E ||				/* mongolian vowel separator */
+		(c >= 0x2000 && c <= 0x200F) ||		/* spaces and format characters */
+		(c >= 0x2028 && c <= 0x202F) ||		/* separators and format characters */
+		(c >= 0x205F && c <= 0x206F) ||		/* various format characters */
+		c == 0x3000 ||				/* ideographic space */
+		c == 0xFEFF );				/* zero-width non-breaking space */
+}
+#endif /* SHOPT_MULTIBYTE */
+
 /*
  * print quoting chars so that it can be read by the shell
  * puts null terminated result on stack, but doesn't freeze it
@@ -363,7 +399,7 @@ char *sh_fmtq(const char *string)
 	for(;c;c= mbchar(cp))
 	{
 #if SHOPT_MULTIBYTE
-		if(c=='\'' || c>=128 || c<0 || !iswprint(c))
+		if(c=='\'' || is_invisible(c))
 #else
 		if(c=='\'' || !isprint(c))
 #endif /* SHOPT_MULTIBYTE */
@@ -426,7 +462,7 @@ char *sh_fmtq(const char *string)
 					cp = op+1;
 					isbyte = 1;
 				}
-				if(mbwide() && ((cp-op)>1))
+				if(mbwide() && is_invisible(c))
 				{
 					sfprintf(staksp,"\\u[%x]",c);
 					continue;
diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh
index 66d465e0a..34ef9c914 100755
--- a/src/cmd/ksh93/tests/builtins.sh
+++ b/src/cmd/ksh93/tests/builtins.sh
@@ -318,6 +318,24 @@ LC_CTYPE=POSIX true # on buggy ksh, a locale re-init via temp assignment res
 [[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \
 				"(expected $expect; got $actual)"
 
+# shell-quoting UTF-8 characters: check for unnecessary encoding
+case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
+( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
+	expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\''
+	actual=$(printf %q 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.')
+	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect="'正常終了 正常終了'"
+	actual=$(printf %q '正常終了 正常終了')
+	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect="'aeu aéu'"
+	actual=$(printf %q 'aeu aéu')
+	[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	;;
+esac
+
 # ======
 # we won't get hit by the one second boundary twice, right?
 expect= actual=