From e01801572da15f81d05e47883001f9a31b23f695 Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Tue, 11 Aug 2020 07:08:44 +0100 Subject: [PATCH] printf %H: fix/reduce encoding into entities (re: 8477d2ce) The &nbsp; entity is not valid in XML, only in HTML. Since we must be compatible with both, it can't be used. Thanks to Andras Farkas for the bug report. In addition, the generation of numeric entities for unprintable characters was only valid while processing UTF-8 text while in a UTF-8 locale. In all other conditions it produced invalid results. This is not worth trying to fix. Discussion: https://groups.google.com/d/msgid/korn-shell/CAA0nTRta%3DPbOYduyBv%3DXCzumTcUCU8Lki%3DQQf2O8Erk2BFvO1g%40mail.gmail.com src/cmd/ksh93/bltins/print.c: - Remove conversion to &nbsp; entity. - Remove conversion of non-graph characters to numeric entities. Convert only the 5 semantically meaningful characters: < > & " ' src/cmd/ksh93/include/defs.h, src/cmd/ksh93/sh/string.c: - We don't need sh_isprint() in print.c anymore, so turn it back into a static function. src/cmd/ksh93/tests/builtins.sh: - Update and trim regression tests. --- src/cmd/ksh93/bltins/print.c | 8 ++------ src/cmd/ksh93/include/defs.h | 6 ------ src/cmd/ksh93/sh/string.c | 6 +----- src/cmd/ksh93/tests/builtins.sh | 18 +++--------------- 4 files changed, 6 insertions(+), 32 deletions(-) diff --git a/src/cmd/ksh93/bltins/print.c b/src/cmd/ksh93/bltins/print.c index 6bcbb3f05..84b07325c 100644 --- a/src/cmd/ksh93/bltins/print.c +++ b/src/cmd/ksh93/bltins/print.c @@ -486,7 +486,7 @@ static char *fmthtml(const char *string, int flags) mbinit(); if(!(flags&SFFMT_ALTER)) { - /* Encode for HTML, for inside and outside single- and double-quoted strings. */ + /* Encode for HTML and XML, for main text and single- and double-quoted attributes. 
*/ while(op = cp, c = mbchar(cp)) { if(!mbwide()) @@ -501,12 +501,8 @@ static char *fmthtml(const char *string, int flags) stakputs("&amp;"); else if(c == 34) /* " */ stakputs("&quot;"); - else if(c == 39) /* ' (&apos; is nonstandard) */ + else if(c == 39) /* ' (&apos; is not HTML) */ stakputs("&#39;"); - else if(c == 160 && mbwide()) /* non-breaking space */ - stakputs("&nbsp;"); - else if(!sh_isprint(c) && c!='\n' && c!='\r') - sfprintf(stkstd, "&#%d;", c); else stakwrite(op, cp-op); } diff --git a/src/cmd/ksh93/include/defs.h b/src/cmd/ksh93/include/defs.h index 557cb08e8..0f7651975 100644 --- a/src/cmd/ksh93/include/defs.h +++ b/src/cmd/ksh93/include/defs.h @@ -446,12 +446,6 @@ extern int sh_whence(char**,int); extern Namval_t *sh_fsearch(Shell_t*,const char *,int); #endif /* SHOPT_NAMESPACE */ -#if SHOPT_MULTIBYTE - extern int sh_isprint(int); -#else -# define sh_isprint(c) isprint(c) -#endif /* SHOPT_MULTIBYTE */ - #define URI_RFC3986_UNRESERVED "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~" #ifndef ERROR_dictionary diff --git a/src/cmd/ksh93/sh/string.c b/src/cmd/ksh93/sh/string.c index 8b43961dd..41d7a3c3c 100644 --- a/src/cmd/ksh93/sh/string.c +++ b/src/cmd/ksh93/sh/string.c @@ -327,16 +327,13 @@ static char *sh_fmtcsv(const char *string) return(stakptr(offset)); } -#if SHOPT_MULTIBYTE /* - * Note: without SHOPT_MULTIBYTE, defs.h makes this an alias of isprint(3). - * * Returns false if c is an invisible Unicode character, excluding ASCII space. * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is * generally not possible as the OS-provided iswgraph(3) doesn't support that * locale. So do a quick test and do our best with a fallback if necessary. */ -int sh_isprint(int c) +static int sh_isprint(int c) { if(!mbwide()) /* not in multibyte locale? 
*/ return(isprint(c)); /* use plain isprint(3) */ @@ -355,7 +352,6 @@ int sh_isprint(int c) c == 0x3000 || /* ideographic space */ c == 0xFEFF)); /* zero-width non-breaking space */ } -#endif /* SHOPT_MULTIBYTE */ /* * print quoting chars so that it can be read by the shell diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh index 1b6bfa0ba..07ad5d6fe 100755 --- a/src/cmd/ksh93/tests/builtins.sh +++ b/src/cmd/ksh93/tests/builtins.sh @@ -276,7 +276,7 @@ if [[ $(getopts $'[+?X\ffoobar\fX]' v --man 2>&1) != *'Xhello world'X* ]] then err_exit '\f...\f not working in getopts usage strings' fi -expect='<>"& ' abc' +expect=$'<>"& '\tabc' actual=$(printf '%H\n' $'<>"& \'\tabc') [[ $expect == "$actual" ]] || err_exit 'printf %H not working' \ "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" @@ -295,24 +295,12 @@ actual=$(printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc') case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in ( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) # HTML encoding UTF-8 characters - expect='what?' - actual=$(printf %H 'what?') - [[ $actual == "$expect" ]] || err_exit 'printf %H: ASCII characters' \ - "(expected $expect; got $actual)" - expect='عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.' - actual=$(printf %H 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.') - [[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \ - "(expected $expect; got $actual)" expect='正常終了 正常終了' actual=$(printf %H '正常終了 正常終了') [[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \ "(expected $expect; got $actual)" - expect='« l’abîme de mon métier… »' - actual=$(printf %H '« l’abîme de mon métier… »') - [[ $actual == "$expect" ]] || err_exit 'printf %H: Latin UTF-8 characters' \ - "(expected $expect; got $actual)" - expect='?†???' - actual=$(printf %H $'\x86\u86\xF0\x96\x76\xA7\xB5') + expect='w?h?á?t??' 
+ actual=$(printf %H $'w\x80h\x81\uE1\x82t\x83?') [[ $actual == "$expect" ]] || err_exit 'printf %H: invalid UTF-8 characters' \ "(expected $expect; got $actual)" # URL/URI encoding of UTF-8 characters