diff --git a/NEWS b/NEWS index 0d35bb77e..a2e8dffd6 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,20 @@ For full details, see the git log at: https://github.com/ksh93/ksh Any uppercase BUG_* names are modernish shell bug IDs. +2020-08-10: + +- A number of fixes have been applied to the printf formatting directives + %H and %#H (as well as the undocumented equivalents %(html)q and %(url)q): + 1. Both formatters have been made multibyte/UTF-8 aware, and no longer + delete multibyte characters. Invalid UTF-8 byte sequences are rendered + as ASCII question marks. + 2. %H no longer wrongly changes spaces to non-breaking spaces (&nbsp;). + 3. %H now converts the single quote (') to '&#39;' instead of '&apos;' + which is not a valid entity in all HTML versions. + 4. %#H failed to encode some reserved characters (e.g. '?') while encoding + some unreserved ones (e.g. '~'). It now percent-encodes all characters + except those 'unreserved' as per RFC3986 (ASCII alphanumeric plus -._~). + 2020-08-09: - File name generation (a.k.a. pathname expansion, a.k.a. globbing) now diff --git a/src/cmd/ksh93/bltins/print.c b/src/cmd/ksh93/bltins/print.c index d00aca628..6bcbb3f05 100644 --- a/src/cmd/ksh93/bltins/print.c +++ b/src/cmd/ksh93/bltins/print.c @@ -477,46 +477,65 @@ static char *genformat(char *format) static char *fmthtml(const char *string, int flags) { - register const char *cp = string; + register const char *cp = string, *op; register int c, offset = staktell(); + /* + * The only multibyte locale ksh currently supports is UTF-8, which is a superset of ASCII. So, if we're on an + * EBCDIC system, below we attempt to convert EBCDIC to ASCII only if we're not in a multibyte locale (mbwide()). + */ + mbinit(); if(!(flags&SFFMT_ALTER)) { - while(c= *(unsigned char*)cp++) + /* Encode for HTML, for inside and outside single- and double-quoted strings. 
*/ + while(op = cp, c = mbchar(cp)) { -#if SHOPT_MULTIBYTE - register int s; - if((s=mbsize(cp-1)) > 1) - { - cp += (s-1); - continue; - } -#endif /* SHOPT_MULTIBYTE */ - if(c=='<') + if(!mbwide()) + c = CCMAPC(c,CC_NATIVE,CC_ASCII); + if(mbwide() && c < 0) /* invalid multibyte char */ + stakputc('?'); + else if(c == 60) /* < */ stakputs("<"); - else if(c=='>') + else if(c == 62) /* > */ stakputs(">"); - else if(c=='&') + else if(c == 38) /* & */ stakputs("&"); - else if(c=='"') + else if(c == 34) /* " */ stakputs("""); - else if(c=='\'') - stakputs("'"); - else if(c==' ') + else if(c == 39) /* ' (&apos; is nonstandard) */ + stakputs("'"); + else if(c == 160 && mbwide()) /* non-breaking space */ stakputs(" "); - else if(!isprint(c) && c!='\n' && c!='\r') - sfprintf(stkstd,"&#%X;",CCMAPC(c,CC_NATIVE,CC_ASCII)); + else if(!sh_isprint(c) && c!='\n' && c!='\r') + sfprintf(stkstd, "&#%d;", c); else - stakputc(c); + stakwrite(op, cp-op); } } else { - while(c= *(unsigned char*)cp++) + /* Percent-encode for URI. 
Ref.: RFC 3986, section 2.3 */ + if(mbwide()) { - if(strchr("!*'();@&+$,#[]<>~.\"{}|\\-`^% ",c) || (!isprint(c) && c!='\n' && c!='\r')) - sfprintf(stkstd,"%%%02X",CCMAPC(c,CC_NATIVE,CC_ASCII)); - else - stakputc(c); + while(op = cp, c = mbchar(cp)) + { + if(c < 0) + stakputs("%3F"); + else if(c <= 255 && strchr(URI_RFC3986_UNRESERVED, c)) + stakwrite(op, cp-op); + else + while(c = *(unsigned char*)op++, op <= cp) + sfprintf(stkstd, "%%%02X", c); + } + } + else + { + while(c = *(unsigned char*)cp++) + { + if(strchr(URI_RFC3986_UNRESERVED, c)) + stakputc(c); + else + sfprintf(stkstd, "%%%02X", CCMAPC(c, CC_NATIVE, CC_ASCII)); + } } } stakputc(0); diff --git a/src/cmd/ksh93/data/builtins.c b/src/cmd/ksh93/data/builtins.c index 6e001a14a..e61272d04 100644 --- a/src/cmd/ksh93/data/builtins.c +++ b/src/cmd/ksh93/data/builtins.c @@ -1180,7 +1180,7 @@ USAGE_LICENSE ; const char sh_optprintf[] = -"[-1c?\n@(#)$Id: printf (AT&T Research) 2009-02-02 $\n]" +"[-1c?\n@(#)$Id: printf (AT&T Research/ksh93) 2020-08-10 $\n]" USAGE_LICENSE "[+NAME?printf - write formatted output]" "[+DESCRIPTION?\bprintf\b writes each \astring\a operand to " @@ -1211,7 +1211,7 @@ USAGE_LICENSE "[+%B?Treat the argument as a variable name and output the value " "without converting it to a string. This is most useful for " "variables of type \b-b\b.]" - "[+%H?Output \astring\a with characters \b<\b, \b&\b, \b>\b, " + "[+%H?Output \astring\a with characters \b<\b, \b&\b, \b>\b, \b'\b, " "\b\"\b, and non-printable characters properly escaped for " "use in HTML and XML documents. 
The alternate flag \b#\b " "formats the output for use as a URI.]" diff --git a/src/cmd/ksh93/include/defs.h b/src/cmd/ksh93/include/defs.h index 67fc40be6..557cb08e8 100644 --- a/src/cmd/ksh93/include/defs.h +++ b/src/cmd/ksh93/include/defs.h @@ -29,6 +29,11 @@ #define defs_h_defined #include +#if !SHOPT_MULTIBYTE +# undef mbwide +# define mbwide() (0) /* disable multibyte without need for further '#if SHOPT_MULTIBYTE' */ +#endif + #include #include #include "FEATURE/externs" @@ -441,6 +446,14 @@ extern int sh_whence(char**,int); extern Namval_t *sh_fsearch(Shell_t*,const char *,int); #endif /* SHOPT_NAMESPACE */ +#if SHOPT_MULTIBYTE + extern int sh_isprint(int); +#else +# define sh_isprint(c) isprint(c) +#endif /* SHOPT_MULTIBYTE */ + +#define URI_RFC3986_UNRESERVED "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~" + #ifndef ERROR_dictionary # define ERROR_dictionary(s) (s) #endif diff --git a/src/cmd/ksh93/include/version.h b/src/cmd/ksh93/include/version.h index 191237b43..1167568b9 100644 --- a/src/cmd/ksh93/include/version.h +++ b/src/cmd/ksh93/include/version.h @@ -17,4 +17,4 @@ * David Korn * * * ***********************************************************************/ -#define SH_RELEASE "93u+m 2020-08-09" +#define SH_RELEASE "93u+m 2020-08-10" diff --git a/src/cmd/ksh93/sh/string.c b/src/cmd/ksh93/sh/string.c index 6c3720649..8b43961dd 100644 --- a/src/cmd/ksh93/sh/string.c +++ b/src/cmd/ksh93/sh/string.c @@ -191,9 +191,7 @@ char *sh_substitute(const char *string,const char *oldsp,char *newsp) return((char*)0); if(*(cp=oldsp) == 0) goto found; -#if SHOPT_MULTIBYTE mbinit(); -#endif /* SHOPT_MULTIBYTE */ do { /* skip to first character which matches start of oldsp */ @@ -331,19 +329,21 @@ static char *sh_fmtcsv(const char *string) #if SHOPT_MULTIBYTE /* - * Returns true if c is an invisible Unicode character, excluding ASCII space. + * Note: without SHOPT_MULTIBYTE, defs.h makes this an alias of isprint(3). 
+ * + * Returns false if c is an invisible Unicode character, excluding ASCII space. * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is * generally not possible as the OS-provided iswgraph(3) doesn't support that * locale. So do a quick test and do our best with a fallback if necessary. */ -static int is_invisible(int c) +int sh_isprint(int c) { if(!mbwide()) /* not in multibyte locale? */ - return(c != ' ' && !isgraph(c)); /* use plain isgraph(3) */ + return(isprint(c)); /* use plain isprint(3) */ else if(iswgraph(0x5E38) && !iswgraph(0xFEFF)) /* can we use iswgraph(3)? */ - return(c != ' ' && !iswgraph(c)); /* use iswgraph(3) */ + return(c == ' ' || iswgraph(c)); /* use iswgraph(3) */ else /* fallback: */ - return( c <= 0x001F || /* control characters */ + return(!(c <= 0x001F || /* control characters */ c >= 0x007F && c <= 0x009F || /* control characters */ c == 0x00A0 || /* non-breaking space */ c == 0x061C || /* arabic letter mark */ @@ -353,7 +353,7 @@ static int is_invisible(int c) c >= 0x2028 && c <= 0x202F || /* separators and format characters */ c >= 0x205F && c <= 0x206F || /* various format characters */ c == 0x3000 || /* ideographic space */ - c == 0xFEFF ); /* zero-width non-breaking space */ + c == 0xFEFF)); /* zero-width non-breaking space */ } #endif /* SHOPT_MULTIBYTE */ @@ -368,9 +368,7 @@ char *sh_fmtq(const char *string) int offset; if(!cp) return((char*)0); -#if SHOPT_MULTIBYTE mbinit(); -#endif /* SHOPT_MULTIBYTE */ offset = staktell(); state = ((c= mbchar(cp))==0); if(isaletter(c)) @@ -394,11 +392,7 @@ char *sh_fmtq(const char *string) state = 1; for(;c;c= mbchar(cp)) { -#if SHOPT_MULTIBYTE - if(c=='\'' || is_invisible(c)) -#else - if(c=='\'' || !isprint(c)) -#endif /* SHOPT_MULTIBYTE */ + if(c=='\'' || !sh_isprint(c)) state = 2; else if(c==']' || c=='=' || (c!=':' && c<=0x7f && (c=sh_lexstates[ST_NORM][c]) && c!=S_EPAT)) state |=1; @@ -416,11 +410,7 @@ char *sh_fmtq(const char *string) { stakwrite("$'",2); cp = 
string; -#if SHOPT_MULTIBYTE while(op = cp, c= mbchar(cp)) -#else - while(op = cp, c= *(unsigned char*)cp++) -#endif { state=1; switch(c) @@ -449,7 +439,6 @@ char *sh_fmtq(const char *string) case '\\': case '\'': break; default: -#if SHOPT_MULTIBYTE if(mbwide()) { /* We're in a multibyte locale */ @@ -460,16 +449,14 @@ char *sh_fmtq(const char *string) cp = op+1; goto quote_one_byte; } - if(is_invisible(c)) + if(!sh_isprint(c)) { /* Unicode hex code */ sfprintf(staksp,"\\u[%x]",c); continue; } } - else -#endif /* SHOPT_MULTIBYTE */ - if(!isprint(c)) + else if(!isprint(c)) { quote_one_byte: sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c); diff --git a/src/cmd/ksh93/tests/builtins.sh b/src/cmd/ksh93/tests/builtins.sh index bb9cc3fa3..1b6bfa0ba 100755 --- a/src/cmd/ksh93/tests/builtins.sh +++ b/src/cmd/ksh93/tests/builtins.sh @@ -275,15 +275,70 @@ OPTIND=1 if [[ $(getopts $'[+?X\ffoobar\fX]' v --man 2>&1) != *'Xhello world'X* ]] then err_exit '\f...\f not working in getopts usage strings' fi -if [[ $(printf '%H\n' $'<>"& \'\tabc') != '<>"& ' abc' ]] -then err_exit 'printf %H not working' -fi -if [[ $(printf '%(html)q\n' $'<>"& \'\tabc') != '<>"& ' abc' ]] -then err_exit 'printf %(html)q not working' -fi -if [[ $( printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc') != 'foo://ab_c%3C%3E%22%26%20%27%09abc' ]] -then err_exit 'printf %(url)q not working' -fi + +expect='<>"& ' abc' +actual=$(printf '%H\n' $'<>"& \'\tabc') +[[ $expect == "$actual" ]] || err_exit 'printf %H not working' \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" +actual=$(printf '%(html)q\n' $'<>"& \'\tabc') +[[ $expect == "$actual" ]] || err_exit 'printf %(html)q not working' \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" + +expect='foo://ab_c%3C%3E%22%26%20%27%09abc' +actual=$(printf 'foo://ab_c%#H\n' $'<>"& \'\tabc') +[[ $expect == "$actual" ]] || err_exit 'printf %#H not working' \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" 
+actual=$(printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc') +[[ $expect == "$actual" ]] || err_exit 'printf %(url)q not working' \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" + +case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in +( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* ) + # HTML encoding UTF-8 characters + expect='what?' + actual=$(printf %H 'what?') + [[ $actual == "$expect" ]] || err_exit 'printf %H: ASCII characters' \ + "(expected $expect; got $actual)" + expect='عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.' + actual=$(printf %H 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.') + [[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \ + "(expected $expect; got $actual)" + expect='正常終了 正常終了' + actual=$(printf %H '正常終了 正常終了') + [[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \ + "(expected $expect; got $actual)" + expect='« l’abîme de mon métier… »' + actual=$(printf %H '« l’abîme de mon métier… »') + [[ $actual == "$expect" ]] || err_exit 'printf %H: Latin UTF-8 characters' \ + "(expected $expect; got $actual)" + expect='?†???' + actual=$(printf %H $'\x86\u86\xF0\x96\x76\xA7\xB5') + [[ $actual == "$expect" ]] || err_exit 'printf %H: invalid UTF-8 characters' \ + "(expected $expect; got $actual)" + # URL/URI encoding of UTF-8 characters + expect='wh.at%3F' + actual=$(printf %#H 'wh.at?') + [[ $actual == "$expect" ]] || err_exit 'printf %H: ASCII characters' \ + "(expected $expect; got $actual)" + expect='%D8%B9%D9%86%D8%AF%D9%85%D8%A7%20%D9%8A%D8%B1%D9%8A%D8%AF%20%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85%20%D8%A3%D9%86%20%E2%80%AA%D9%8A%D8%AA%D9%83%D9%84%D9%91%D9%85%20%E2%80%AC%20%D8%8C%20%D9%81%D9%87%D9%88%20%D9%8A%D8%AA%D8%AD%D8%AF%D9%91%D8%AB%20%D8%A8%D9%84%D8%BA%D8%A9%20%D9%8A%D9%88%D9%86%D9%8A%D9%83%D9%88%D8%AF.' 
+ actual=$(printf %#H 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.') + [[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \ + "(expected $expect; got $actual)" + expect='%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86%20%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86' + actual=$(printf %#H '正常終了 正常終了') + [[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \ + "(expected $expect; got $actual)" + expect='%C2%AB%20l%E2%80%99ab%C3%AEme%20de%20mon%C2%A0m%C3%A9tier%E2%80%A6%20%C2%BB' + actual=$(printf %#H '« l’abîme de mon métier… »') + [[ $actual == "$expect" ]] || err_exit 'printf %H: Latin UTF-8 characters' \ + "(expected $expect; got $actual)" + expect='%3F%C2%86%3F%3F%3F' + actual=$(printf %#H $'\x86\u86\xF0\x96\x76\xA7\xB5') + [[ $actual == "$expect" ]] || err_exit 'printf %H: invalid UTF-8 characters' \ + "(expected $expect; got $actual)" + ;; +esac + if [[ $(printf '%R %R %R %R\n' 'a.b' '*.c' '^' '!(*.*)') != '^a\.b$ \.c$ ^\^$ ^(.*\..*)!$' ]] then err_exit 'printf %T not working' fi