printf: Fix HTML and URI encoding (%H, %#H)

This applies a number of fixes to the printf formatting directives %H and %#H (as well as their equivalents %(html)q and %(url)q): 1. Both formatters have been made multibyte/UTF-8 aware, and no longer delete multibyte characters. Invalid UTF-8 byte sequences are rendered as ASCII question marks. 2. %H no longer wrongly encodes spaces as non-breaking spaces (&nbsp;) and instead correctly encodes the UTF-8 non-breaking space as such. 3. %H now converts the single quote (') to '&#39;' instead of '&apos;' which is not a valid entity in all HTML versions. 4. %#H failed to encode some reserved characters (e.g. '?') while encoding some unreserved ones (e.g. '~'). It now percent-encodes all characters except those 'unreserved' as per RFC3986 (ASCII alphanumeric plus -._~). Prior discussion: https://groups.google.com/d/msgid/korn-shell/ce8d1467-4a6d-883b-45ad-fc3c7b90e681%40inlv.org src/cmd/ksh93/include/defs.h: src/cmd/ksh93/sh/string.c: - defs.h: If compiling without SHOPT_MULTIBYTE, redefine the mbwide() macro (which tests if we're in a multibyte locale) as 0. This lets the compiler optimiser do the work that would otherwise require a lot of tedious '#if SHOPT_MULTIBYTE' directives. - string.c: Remove some now-unneeded '#if SHOPT_MULTIBYTE' stuff. - defs.h, string.c: Rename is_invisible() to sh_isprint(), invert the boolean return value, and make it an extern for use in fmthtml() -- see below. If compiling without SHOPT_MULTIBYTE, simply #define sh_isprint() as equivalent to isprint(3). - defs.h: Add URI_RFC3986_UNRESERVED macro for fmthtml() containing the characters "unreserved" for purposes of URI percent-encoding. src/cmd/ksh93/bltins/print.c: fmthtml(): - Remove kludge that skipped all multibyte characters (!). - Complete rewrite to implement fixes described above. - Don't bother with '#if SHOPT_MULTIBYTE' directives (see above). src/cmd/ksh93/data/builtins.c: - sh_optprintf[]: %H: Add single quote to encoded chars doc. - Edit credits and bump version date. 
src/cmd/ksh93/tests/builtins.sh: - Update and tweak old regression tests. - Add a number of new tests for UTF-8 HTML and URI encoding, which are only run when running tests in a UTF-8 locale (shtests -u).
2025-03-09 15:50:02 +00:00 · 2020-08-10 22:15:53 +01:00 · 2020-08-10 22:15:53 +01:00 · 8477d2ce22
commit 8477d2ce22
parent aff63e382d
7 changed files with 149 additions and 61 deletions
--- a/14
+++ b/14
@ -3,6 +3,20 @@ For full details, see the git log at: https://github.com/ksh93/ksh

 Any uppercase BUG_* names are modernish shell bug IDs.

+2020-08-10:
+
+- A number of fixes have been applied to the printf formatting directives
+  %H and %#H (as well as the undocumented equivalents %(html)q and %(url)q):
+  1. Both formatters have been made multibyte/UTF-8 aware, and no longer
+     delete multibyte characters. Invalid UTF-8 byte sequences are rendered
+     as ASCII question marks.
+  2. %H no longer wrongly changes spaces to non-breaking spaces (&nbsp;).
+  3. %H now converts the single quote (') to '&#39;' instead of '&apos;'
+     which is not a valid entity in all HTML versions.
+  4. %#H failed to encode some reserved characters (e.g. '?') while encoding
+     some unreserved ones (e.g. '~'). It now percent-encodes all characters
+     except those 'unreserved' as per RFC3986 (ASCII alphanumeric plus -._~).
+
 2020-08-09:

 - File name generation (a.k.a. pathname expansion, a.k.a. globbing) now
--- a/src/cmd/ksh93/bltins/print.c
+++ b/src/cmd/ksh93/bltins/print.c
@ -477,46 +477,65 @@ static char *genformat(char *format)

 static char *fmthtml(const char *string, int flags)
 {
-	register const char *cp = string;
+	register const char *cp = string, *op;
 	register int c, offset = staktell();
+	/*
+	 * The only multibyte locale ksh currently supports is UTF-8, which is a superset of ASCII. So, if we're on an
+	 * EBCDIC system, below we attempt to convert EBCDIC to ASCII only if we're not in a multibyte locale (mbwide()).
+	 */
+	mbinit();
 	if(!(flags&SFFMT_ALTER))
 	{
-		while(c= *(unsigned char*)cp++)
+		/* Encode for HTML, for inside and outside single- and double-quoted strings. */
+		while(op = cp, c = mbchar(cp))
 		{
-#if SHOPT_MULTIBYTE
-			register int s;
-			if((s=mbsize(cp-1)) > 1)
-			{
-				cp += (s-1);
-				continue;
-			}
-#endif /* SHOPT_MULTIBYTE */
-			if(c=='<')
+			if(!mbwide())
+				c = CCMAPC(c,CC_NATIVE,CC_ASCII);
+			if(mbwide() && c < 0)		/* invalid multibyte char */
+				stakputc('?');
+			else if(c == 60)		/* < */
 				stakputs("&lt;");
-			else if(c=='>')
+			else if(c == 62)		/* > */
 				stakputs("&gt;");
-			else if(c=='&')
+			else if(c == 38)		/* & */
 				stakputs("&amp;");
-			else if(c=='"')
+			else if(c == 34)		/* " */
 				stakputs("&quot;");
-			else if(c=='\'')
-				stakputs("&apos;");
-			else if(c==' ')
+			else if(c == 39)		/* ' (&apos; is nonstandard) */
+				stakputs("&#39;");
+			else if(c == 160 && mbwide())	/* non-breaking space */
 				stakputs("&nbsp;");
-			else if(!isprint(c) && c!='\n' && c!='\r')
-				sfprintf(stkstd,"&#%X;",CCMAPC(c,CC_NATIVE,CC_ASCII));
+			else if(!sh_isprint(c) && c!='\n' && c!='\r')
+				sfprintf(stkstd, "&#%d;", c);
 			else
-				stakputc(c);
+				stakwrite(op, cp-op);
 		}
 	}
 	else
 	{
-		while(c= *(unsigned char*)cp++)
+		/* Percent-encode for URI. Ref.: RFC 3986, section 2.3 */
+		if(mbwide())
 		{
-			if(strchr("!*'();@&+$,#[]<>~.\"{}|\\-`^% ",c) || (!isprint(c) && c!='\n' && c!='\r'))
-				sfprintf(stkstd,"%%%02X",CCMAPC(c,CC_NATIVE,CC_ASCII));
-			else
-				stakputc(c);
+			while(op = cp, c = mbchar(cp))
+			{
+				if(c < 0)
+					stakputs("%3F");
+				else if(c <= 255 && strchr(URI_RFC3986_UNRESERVED, c))
+					stakwrite(op, cp-op);
+				else
+					while(c = *(unsigned char*)op++, op <= cp)
+						sfprintf(stkstd, "%%%02X", c);
+			}
+		}
+		else
+		{
+			while(c = *(unsigned char*)cp++)
+			{
+				if(strchr(URI_RFC3986_UNRESERVED, c))
+					stakputc(c);
+				else
+					sfprintf(stkstd, "%%%02X", CCMAPC(c, CC_NATIVE, CC_ASCII));
+			}
 		}
 	}
 	stakputc(0);
--- a/src/cmd/ksh93/data/builtins.c
+++ b/src/cmd/ksh93/data/builtins.c
@ -1180,7 +1180,7 @@ USAGE_LICENSE
 ;

 const char sh_optprintf[] =
-"[-1c?\n@(#)$Id: printf (AT&T Research) 2009-02-02 $\n]"
+"[-1c?\n@(#)$Id: printf (AT&T Research/ksh93) 2020-08-10 $\n]"
 USAGE_LICENSE
 "[+NAME?printf - write formatted output]"
 "[+DESCRIPTION?\bprintf\b writes each \astring\a operand to "
@ -1211,7 +1211,7 @@ USAGE_LICENSE
 	"[+%B?Treat the argument as a variable name and output the value "
 		"without converting it to a string.  This is most useful for "
 		"variables of type \b-b\b.]"
-	"[+%H?Output \astring\a with characters \b<\b, \b&\b, \b>\b, "
+	"[+%H?Output \astring\a with characters \b<\b, \b&\b, \b>\b, \b'\b, "
 		"\b\"\b, and non-printable characters properly escaped for "
 		"use in HTML and XML documents.  The alternate flag \b#\b "
 		"formats the output for use as a URI.]"
--- a/src/cmd/ksh93/include/defs.h
+++ b/src/cmd/ksh93/include/defs.h
@ -29,6 +29,11 @@
 #define defs_h_defined

 #include	<ast.h>
+#if !SHOPT_MULTIBYTE
+#   undef mbwide
+#   define mbwide() (0)	/* disable multibyte without need for further '#if SHOPT_MULTIBYTE' */
+#endif
+
 #include	<sfio.h>
 #include	<error.h>
 #include	"FEATURE/externs"
@ -441,6 +446,14 @@ extern int 		sh_whence(char**,int);
    extern Namval_t	*sh_fsearch(Shell_t*,const char *,int);
 #endif /* SHOPT_NAMESPACE */

+#if SHOPT_MULTIBYTE
+    extern int		sh_isprint(int);
+#else
+#   define sh_isprint(c) isprint(c)
+#endif /* SHOPT_MULTIBYTE */
+
+#define URI_RFC3986_UNRESERVED "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
+
 #ifndef ERROR_dictionary
 #   define ERROR_dictionary(s)	(s)
 #endif
--- a/src/cmd/ksh93/include/version.h
+++ b/src/cmd/ksh93/include/version.h
@ -17,4 +17,4 @@
 *                  David Korn <dgk@research.att.com>                   *
 *                                                                      *
 ***********************************************************************/
-#define SH_RELEASE	"93u+m 2020-08-09"
+#define SH_RELEASE	"93u+m 2020-08-10"
--- a/src/cmd/ksh93/sh/string.c
+++ b/src/cmd/ksh93/sh/string.c
@ -191,9 +191,7 @@ char *sh_substitute(const char *string,const char *oldsp,char *newsp)
 		return((char*)0);
 	if(*(cp=oldsp) == 0)
 		goto found;
-#if SHOPT_MULTIBYTE
 	mbinit();
-#endif /* SHOPT_MULTIBYTE */
 	do
 	{
 	/* skip to first character which matches start of oldsp */
@ -331,19 +329,21 @@ static char	*sh_fmtcsv(const char *string)

 #if SHOPT_MULTIBYTE
 /*
- * Returns true if c is an invisible Unicode character, excluding ASCII space.
+ * Note: without SHOPT_MULTIBYTE, defs.h makes this an alias of isprint(3).
+ *
+ * Returns false if c is an invisible Unicode character, excluding ASCII space.
 * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is
 * generally not possible as the OS-provided iswgraph(3) doesn't support that
 * locale. So do a quick test and do our best with a fallback if necessary.
 */
-static int	is_invisible(int c)
+int	sh_isprint(int c)
 {
 	if(!mbwide())					/* not in multibyte locale? */
-		return(c != ' ' && !isgraph(c));	/* use plain isgraph(3) */
+		return(isprint(c));			/* use plain isprint(3) */
 	else if(iswgraph(0x5E38) && !iswgraph(0xFEFF))	/* can we use iswgraph(3)? */
-		return(c != ' ' && !iswgraph(c));	/* use iswgraph(3) */
+		return(c == ' ' || iswgraph(c));	/* use iswgraph(3) */
 	else						/* fallback: */
-		return(	c <= 0x001F ||			/* control characters */
+		return(!(c <= 0x001F ||			/* control characters */
 			c >= 0x007F && c <= 0x009F ||	/* control characters */
 			c == 0x00A0 ||			/* non-breaking space */
 			c == 0x061C ||			/* arabic letter mark */
@ -353,7 +353,7 @@ static int	is_invisible(int c)
 			c >= 0x2028 && c <= 0x202F ||	/* separators and format characters */
 			c >= 0x205F && c <= 0x206F ||	/* various format characters */
 			c == 0x3000 ||			/* ideographic space */
-			c == 0xFEFF );			/* zero-width non-breaking space */
+			c == 0xFEFF));			/* zero-width non-breaking space */
 }
 #endif /* SHOPT_MULTIBYTE */

@ -368,9 +368,7 @@ char	*sh_fmtq(const char *string)
 	int offset;
 	if(!cp)
 		return((char*)0);
-#if SHOPT_MULTIBYTE
 	mbinit();
-#endif /* SHOPT_MULTIBYTE */
 	offset = staktell();
 	state = ((c= mbchar(cp))==0);
 	if(isaletter(c))
@ -394,11 +392,7 @@ char	*sh_fmtq(const char *string)
 		state = 1;
 	for(;c;c= mbchar(cp))
 	{
-#if SHOPT_MULTIBYTE
-		if(c=='\'' || is_invisible(c))
-#else
-		if(c=='\'' || !isprint(c))
-#endif /* SHOPT_MULTIBYTE */
+		if(c=='\'' || !sh_isprint(c))
 			state = 2;
 		else if(c==']' || c=='=' || (c!=':' && c<=0x7f && (c=sh_lexstates[ST_NORM][c]) && c!=S_EPAT))
 			state |=1;
@ -416,11 +410,7 @@ char	*sh_fmtq(const char *string)
 	{
 		stakwrite("$'",2);
 		cp = string;
-#if SHOPT_MULTIBYTE
 		while(op = cp, c= mbchar(cp))
-#else
-		while(op = cp, c= *(unsigned char*)cp++)
-#endif
 		{
 			state=1;
 			switch(c)
@ -449,7 +439,6 @@ char	*sh_fmtq(const char *string)
 			    case '\\':	case '\'':
 				break;
 			    default:
-#if SHOPT_MULTIBYTE
 				if(mbwide())
 				{
 					/* We're in a multibyte locale */
@ -460,16 +449,14 @@ char	*sh_fmtq(const char *string)
 						cp = op+1;
 						goto quote_one_byte;
 					}
-					if(is_invisible(c))
+					if(!sh_isprint(c))
 					{
 						/* Unicode hex code */
 						sfprintf(staksp,"\\u[%x]",c);
 						continue;
 					}
 				}
-				else
-#endif /* SHOPT_MULTIBYTE */
-				if(!isprint(c))
+				else if(!isprint(c))
 				{
 				quote_one_byte:
 					sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c);
--- a/src/cmd/ksh93/tests/builtins.sh
+++ b/src/cmd/ksh93/tests/builtins.sh
@ -275,15 +275,70 @@ OPTIND=1
 if	[[ $(getopts  $'[+?X\ffoobar\fX]' v --man 2>&1) != *'Xhello world'X* ]]
 then	err_exit '\f...\f not working in getopts usage strings'
 fi
-if	[[ $(printf '%H\n' $'<>"& \'\tabc') != '&lt;&gt;&quot;&amp;&nbsp;&apos;&#9;abc' ]]
-then	err_exit 'printf %H not working'
-fi
-if	[[ $(printf '%(html)q\n' $'<>"& \'\tabc') != '&lt;&gt;&quot;&amp;&nbsp;&apos;&#9;abc' ]]
-then	err_exit 'printf %(html)q not working'
-fi
-if	[[ $( printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc') != 'foo://ab_c%3C%3E%22%26%20%27%09abc' ]]
-then	err_exit 'printf %(url)q not working'
-fi
+
+expect='&lt;&gt;&quot;&amp; &#39;&#9;abc'
+actual=$(printf '%H\n' $'<>"& \'\tabc')
+[[ $expect == "$actual" ]] || err_exit 'printf %H not working' \
+	"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
+actual=$(printf '%(html)q\n' $'<>"& \'\tabc')
+[[ $expect == "$actual" ]] || err_exit 'printf %(html)q not working' \
+	"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
+
+expect='foo://ab_c%3C%3E%22%26%20%27%09abc'
+actual=$(printf 'foo://ab_c%#H\n' $'<>"& \'\tabc')
+[[ $expect == "$actual" ]] || err_exit 'printf %#H not working' \
+	"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
+actual=$(printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc')
+[[ $expect == "$actual" ]] || err_exit 'printf %(url)q not working' \
+	"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
+
+case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
+( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
+	# HTML encoding UTF-8 characters
+	expect='what?'
+	actual=$(printf %H 'what?')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: ASCII characters' \
+				"(expected $expect; got $actual)"
+	expect='عندما يريد العالم أن &#8234;يتكلّم &#8236; ، فهو يتحدّث بلغة يونيكود.'
+	actual=$(printf %H 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect='正常終了 正常終了'
+	actual=$(printf %H '正常終了 正常終了')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect='« l’abîme de mon&nbsp;métier… »'
+	actual=$(printf %H '« l’abîme de mon métier… »')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: Latin UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect='?&#134;???'
+	actual=$(printf %H $'\x86\u86\xF0\x96\x76\xA7\xB5')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: invalid UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	# URL/URI encoding of UTF-8 characters
+	expect='wh.at%3F'
+	actual=$(printf %#H 'wh.at?')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: ASCII characters' \
+				"(expected $expect; got $actual)"
+	expect='%D8%B9%D9%86%D8%AF%D9%85%D8%A7%20%D9%8A%D8%B1%D9%8A%D8%AF%20%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85%20%D8%A3%D9%86%20%E2%80%AA%D9%8A%D8%AA%D9%83%D9%84%D9%91%D9%85%20%E2%80%AC%20%D8%8C%20%D9%81%D9%87%D9%88%20%D9%8A%D8%AA%D8%AD%D8%AF%D9%91%D8%AB%20%D8%A8%D9%84%D8%BA%D8%A9%20%D9%8A%D9%88%D9%86%D9%8A%D9%83%D9%88%D8%AF.'
+	actual=$(printf %#H 'عندما يريد العالم أن ‪يتكلّم ‬ ، فهو يتحدّث بلغة يونيكود.')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect='%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86%20%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86'
+	actual=$(printf %#H '正常終了 正常終了')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect='%C2%AB%20l%E2%80%99ab%C3%AEme%20de%20mon%C2%A0m%C3%A9tier%E2%80%A6%20%C2%BB'
+	actual=$(printf %#H '« l’abîme de mon métier… »')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: Latin UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	expect='%3F%C2%86%3F%3F%3F'
+	actual=$(printf %#H $'\x86\u86\xF0\x96\x76\xA7\xB5')
+	[[ $actual == "$expect" ]] || err_exit 'printf %H: invalid UTF-8 characters' \
+				"(expected $expect; got $actual)"
+	;;
+esac
+
 if	[[ $(printf '%R %R %R %R\n' 'a.b' '*.c' '^'  '!(*.*)') != '^a\.b$ \.c$ ^\^$ ^(.*\..*)!$' ]]
 then	err_exit 'printf %T not working'
 fi