printf: Fix HTML and URI encoding (%H, %#H)

This applies a number of fixes to the printf formatting directives %H and %#H (as well as their equivalents %(html)q and %(url)q):

1. Both formatters have been made multibyte/UTF-8 aware, and no longer delete multibyte characters. Invalid UTF-8 byte sequences are rendered as ASCII question marks.
2. %H no longer wrongly encodes spaces as non-breaking spaces (&nbsp;) and instead correctly encodes the UTF-8 non-breaking space as such.
3. %H now converts the single quote (') to '&#39;' instead of '&apos;', which is not a valid entity in all HTML versions.
4. %#H failed to encode some reserved characters (e.g. '?') while encoding some unreserved ones (e.g. '~'). It now percent-encodes all characters except those 'unreserved' as per RFC 3986 (ASCII alphanumeric plus -._~).

Prior discussion: ce8d1467-4a6d-883b-45ad-fc3c7b90e681%40inlv.org

src/cmd/ksh93/include/defs.h:
src/cmd/ksh93/sh/string.c:
- defs.h: If compiling without SHOPT_MULTIBYTE, redefine the mbwide() macro (which tests if we're in a multibyte locale) as 0. This lets the compiler optimiser do the work that would otherwise require a lot of tedious '#if SHOPT_MULTIBYTE' directives.
- string.c: Remove some now-unneeded '#if SHOPT_MULTIBYTE' stuff.
- defs.h, string.c: Rename is_invisible() to sh_isprint(), invert the boolean return value, and make it an extern for use in fmthtml() -- see below. If compiling without SHOPT_MULTIBYTE, simply #define sh_isprint() as equivalent to isprint(3).
- defs.h: Add URI_RFC3986_UNRESERVED macro for fmthtml() containing the characters "unreserved" for purposes of URI percent-encoding.

src/cmd/ksh93/bltins/print.c: fmthtml():
- Remove kludge that skipped all multibyte characters (!).
- Complete rewrite to implement fixes described above.
- Don't bother with '#if SHOPT_MULTIBYTE' directives (see above).

src/cmd/ksh93/data/builtins.c:
- sh_optprintf[]: %H: Add single quote to encoded chars doc.
- Edit credits and bump version date.

src/cmd/ksh93/tests/builtins.sh:
- Update and tweak old regression tests.
- Add a number of new tests for UTF-8 HTML and URI encoding, which are only run when running tests in a UTF-8 locale (shtests -u).
2025-03-09 15:50:02 +00:00 · 2020-08-10 22:15:53 +01:00 · 2020-08-10 22:15:53 +01:00 · 8477d2ce22
commit 8477d2ce22
parent aff63e382d
7 changed files with 149 additions and 61 deletions
--- a/src/cmd/ksh93/sh/string.c
+++ b/src/cmd/ksh93/sh/string.c
@ -191,9 +191,7 @@ char *sh_substitute(const char *string,const char *oldsp,char *newsp)
 		return((char*)0);
 	if(*(cp=oldsp) == 0)
 		goto found;
-#if SHOPT_MULTIBYTE
 	mbinit();
-#endif /* SHOPT_MULTIBYTE */
 	do
 	{
 	/* skip to first character which matches start of oldsp */
@ -331,19 +329,21 @@ static char	*sh_fmtcsv(const char *string)

 #if SHOPT_MULTIBYTE
 /*
- * Returns true if c is an invisible Unicode character, excluding ASCII space.
+ * Note: without SHOPT_MULTIBYTE, defs.h makes this an alias of isprint(3).
+ *
+ * Returns false if c is an invisible Unicode character, excluding ASCII space.
 * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is
 * generally not possible as the OS-provided iswgraph(3) doesn't support that
 * locale. So do a quick test and do our best with a fallback if necessary.
 */
-static int	is_invisible(int c)
+int	sh_isprint(int c)
 {
 	if(!mbwide())					/* not in multibyte locale? */
-		return(c != ' ' && !isgraph(c));	/* use plain isgraph(3) */
+		return(isprint(c));			/* use plain isprint(3) */
 	else if(iswgraph(0x5E38) && !iswgraph(0xFEFF))	/* can we use iswgraph(3)? */
-		return(c != ' ' && !iswgraph(c));	/* use iswgraph(3) */
+		return(c == ' ' || iswgraph(c));	/* use iswgraph(3) */
 	else						/* fallback: */
-		return(	c <= 0x001F ||			/* control characters */
+		return(!(c <= 0x001F ||			/* control characters */
 			c >= 0x007F && c <= 0x009F ||	/* control characters */
 			c == 0x00A0 ||			/* non-breaking space */
 			c == 0x061C ||			/* arabic letter mark */
@ -353,7 +353,7 @@ static int	is_invisible(int c)
 			c >= 0x2028 && c <= 0x202F ||	/* separators and format characters */
 			c >= 0x205F && c <= 0x206F ||	/* various format characters */
 			c == 0x3000 ||			/* ideographic space */
-			c == 0xFEFF );			/* zero-width non-breaking space */
+			c == 0xFEFF));			/* zero-width non-breaking space */
 }
 #endif /* SHOPT_MULTIBYTE */

@ -368,9 +368,7 @@ char	*sh_fmtq(const char *string)
 	int offset;
 	if(!cp)
 		return((char*)0);
-#if SHOPT_MULTIBYTE
 	mbinit();
-#endif /* SHOPT_MULTIBYTE */
 	offset = staktell();
 	state = ((c= mbchar(cp))==0);
 	if(isaletter(c))
@ -394,11 +392,7 @@ char	*sh_fmtq(const char *string)
 		state = 1;
 	for(;c;c= mbchar(cp))
 	{
-#if SHOPT_MULTIBYTE
-		if(c=='\'' || is_invisible(c))
-#else
-		if(c=='\'' || !isprint(c))
-#endif /* SHOPT_MULTIBYTE */
+		if(c=='\'' || !sh_isprint(c))
 			state = 2;
 		else if(c==']' || c=='=' || (c!=':' && c<=0x7f && (c=sh_lexstates[ST_NORM][c]) && c!=S_EPAT))
 			state |=1;
@ -416,11 +410,7 @@ char	*sh_fmtq(const char *string)
 	{
 		stakwrite("$'",2);
 		cp = string;
-#if SHOPT_MULTIBYTE
 		while(op = cp, c= mbchar(cp))
-#else
-		while(op = cp, c= *(unsigned char*)cp++)
-#endif
 		{
 			state=1;
 			switch(c)
@ -449,7 +439,6 @@ char	*sh_fmtq(const char *string)
 			    case '\\':	case '\'':
 				break;
 			    default:
-#if SHOPT_MULTIBYTE
 				if(mbwide())
 				{
 					/* We're in a multibyte locale */
@ -460,16 +449,14 @@ char	*sh_fmtq(const char *string)
 						cp = op+1;
 						goto quote_one_byte;
 					}
-					if(is_invisible(c))
+					if(!sh_isprint(c))
 					{
 						/* Unicode hex code */
 						sfprintf(staksp,"\\u[%x]",c);
 						continue;
 					}
 				}
-				else
-#endif /* SHOPT_MULTIBYTE */
-				if(!isprint(c))
+				else if(!isprint(c))
 				{
 				quote_one_byte:
 					sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c);