1
0
Fork 0
mirror of git://git.code.sf.net/p/cdesktopenv/code synced 2025-03-09 15:50:02 +00:00

printf: Fix HTML and URI encoding (%H, %#H)

This applies a number of fixes to the printf formatting directives
%H and %#H (as well as their equivalents %(html)q and %(url)q):
1. Both formatters have been made multibyte/UTF-8 aware, and no
   longer delete multibyte characters. Invalid UTF-8 byte sequences
   are rendered as ASCII question marks.
2. %H no longer wrongly encodes spaces as non-breaking spaces
   (&nbsp;) and instead correctly encodes the UTF-8 non-breaking
   space as such.
3. %H now converts the single quote (') to '&#39;' instead of
   '&apos;' which is not a valid entity in all HTML versions.
4. %#H failed to encode some reserved characters (e.g. '?') while
   encoding some unreserved ones (e.g. '~'). It now percent-encodes
   all characters except those 'unreserved' as per RFC3986 (ASCII
   alphanumeric plus -._~).

Prior discussion:
ce8d1467-4a6d-883b-45ad-fc3c7b90e681%40inlv.org

src/cmd/ksh93/include/defs.h:
src/cmd/ksh93/sh/string.c:
- defs.h: If compiling without SHOPT_MULTIBYTE, redefine the
  mbwide() macro (which tests if we're in a multibyte locale) as 0.
  This lets the compiler optimiser do the work that would otherwise
  require a lot of tedious '#if SHOPT_MULTIBYTE' directives.
- string.c: Remove some now-unneeded '#if SHOPT_MULTIBYTE' stuff.
- defs.h, string.c: Rename is_invisible() to sh_isprint(), invert
  the boolean return value, and make it an extern for use in
  fmthtml() -- see below. If compiling without SHOPT_MULTIBYTE,
  simply #define sh_isprint() as equivalent to isprint(3).
- defs.h: Add URI_RFC3986_UNRESERVED macro for fmthtml() containing
  the characters "unreserved" for purposes of URI percent-encoding.

src/cmd/ksh93/bltins/print.c: fmthtml():
- Remove kludge that skipped all multibyte characters (!).
- Complete rewrite to implement fixes described above.
- Don't bother with '#if SHOPT_MULTIBYTE' directives (see above).

src/cmd/ksh93/data/builtins.c:
- sh_optprintf[]: %H: Add single quote to encoded chars doc.
- Edit credits and bump version date.

src/cmd/ksh93/tests/builtins.sh:
- Update and tweak old regression tests.
- Add a number of new tests for UTF-8 HTML and URI encoding, which
  are only run when running tests in a UTF-8 locale (shtests -u).
This commit is contained in:
Martijn Dekker 2020-08-10 22:15:53 +01:00
parent aff63e382d
commit 8477d2ce22
7 changed files with 149 additions and 61 deletions

View file

@ -191,9 +191,7 @@ char *sh_substitute(const char *string,const char *oldsp,char *newsp)
return((char*)0);
if(*(cp=oldsp) == 0)
goto found;
#if SHOPT_MULTIBYTE
mbinit();
#endif /* SHOPT_MULTIBYTE */
do
{
/* skip to first character which matches start of oldsp */
@ -331,19 +329,21 @@ static char *sh_fmtcsv(const char *string)
#if SHOPT_MULTIBYTE
/*
* Returns true if c is an invisible Unicode character, excluding ASCII space.
* Note: without SHOPT_MULTIBYTE, defs.h makes this an alias of isprint(3).
*
* Returns false if c is an invisible Unicode character, excluding ASCII space.
* Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is
* generally not possible as the OS-provided iswgraph(3) doesn't support that
* locale. So do a quick test and do our best with a fallback if necessary.
*/
static int is_invisible(int c)
int sh_isprint(int c)
{
if(!mbwide()) /* not in multibyte locale? */
return(c != ' ' && !isgraph(c)); /* use plain isgraph(3) */
return(isprint(c)); /* use plain isprint(3) */
else if(iswgraph(0x5E38) && !iswgraph(0xFEFF)) /* can we use iswgraph(3)? */
return(c != ' ' && !iswgraph(c)); /* use iswgraph(3) */
return(c == ' ' || iswgraph(c)); /* use iswgraph(3) */
else /* fallback: */
return( c <= 0x001F || /* control characters */
return(!(c <= 0x001F || /* control characters */
c >= 0x007F && c <= 0x009F || /* control characters */
c == 0x00A0 || /* non-breaking space */
c == 0x061C || /* arabic letter mark */
@ -353,7 +353,7 @@ static int is_invisible(int c)
c >= 0x2028 && c <= 0x202F || /* separators and format characters */
c >= 0x205F && c <= 0x206F || /* various format characters */
c == 0x3000 || /* ideographic space */
c == 0xFEFF ); /* zero-width non-breaking space */
c == 0xFEFF)); /* zero-width non-breaking space */
}
#endif /* SHOPT_MULTIBYTE */
@ -368,9 +368,7 @@ char *sh_fmtq(const char *string)
int offset;
if(!cp)
return((char*)0);
#if SHOPT_MULTIBYTE
mbinit();
#endif /* SHOPT_MULTIBYTE */
offset = staktell();
state = ((c= mbchar(cp))==0);
if(isaletter(c))
@ -394,11 +392,7 @@ char *sh_fmtq(const char *string)
state = 1;
for(;c;c= mbchar(cp))
{
#if SHOPT_MULTIBYTE
if(c=='\'' || is_invisible(c))
#else
if(c=='\'' || !isprint(c))
#endif /* SHOPT_MULTIBYTE */
if(c=='\'' || !sh_isprint(c))
state = 2;
else if(c==']' || c=='=' || (c!=':' && c<=0x7f && (c=sh_lexstates[ST_NORM][c]) && c!=S_EPAT))
state |=1;
@ -416,11 +410,7 @@ char *sh_fmtq(const char *string)
{
stakwrite("$'",2);
cp = string;
#if SHOPT_MULTIBYTE
while(op = cp, c= mbchar(cp))
#else
while(op = cp, c= *(unsigned char*)cp++)
#endif
{
state=1;
switch(c)
@ -449,7 +439,6 @@ char *sh_fmtq(const char *string)
case '\\': case '\'':
break;
default:
#if SHOPT_MULTIBYTE
if(mbwide())
{
/* We're in a multibyte locale */
@ -460,16 +449,14 @@ char *sh_fmtq(const char *string)
cp = op+1;
goto quote_one_byte;
}
if(is_invisible(c))
if(!sh_isprint(c))
{
/* Unicode hex code */
sfprintf(staksp,"\\u[%x]",c);
continue;
}
}
else
#endif /* SHOPT_MULTIBYTE */
if(!isprint(c))
else if(!isprint(c))
{
quote_one_byte:
sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c);