1
0
Fork 0
mirror of git://git.code.sf.net/p/cdesktopenv/code synced 2025-03-09 15:50:02 +00:00

printf: Fix HTML and URI encoding (%H, %#H)

This applies a number of fixes to the printf formatting directives
%H and %#H (as well as their equivalents %(html)q and %(url)q):
1. Both formatters have been made multibyte/UTF-8 aware, and no
   longer delete multibyte characters. Invalid UTF-8 byte sequences
   are rendered as ASCII question marks.
2. %H no longer wrongly encodes spaces as non-breaking spaces
   (&nbsp;) and instead correctly encodes the UTF-8 non-breaking
   space as such.
3. %H now converts the single quote (') to '&#39;' instead of
   '&apos;', which is not a valid entity in all HTML versions.
4. %#H failed to encode some reserved characters (e.g. '?') while
   encoding some unreserved ones (e.g. '~'). It now percent-encodes
   all characters except those 'unreserved' as per RFC3986 (ASCII
   alphanumeric plus -._~).

Prior discussion:
ce8d1467-4a6d-883b-45ad-fc3c7b90e681%40inlv.org

src/cmd/ksh93/include/defs.h:
src/cmd/ksh93/sh/string.c:
- defs.h: If compiling without SHOPT_MULTIBYTE, redefine the
  mbwide() macro (which tests if we're in a multibyte locale) as 0.
  This lets the compiler optimiser do the work that would otherwise
  require a lot of tedious '#if SHOPT_MULTIBYTE' directives.
- string.c: Remove some now-unneeded '#if SHOPT_MULTIBYTE' stuff.
- defs.h, string.c: Rename is_invisible() to sh_isprint(), invert
  the boolean return value, and make it an extern for use in
  fmthtml() -- see below. If compiling without SHOPT_MULTIBYTE,
  simply #define sh_isprint() as equivalent to isprint(3).
- defs.h: Add URI_RFC3986_UNRESERVED macro for fmthtml() containing
  the characters "unreserved" for purposes of URI percent-encoding.

src/cmd/ksh93/bltins/print.c: fmthtml():
- Remove kludge that skipped all multibyte characters (!).
- Complete rewrite to implement fixes described above.
- Don't bother with '#if SHOPT_MULTIBYTE' directives (see above).

src/cmd/ksh93/data/builtins.c:
- sh_optprintf[]: %H: Add single quote to encoded chars doc.
- Edit credits and bump version date.

src/cmd/ksh93/tests/builtins.sh:
- Update and tweak old regression tests.
- Add a number of new tests for UTF-8 HTML and URI encoding, which
  are only run when running tests in a UTF-8 locale (shtests -u).
This commit is contained in:
Martijn Dekker 2020-08-10 22:15:53 +01:00
parent aff63e382d
commit 8477d2ce22
7 changed files with 149 additions and 61 deletions

14
NEWS
View file

@ -3,6 +3,20 @@ For full details, see the git log at: https://github.com/ksh93/ksh
Any uppercase BUG_* names are modernish shell bug IDs. Any uppercase BUG_* names are modernish shell bug IDs.
2020-08-10:
- A number of fixes have been applied to the printf formatting directives
%H and %#H (as well as the undocumented equivalents %(html)q and %(url)q):
1. Both formatters have been made multibyte/UTF-8 aware, and no longer
delete multibyte characters. Invalid UTF-8 byte sequences are rendered
as ASCII question marks.
2. %H no longer wrongly changes spaces to non-breaking spaces (&nbsp;).
3. %H now converts the single quote (') to '&#39;' instead of '&apos;',
which is not a valid entity in all HTML versions.
4. %#H failed to encode some reserved characters (e.g. '?') while encoding
some unreserved ones (e.g. '~'). It now percent-encodes all characters
except those 'unreserved' as per RFC3986 (ASCII alphanumeric plus -._~).
2020-08-09: 2020-08-09:
- File name generation (a.k.a. pathname expansion, a.k.a. globbing) now - File name generation (a.k.a. pathname expansion, a.k.a. globbing) now

View file

@ -477,46 +477,65 @@ static char *genformat(char *format)
static char *fmthtml(const char *string, int flags) static char *fmthtml(const char *string, int flags)
{ {
register const char *cp = string; register const char *cp = string, *op;
register int c, offset = staktell(); register int c, offset = staktell();
/*
* The only multibyte locale ksh currently supports is UTF-8, which is a superset of ASCII. So, if we're on an
* EBCDIC system, below we attempt to convert EBCDIC to ASCII only if we're not in a multibyte locale (mbwide()).
*/
mbinit();
if(!(flags&SFFMT_ALTER)) if(!(flags&SFFMT_ALTER))
{ {
while(c= *(unsigned char*)cp++) /* Encode for HTML, for inside and outside single- and double-quoted strings. */
while(op = cp, c = mbchar(cp))
{ {
#if SHOPT_MULTIBYTE if(!mbwide())
register int s; c = CCMAPC(c,CC_NATIVE,CC_ASCII);
if((s=mbsize(cp-1)) > 1) if(mbwide() && c < 0) /* invalid multibyte char */
{ stakputc('?');
cp += (s-1); else if(c == 60) /* < */
continue;
}
#endif /* SHOPT_MULTIBYTE */
if(c=='<')
stakputs("&lt;"); stakputs("&lt;");
else if(c=='>') else if(c == 62) /* > */
stakputs("&gt;"); stakputs("&gt;");
else if(c=='&') else if(c == 38) /* & */
stakputs("&amp;"); stakputs("&amp;");
else if(c=='"') else if(c == 34) /* " */
stakputs("&quot;"); stakputs("&quot;");
else if(c=='\'') else if(c == 39) /* ' (&apos; is nonstandard) */
stakputs("&apos;"); stakputs("&#39;");
else if(c==' ') else if(c == 160 && mbwide()) /* non-breaking space */
stakputs("&nbsp;"); stakputs("&nbsp;");
else if(!isprint(c) && c!='\n' && c!='\r') else if(!sh_isprint(c) && c!='\n' && c!='\r')
sfprintf(stkstd,"&#%X;",CCMAPC(c,CC_NATIVE,CC_ASCII)); sfprintf(stkstd, "&#%d;", c);
else else
stakputc(c); stakwrite(op, cp-op);
} }
} }
else else
{ {
while(c= *(unsigned char*)cp++) /* Percent-encode for URI. Ref.: RFC 3986, section 2.3 */
if(mbwide())
{ {
if(strchr("!*'();@&+$,#[]<>~.\"{}|\\-`^% ",c) || (!isprint(c) && c!='\n' && c!='\r')) while(op = cp, c = mbchar(cp))
sfprintf(stkstd,"%%%02X",CCMAPC(c,CC_NATIVE,CC_ASCII)); {
else if(c < 0)
stakputc(c); stakputs("%3F");
else if(c <= 255 && strchr(URI_RFC3986_UNRESERVED, c))
stakwrite(op, cp-op);
else
while(c = *(unsigned char*)op++, op <= cp)
sfprintf(stkstd, "%%%02X", c);
}
}
else
{
while(c = *(unsigned char*)cp++)
{
if(strchr(URI_RFC3986_UNRESERVED, c))
stakputc(c);
else
sfprintf(stkstd, "%%%02X", CCMAPC(c, CC_NATIVE, CC_ASCII));
}
} }
} }
stakputc(0); stakputc(0);

View file

@ -1180,7 +1180,7 @@ USAGE_LICENSE
; ;
const char sh_optprintf[] = const char sh_optprintf[] =
"[-1c?\n@(#)$Id: printf (AT&T Research) 2009-02-02 $\n]" "[-1c?\n@(#)$Id: printf (AT&T Research/ksh93) 2020-08-10 $\n]"
USAGE_LICENSE USAGE_LICENSE
"[+NAME?printf - write formatted output]" "[+NAME?printf - write formatted output]"
"[+DESCRIPTION?\bprintf\b writes each \astring\a operand to " "[+DESCRIPTION?\bprintf\b writes each \astring\a operand to "
@ -1211,7 +1211,7 @@ USAGE_LICENSE
"[+%B?Treat the argument as a variable name and output the value " "[+%B?Treat the argument as a variable name and output the value "
"without converting it to a string. This is most useful for " "without converting it to a string. This is most useful for "
"variables of type \b-b\b.]" "variables of type \b-b\b.]"
"[+%H?Output \astring\a with characters \b<\b, \b&\b, \b>\b, " "[+%H?Output \astring\a with characters \b<\b, \b&\b, \b>\b, \b'\b, "
"\b\"\b, and non-printable characters properly escaped for " "\b\"\b, and non-printable characters properly escaped for "
"use in HTML and XML documents. The alternate flag \b#\b " "use in HTML and XML documents. The alternate flag \b#\b "
"formats the output for use as a URI.]" "formats the output for use as a URI.]"

View file

@ -29,6 +29,11 @@
#define defs_h_defined #define defs_h_defined
#include <ast.h> #include <ast.h>
/*
 * When built without SHOPT_MULTIBYTE, replace mbwide() (the "are we in a
 * multibyte locale?" test) with a constant 0, so code guarded by mbwide()
 * can be optimised away without per-site preprocessor conditionals.
 */
#if !SHOPT_MULTIBYTE
# undef mbwide
# define mbwide() (0) /* disable multibyte without need for further '#if SHOPT_MULTIBYTE' */
#endif
#include <sfio.h> #include <sfio.h>
#include <error.h> #include <error.h>
#include "FEATURE/externs" #include "FEATURE/externs"
@ -441,6 +446,14 @@ extern int sh_whence(char**,int);
extern Namval_t *sh_fsearch(Shell_t*,const char *,int); extern Namval_t *sh_fsearch(Shell_t*,const char *,int);
#endif /* SHOPT_NAMESPACE */ #endif /* SHOPT_NAMESPACE */
#if SHOPT_MULTIBYTE
extern int sh_isprint(int);
#else
# define sh_isprint(c) isprint(c)
#endif /* SHOPT_MULTIBYTE */
#define URI_RFC3986_UNRESERVED "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
#ifndef ERROR_dictionary #ifndef ERROR_dictionary
# define ERROR_dictionary(s) (s) # define ERROR_dictionary(s) (s)
#endif #endif

View file

@ -17,4 +17,4 @@
* David Korn <dgk@research.att.com> * * David Korn <dgk@research.att.com> *
* * * *
***********************************************************************/ ***********************************************************************/
#define SH_RELEASE "93u+m 2020-08-09" #define SH_RELEASE "93u+m 2020-08-10"

View file

@ -191,9 +191,7 @@ char *sh_substitute(const char *string,const char *oldsp,char *newsp)
return((char*)0); return((char*)0);
if(*(cp=oldsp) == 0) if(*(cp=oldsp) == 0)
goto found; goto found;
#if SHOPT_MULTIBYTE
mbinit(); mbinit();
#endif /* SHOPT_MULTIBYTE */
do do
{ {
/* skip to first character which matches start of oldsp */ /* skip to first character which matches start of oldsp */
@ -331,19 +329,21 @@ static char *sh_fmtcsv(const char *string)
#if SHOPT_MULTIBYTE #if SHOPT_MULTIBYTE
/* /*
* Returns true if c is an invisible Unicode character, excluding ASCII space. * Note: without SHOPT_MULTIBYTE, defs.h makes this an alias of isprint(3).
*
* Returns false if c is an invisible Unicode character, excluding ASCII space.
* Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is
* generally not possible as the OS-provided iswgraph(3) doesn't support that * generally not possible as the OS-provided iswgraph(3) doesn't support that
* locale. So do a quick test and do our best with a fallback if necessary. * locale. So do a quick test and do our best with a fallback if necessary.
*/ */
static int is_invisible(int c) int sh_isprint(int c)
{ {
if(!mbwide()) /* not in multibyte locale? */ if(!mbwide()) /* not in multibyte locale? */
return(c != ' ' && !isgraph(c)); /* use plain isgraph(3) */ return(isprint(c)); /* use plain isprint(3) */
else if(iswgraph(0x5E38) && !iswgraph(0xFEFF)) /* can we use iswgraph(3)? */ else if(iswgraph(0x5E38) && !iswgraph(0xFEFF)) /* can we use iswgraph(3)? */
return(c != ' ' && !iswgraph(c)); /* use iswgraph(3) */ return(c == ' ' || iswgraph(c)); /* use iswgraph(3) */
else /* fallback: */ else /* fallback: */
return( c <= 0x001F || /* control characters */ return(!(c <= 0x001F || /* control characters */
c >= 0x007F && c <= 0x009F || /* control characters */ c >= 0x007F && c <= 0x009F || /* control characters */
c == 0x00A0 || /* non-breaking space */ c == 0x00A0 || /* non-breaking space */
c == 0x061C || /* arabic letter mark */ c == 0x061C || /* arabic letter mark */
@ -353,7 +353,7 @@ static int is_invisible(int c)
c >= 0x2028 && c <= 0x202F || /* separators and format characters */ c >= 0x2028 && c <= 0x202F || /* separators and format characters */
c >= 0x205F && c <= 0x206F || /* various format characters */ c >= 0x205F && c <= 0x206F || /* various format characters */
c == 0x3000 || /* ideographic space */ c == 0x3000 || /* ideographic space */
c == 0xFEFF ); /* zero-width non-breaking space */ c == 0xFEFF)); /* zero-width non-breaking space */
} }
#endif /* SHOPT_MULTIBYTE */ #endif /* SHOPT_MULTIBYTE */
@ -368,9 +368,7 @@ char *sh_fmtq(const char *string)
int offset; int offset;
if(!cp) if(!cp)
return((char*)0); return((char*)0);
#if SHOPT_MULTIBYTE
mbinit(); mbinit();
#endif /* SHOPT_MULTIBYTE */
offset = staktell(); offset = staktell();
state = ((c= mbchar(cp))==0); state = ((c= mbchar(cp))==0);
if(isaletter(c)) if(isaletter(c))
@ -394,11 +392,7 @@ char *sh_fmtq(const char *string)
state = 1; state = 1;
for(;c;c= mbchar(cp)) for(;c;c= mbchar(cp))
{ {
#if SHOPT_MULTIBYTE if(c=='\'' || !sh_isprint(c))
if(c=='\'' || is_invisible(c))
#else
if(c=='\'' || !isprint(c))
#endif /* SHOPT_MULTIBYTE */
state = 2; state = 2;
else if(c==']' || c=='=' || (c!=':' && c<=0x7f && (c=sh_lexstates[ST_NORM][c]) && c!=S_EPAT)) else if(c==']' || c=='=' || (c!=':' && c<=0x7f && (c=sh_lexstates[ST_NORM][c]) && c!=S_EPAT))
state |=1; state |=1;
@ -416,11 +410,7 @@ char *sh_fmtq(const char *string)
{ {
stakwrite("$'",2); stakwrite("$'",2);
cp = string; cp = string;
#if SHOPT_MULTIBYTE
while(op = cp, c= mbchar(cp)) while(op = cp, c= mbchar(cp))
#else
while(op = cp, c= *(unsigned char*)cp++)
#endif
{ {
state=1; state=1;
switch(c) switch(c)
@ -449,7 +439,6 @@ char *sh_fmtq(const char *string)
case '\\': case '\'': case '\\': case '\'':
break; break;
default: default:
#if SHOPT_MULTIBYTE
if(mbwide()) if(mbwide())
{ {
/* We're in a multibyte locale */ /* We're in a multibyte locale */
@ -460,16 +449,14 @@ char *sh_fmtq(const char *string)
cp = op+1; cp = op+1;
goto quote_one_byte; goto quote_one_byte;
} }
if(is_invisible(c)) if(!sh_isprint(c))
{ {
/* Unicode hex code */ /* Unicode hex code */
sfprintf(staksp,"\\u[%x]",c); sfprintf(staksp,"\\u[%x]",c);
continue; continue;
} }
} }
else else if(!isprint(c))
#endif /* SHOPT_MULTIBYTE */
if(!isprint(c))
{ {
quote_one_byte: quote_one_byte:
sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c); sfprintf(staksp, isxdigit(*cp) ? "\\x[%.2x]" : "\\x%.2x", c);

View file

@ -275,15 +275,70 @@ OPTIND=1
if [[ $(getopts $'[+?X\ffoobar\fX]' v --man 2>&1) != *'Xhello world'X* ]] if [[ $(getopts $'[+?X\ffoobar\fX]' v --man 2>&1) != *'Xhello world'X* ]]
then err_exit '\f...\f not working in getopts usage strings' then err_exit '\f...\f not working in getopts usage strings'
fi fi
if [[ $(printf '%H\n' $'<>"& \'\tabc') != '&lt;&gt;&quot;&amp;&nbsp;&apos;&#9;abc' ]]
then err_exit 'printf %H not working' expect='&lt;&gt;&quot;&amp; &#39;&#9;abc'
fi actual=$(printf '%H\n' $'<>"& \'\tabc')
if [[ $(printf '%(html)q\n' $'<>"& \'\tabc') != '&lt;&gt;&quot;&amp;&nbsp;&apos;&#9;abc' ]] [[ $expect == "$actual" ]] || err_exit 'printf %H not working' \
then err_exit 'printf %(html)q not working' "(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
fi actual=$(printf '%(html)q\n' $'<>"& \'\tabc')
if [[ $( printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc') != 'foo://ab_c%3C%3E%22%26%20%27%09abc' ]] [[ $expect == "$actual" ]] || err_exit 'printf %(html)q not working' \
then err_exit 'printf %(url)q not working' "(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
fi
expect='foo://ab_c%3C%3E%22%26%20%27%09abc'
actual=$(printf 'foo://ab_c%#H\n' $'<>"& \'\tabc')
[[ $expect == "$actual" ]] || err_exit 'printf %#H not working' \
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
actual=$(printf 'foo://ab_c%(url)q\n' $'<>"& \'\tabc')
[[ $expect == "$actual" ]] || err_exit 'printf %(url)q not working' \
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
# HTML encoding UTF-8 characters
expect='what?'
actual=$(printf %H 'what?')
[[ $actual == "$expect" ]] || err_exit 'printf %H: ASCII characters' \
"(expected $expect; got $actual)"
expect='عندما يريد العالم أن &#8234;يتكلّم &#8236; ، فهو يتحدّث بلغة يونيكود.'
actual=$(printf %H 'عندما يريد العالم أن ‪يتكلّم ، فهو يتحدّث بلغة يونيكود.')
[[ $actual == "$expect" ]] || err_exit 'printf %H: Arabic UTF-8 characters' \
"(expected $expect; got $actual)"
expect='正常終了 正常終了'
actual=$(printf %H '正常終了 正常終了')
[[ $actual == "$expect" ]] || err_exit 'printf %H: Japanese UTF-8 characters' \
"(expected $expect; got $actual)"
expect='« labîme de mon&nbsp;métier… »'
actual=$(printf %H '« labîme de mon métier… »')
[[ $actual == "$expect" ]] || err_exit 'printf %H: Latin UTF-8 characters' \
"(expected $expect; got $actual)"
expect='?&#134;???'
actual=$(printf %H $'\x86\u86\xF0\x96\x76\xA7\xB5')
[[ $actual == "$expect" ]] || err_exit 'printf %H: invalid UTF-8 characters' \
"(expected $expect; got $actual)"
# URL/URI encoding of UTF-8 characters
expect='wh.at%3F'
actual=$(printf %#H 'wh.at?')
# This checks URI encoding, so name the %#H directive in the failure message.
[[ $actual == "$expect" ]] || err_exit 'printf %#H: ASCII characters' \
"(expected $expect; got $actual)"
expect='%D8%B9%D9%86%D8%AF%D9%85%D8%A7%20%D9%8A%D8%B1%D9%8A%D8%AF%20%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85%20%D8%A3%D9%86%20%E2%80%AA%D9%8A%D8%AA%D9%83%D9%84%D9%91%D9%85%20%E2%80%AC%20%D8%8C%20%D9%81%D9%87%D9%88%20%D9%8A%D8%AA%D8%AD%D8%AF%D9%91%D8%AB%20%D8%A8%D9%84%D8%BA%D8%A9%20%D9%8A%D9%88%D9%86%D9%8A%D9%83%D9%88%D8%AF.'
actual=$(printf %#H 'عندما يريد العالم أن ‪يتكلّم ، فهو يتحدّث بلغة يونيكود.')
# This checks URI encoding, so name the %#H directive in the failure message.
[[ $actual == "$expect" ]] || err_exit 'printf %#H: Arabic UTF-8 characters' \
"(expected $expect; got $actual)"
expect='%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86%20%E6%AD%A3%E5%B8%B8%E7%B5%82%E4%BA%86'
actual=$(printf %#H '正常終了 正常終了')
# This checks URI encoding, so name the %#H directive in the failure message.
[[ $actual == "$expect" ]] || err_exit 'printf %#H: Japanese UTF-8 characters' \
"(expected $expect; got $actual)"
expect='%C2%AB%20l%E2%80%99ab%C3%AEme%20de%20mon%C2%A0m%C3%A9tier%E2%80%A6%20%C2%BB'
actual=$(printf %#H '« labîme de mon métier… »')
# This checks URI encoding, so name the %#H directive in the failure message.
[[ $actual == "$expect" ]] || err_exit 'printf %#H: Latin UTF-8 characters' \
"(expected $expect; got $actual)"
expect='%3F%C2%86%3F%3F%3F'
actual=$(printf %#H $'\x86\u86\xF0\x96\x76\xA7\xB5')
# This checks URI encoding, so name the %#H directive in the failure message.
[[ $actual == "$expect" ]] || err_exit 'printf %#H: invalid UTF-8 characters' \
"(expected $expect; got $actual)"
;;
esac
if [[ $(printf '%R %R %R %R\n' 'a.b' '*.c' '^' '!(*.*)') != '^a\.b$ \.c$ ^\^$ ^(.*\..*)!$' ]] if [[ $(printf '%R %R %R %R\n' 'a.b' '*.c' '^' '!(*.*)') != '^a\.b$ \.c$ ^\^$ ^(.*\..*)!$' ]]
then err_exit 'printf %T not working' then err_exit 'printf %T not working'
fi fi