mirror of
				git://git.code.sf.net/p/cdesktopenv/code
				synced 2025-03-09 15:50:02 +00:00 
			
		
		
		
	Fix UTF-8 shellquoting for xtrace, printf %q, etc.
This fixes an annoying issue in the shell's quoting algorithm (used for xtrace (set -x), printf %q, and other things) for UTF-8 locales, that caused it to encode perfectly printable UTF-8 characters unnecessarily and inconsistently. For example:

$ (set -x; : 'aeu aéu')
+ : $'aeu a\u[e9]u'
$ (set -x; : 'aéu aeu')
+ : 'aéu aeu'
$ (set -x; : '正常終了 aeu')
+ : '正常終了 aeu'
$ (set -x; : 'aeu 正常終了')
+ : $'aeu \u[6b63]\u[5e38]\u[7d42]\u[4e86]'

This issue was originally reported by lijo george in May 2017: https://www.mail-archive.com/ast-developers@lists.research.att.com/msg01958.html

src/cmd/ksh93/sh/string.c:
- Add is_invisible() function that returns true if a character is a Unicode invisible (non-graph) character, excluding ASCII space. Ref.: https://unicode.org/charts/PDF/U2000.pdf
- Use a fallback in is_invisible() if we cannot use the system's iswgraph(3); this is the case for the ksh C.UTF-8 locale if the OS doesn't support that. Fall back to a hardcoded blacklist of invisible and control characters and put up with not encoding nonexistent characters into \u[xxxx] escapes. Ref.: https://unicode.org/charts/PDF/U2000.pdf
- When deciding whether to switch to $'...' quoting mode (state=2), use is_invisible() instead of testing for ASCII 0-127 range.
- In $'...' quoting mode, use is_invisible() to decide whether to encode wide characters into \u[xxxx] escapes.

src/cmd/ksh93/tests/builtins.sh:
- Add regression tests for shellquoting Arabic, Japanese and Latin UTF-8 characters, to be run only in a UTF-8 locale. The Arabic sample text[*] contains a couple of direction markers that are expected to be encoded into \u[xxxx] escapes.

[*] source: https://r12a.github.io/scripts/tutorial/summaries/arabic
This commit is contained in:
		
							parent
							
								
									588a1ff7ca
								
							
						
					
					
						commit
						f9d28935bb
					
				
					 3 changed files with 52 additions and 2 deletions
				
			
		
							
								
								
									
										4
									
								
								NEWS
									
										
									
									
									
								
							
							
						
						
									
										4
									
								
								NEWS
									
										
									
									
									
								
							| 
						 | 
					@ -18,6 +18,10 @@ Any uppercase BUG_* names are modernish shell bug IDs.
 | 
				
			||||||
- Fixed a bug that corrupted KIA/CQL cross-reference databases created using
 | 
					- Fixed a bug that corrupted KIA/CQL cross-reference databases created using
 | 
				
			||||||
  ksh's -R option; shell warnings were wrongly included in the database file.
 | 
					  ksh's -R option; shell warnings were wrongly included in the database file.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- The shell's quoting algorithm (used in xtrace, printf %q, and more) has been
 | 
				
			||||||
 | 
					  fixed for UTF-8 (Unicode) locales; it no longer needlessly and inconsistently
 | 
				
			||||||
 | 
					  encodes normal printable UTF-8 characters into hexadecimal \u[xxxx] codes.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
2020-07-07:
 | 
					2020-07-07:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- Four of the date formats accepted by 'printf %()T' have had their
 | 
					- Four of the date formats accepted by 'printf %()T' have had their
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -325,6 +325,34 @@ static char	*sh_fmtcsv(const char *string)
 | 
				
			||||||
	return(stakptr(offset));
 | 
						return(stakptr(offset));
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#if SHOPT_MULTIBYTE
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * Returns true if c is an invisible Unicode character, excluding ASCII space.
 | 
				
			||||||
 | 
					 * Use iswgraph(3) if possible. In the ksh-specific C.UTF-8 locale, this is
 | 
				
			||||||
 | 
					 * generally not possible as the OS-provided iswgraph(3) doesn't support that
 | 
				
			||||||
 | 
					 * locale. So do a quick test and do our best with a fallback if necessary.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static int	is_invisible(int c)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						if(!mbwide())					/* not in multibyte locale? */
 | 
				
			||||||
 | 
							return(c != ' ' && !isgraph(c));	/* use plain isgraph(3) */
 | 
				
			||||||
 | 
						else if(iswgraph(0x5E38) && !iswgraph(0xFEFF))	/* can we use iswgraph(3)? */
 | 
				
			||||||
 | 
							return(c != ' ' && !iswgraph(c));	/* use iswgraph(3) */
 | 
				
			||||||
 | 
						else						/* fallback: */
 | 
				
			||||||
 | 
							return(	c <= 0x001F ||			/* control characters */
 | 
				
			||||||
 | 
								c >= 0x007F && c <= 0x009F ||	/* control characters */
 | 
				
			||||||
 | 
								c == 0x00A0 ||			/* non-breaking space */
 | 
				
			||||||
 | 
								c == 0x061C ||			/* arabic letter mark */
 | 
				
			||||||
 | 
								c == 0x1680 ||			/* ogham space mark */
 | 
				
			||||||
 | 
								c == 0x180E ||			/* mongolian vowel separator */
 | 
				
			||||||
 | 
								c >= 0x2000 && c <= 0x200F ||	/* spaces and format characters */
 | 
				
			||||||
 | 
								c >= 0x2028 && c <= 0x202F ||	/* separators and format characters */
 | 
				
			||||||
 | 
								c >= 0x205F && c <= 0x206F ||	/* various format characters */
 | 
				
			||||||
 | 
								c == 0x3000 ||			/* ideographic space */
 | 
				
			||||||
 | 
								c == 0xFEFF );			/* zero-width non-breaking space */
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					#endif /* SHOPT_MULTIBYTE */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 * print <str> quoting chars so that it can be read by the shell
 | 
					 * print <str> quoting chars so that it can be read by the shell
 | 
				
			||||||
 * puts null terminated result on stack, but doesn't freeze it
 | 
					 * puts null terminated result on stack, but doesn't freeze it
 | 
				
			||||||
| 
						 | 
					@ -363,7 +391,7 @@ char	*sh_fmtq(const char *string)
 | 
				
			||||||
	for(;c;c= mbchar(cp))
 | 
						for(;c;c= mbchar(cp))
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
#if SHOPT_MULTIBYTE
 | 
					#if SHOPT_MULTIBYTE
 | 
				
			||||||
		if(c=='\'' || c>=128 || c<0 || !iswprint(c)) 
 | 
							if(c=='\'' || is_invisible(c))
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
		if(c=='\'' || !isprint(c))
 | 
							if(c=='\'' || !isprint(c))
 | 
				
			||||||
#endif /* SHOPT_MULTIBYTE */
 | 
					#endif /* SHOPT_MULTIBYTE */
 | 
				
			||||||
| 
						 | 
					@ -426,7 +454,7 @@ char	*sh_fmtq(const char *string)
 | 
				
			||||||
					cp = op+1;
 | 
										cp = op+1;
 | 
				
			||||||
					isbyte = 1;
 | 
										isbyte = 1;
 | 
				
			||||||
				}
 | 
									}
 | 
				
			||||||
				if(mbwide() && ((cp-op)>1))
 | 
									if(mbwide() && is_invisible(c))
 | 
				
			||||||
				{
 | 
									{
 | 
				
			||||||
					sfprintf(staksp,"\\u[%x]",c);
 | 
										sfprintf(staksp,"\\u[%x]",c);
 | 
				
			||||||
					continue;
 | 
										continue;
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -318,6 +318,24 @@ LC_CTYPE=POSIX true	    # on buggy ksh, a locale re-init via temp assignment res
 | 
				
			||||||
[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \
 | 
					[[ $actual == "$expect" ]] || err_exit 'shell-quoting corrupted after interrupted processing of UTF-8 char' \
 | 
				
			||||||
				"(expected $expect; got $actual)"
 | 
									"(expected $expect; got $actual)"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# shell-quoting UTF-8 characters: check for unnecessary encoding
 | 
				
			||||||
 | 
					case ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} in
 | 
				
			||||||
 | 
					( *[Uu][Tt][Ff]8* | *[Uu][Tt][Ff]-8* )
 | 
				
			||||||
 | 
						expect=$'$\'عندما يريد العالم أن \\u[202a]يتكلّم \\u[202c] ، فهو يتحدّث بلغة يونيكود.\''
 | 
				
			||||||
 | 
						actual=$(printf %q 'عندما يريد العالم أن يتكلّم  ، فهو يتحدّث بلغة يونيكود.')
 | 
				
			||||||
 | 
						[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Arabic UTF-8 characters' \
 | 
				
			||||||
 | 
									"(expected $expect; got $actual)"
 | 
				
			||||||
 | 
						expect="'正常終了 正常終了'"
 | 
				
			||||||
 | 
						actual=$(printf %q '正常終了 正常終了')
 | 
				
			||||||
 | 
						[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Japanese UTF-8 characters' \
 | 
				
			||||||
 | 
									"(expected $expect; got $actual)"
 | 
				
			||||||
 | 
						expect="'aeu aéu'"
 | 
				
			||||||
 | 
						actual=$(printf %q 'aeu aéu')
 | 
				
			||||||
 | 
						[[ $actual == "$expect" ]] || err_exit 'shell-quoting: Latin UTF-8 characters' \
 | 
				
			||||||
 | 
									"(expected $expect; got $actual)"
 | 
				
			||||||
 | 
						;;
 | 
				
			||||||
 | 
					esac
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# ======
 | 
					# ======
 | 
				
			||||||
# we won't get hit by the one second boundary twice, right?
 | 
					# we won't get hit by the one second boundary twice, right?
 | 
				
			||||||
expect= actual=
 | 
					expect= actual=
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue