Refactor "$*" multibyte handling (re: 8b5f11dc)

The first of the two multibyte fixes from 8b5f11dc (which was for using the first character of IFS as an output field separator when expanding "$*" and similar) had a minor backwards compatibility problem: if $IFS started with a byte sequence that is not a valid UTF-8 character, then it treated IFS as empty in UTF-8 locales, so the fields would be joined without any separator. The expected behaviour would be for it to fall back to using the first byte of IFS as it used to (and as bash and zsh do). The new code handling this was also a bit kludgy and inefficient, repeating the mbsize() calculation for every byte of the separator character and for every field joined by the expansion. src/cmd/ksh93/sh/macro.c: varsub(): - Rewrite code for joining fields for $* in a quoted or scalar context and $@ in a scalar context, eliminating a confusing 'd' variable and concentrating the routine in one block. - When expanding $* with a multibyte separator (first character of $IFS), only calculate the size in bytes once per expansion. - If $IFS starts with a byte sequence that represents an invalid multibyte character, fall back to using the first byte. src/cmd/ksh93/tests/variables.sh: - Tweak some regression tests, including one that overwrote $LANG. - Add test for invalid multibyte character behaviour as per above.
2025-03-09 15:50:02 +00:00 · 2020-08-29 21:52:29 +01:00 · 2020-08-29 21:52:29 +01:00 · 7c5d39fa04
commit 7c5d39fa04
parent 8f813bb0a3
2 changed files with 51 additions and 38 deletions
--- a/src/cmd/ksh93/sh/macro.c
+++ b/src/cmd/ksh93/sh/macro.c
@ -71,7 +71,7 @@ typedef struct  _mac_
 	char		*ifsp;		/* pointer to IFS value */
 	int		fields;		/* number of fields */
 	short		quoted;		/* set when word has quotes */
-	unsigned char	ifs;		/* first char of IFS */
+	unsigned char	ifs;		/* first byte of IFS */
 	char		atmode;		/* when processing $@ */
 	char		quote;		/* set within double quoted contexts */
 	char		lit;		/* set within single quotes */
@ -1818,7 +1818,7 @@ retry1:
 retry2:
 	if(v && (!nulflg || *v ) && c!='+')
 	{
-		register int d = (mode=='@'?' ':mp->ifs);
+		int ofs_size = 0;
 		regoff_t match[2*(MATCH_MAX+1)];
 		int nmatch, nmatch_prev, vsize_last;
 		char *vlast;
@ -1955,36 +1955,30 @@ retry2:
 				mp->atmode = mode=='@';
 				mp->pattern = oldpat;
 			}
-			else if(d)
+			else
 			{
-#if SHOPT_MULTIBYTE
 				Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp;
-
 				/*
-				 * We know from above that if we are not performing @-expansion
-				 * then we assigned `d` the value of `mp->ifs`, here we check
-				 * whether or not we have a valid string of IFS characters to
-				 * write as it is possible for `d` to be set to `mp->ifs` and
-				 * yet `mp->ifsp` to be NULL.
+				 * We're joining fields into one; write the output field separator, which may be multi-byte.
+				 * For "$@" it's a space, for "$*" it's the 1st char of IFS (space if unset, none if empty).
 				 */
-				if(mode != '@' && mp->ifsp)
+				if(mode == '@' || !mp->ifsp)		/* if expanding $@ or if IFS is unset... */
+					sfputc(sfio_ptr, ' ');
+				else if(mp->ifs)			/* else if IFS is non-empty... */
 				{
-					/*
-					 * Handle multi-byte characters being used for the internal
-					 * field separator (IFS).
-					 */
-					int i;
-					for(i = 0; i < mbsize(mp->ifsp); i++)
-						sfputc(sfio_ptr,mp->ifsp[i]);
+					if(!mbwide() || mp->ifs < 128)	/* if single-byte char... */
+						sfputc(sfio_ptr, mp->ifs);
+					else
+					{
+						if(!ofs_size)		/* only calculate this once per expansion */
+						{
+							ofs_size = mbsize(mp->ifsp);
+							if(ofs_size<0)	/* invalid mb char: fall back to using first byte */
+								ofs_size = 1;
+						}
+						sfwrite(sfio_ptr, mp->ifsp, ofs_size);
+					}
 				}
-				else
-					sfputc(sfio_ptr,d);
-#else
-				if(mp->sp)
-					sfputc(mp->sp,d);
-				else
-					sfputc(stkp,d);
-#endif
 			}
 		}
 		if(arrmax)
--- a/src/cmd/ksh93/tests/variables.sh
+++ b/src/cmd/ksh93/tests/variables.sh
@ -20,7 +20,7 @@
 function err_exit
 {
 	print -u2 -n "\t"
-	print -u2 -r ${Command}[$1]: "${@:2}"
+	print -u2 -r "${Command}[$1]: ${@:2}"
 	let Errors+=1
 }
 alias err_exit='err_exit $LINENO'
@ -167,10 +167,17 @@ COUNT=0
 if	(( COUNT != 1 || ACCESS!=2 ))
 then	err_exit " set discipline failure COUNT=$COUNT ACCESS=$ACCESS"
 fi
+
+save_LANG=$LANG
 LANG=C > /dev/null 2>&1
 if	[[ $LANG != C ]]
 then	err_exit "C locale not working"
 fi
+LANG=$save_LANG
+if	[[ $LANG != "$save_LANG" ]]
+then	err_exit "$save_LANG locale not working"
+fi
+
 unset RANDOM
 unset -n foo
 foo=junk
@ -205,8 +212,7 @@ do	false
 	if	[[ $i != [@*] && ${foo#?} != "$bar"  ]]
 	then	err_exit "\${$i#?} not correct"
 	fi
-	command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives synta
-x error"
+	command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives syntax error"
 	if	[[ $i != @([@*]) && ${#foo} != "$bar" ]]
 	then	err_exit "\${#$i} not correct"
 	fi
@ -436,16 +442,18 @@ case $(unset IFS; set -- $v; print $#) in
 esac

 # Multi-byte characters should work with $IFS
-(
-	LC_ALL=C.UTF-8  # The multi-byte tests are pointless without UTF-8
-
+if [[ ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} =~ [Uu][Tt][Ff]-?8 ]]	# The multi-byte tests are pointless without UTF-8
+then
 	# Test the following characters:
 	# Lowercase accented e  (two bytes)
 	# Roman sestertius sign (four bytes)
 	for delim in é 𐆘; do
-		IFS="$delim"
+		IFS=$delim
 		set : :
-		[ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)"
+		expect=:$delim:
+		actual=$*
+		[[ $actual == "$expect" ]] || err_exit "IFS failed with multi-byte character $delim" \
+			"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"

 		read -r first second third <<< "one${delim}two${delim}three"
 		[[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)"
@ -453,9 +461,11 @@ esac
 		[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $three)"

 		# Ensure subshells don't get corrupted when IFS becomes a multi-byte character
-		expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")"
-		output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)"
-		[[ $output == $expected_output ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)"
+		IFS=$' \t\n'
+		expect=$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")
+		actual=$(set : :; IFS=$delim; echo "$*"; trap "echo end" EXIT; trap)
+		[[ $actual == "$expect" ]] || err_exit "IFS in subshell failed with multi-byte character $delim" \
+			"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
 	done

 	# Multibyte characters with the same initial byte shouldn't be parsed as the same
@ -466,7 +476,16 @@ esac
 	set -- $v
 	v="${#},${1-},${2-},${3-}"
 	[[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character"
-)
+fi
+
+# Ensure fallback to first byte if IFS doesn't start with a valid multibyte character
+# (however, this test should pass regardless of the locale)
+IFS=$'\x[A0]a'
+set : :
+expect=$':\x[A0]:'
+actual=$*
+[[ $actual == "$expect" ]] || err_exit "IFS failed with invalid multi-byte character" \
+	"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"

 # ^^^ end: IFS tests ^^^
 # restore default split: