From 7c5d39fa04a4bf14e82a69646365a2a5ee141ed4 Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Sat, 29 Aug 2020 21:52:29 +0100 Subject: [PATCH] Refactor "$*" multibyte handling (re: 8b5f11dc) The first of the two multibyte fixes from 8b5f11dc (which was for using the first character of IFS as an output field separator when expanding "$*" and similar) had a minor backwards compatibility problem: if $IFS started with a byte sequence that is not a valid UTF-8 character, then it treated IFS as empty in UTF-8 locales, so the fields would be joined without any separator. The expected behaviour would be for it to fall back to using the first byte of IFS as it used to (and as bash and zsh do). The new code handling this was also a bit kludgy and inefficient, repeating the mbsize() calculation for every byte of the separator character and for every field joined by the expansion. src/cmd/ksh93/sh/macro.c: varsub(): - Rewrite code for joining fields for $* in a quoted or scalar context and $@ in a scalar context, eliminating a confusing 'd' variable and concentrating the routine in one block. - When expanding $* with a multibyte separator (first character of $IFS), only calculate the size in bytes once per expansion. - If $IFS starts with a byte sequence that represents an invalid multibyte character, fall back to using the first byte. src/cmd/ksh93/tests/variables.sh: - Tweak some regression tests, including one that overwrote $LANG. - Add test for invalid multibyte character behaviour as per above. --- src/cmd/ksh93/sh/macro.c | 46 ++++++++++++++------------------ src/cmd/ksh93/tests/variables.sh | 43 ++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/src/cmd/ksh93/sh/macro.c b/src/cmd/ksh93/sh/macro.c index b785435bf..038619554 100644 --- a/src/cmd/ksh93/sh/macro.c +++ b/src/cmd/ksh93/sh/macro.c @@ -71,7 +71,7 @@ typedef struct _mac_ char *ifsp; /* pointer to IFS value */ int fields; /* number of fields */ short quoted; /* set when word has quotes */ - unsigned char ifs; /* first char of IFS */ + unsigned char ifs; /* first byte of IFS */ char atmode; /* when processing $@ */ char quote; /* set within double quoted contexts */ char lit; /* set within single quotes */ @@ -1818,7 +1818,7 @@ retry1: retry2: if(v && (!nulflg || *v ) && c!='+') { - register int d = (mode=='@'?' ':mp->ifs); + int ofs_size = 0; regoff_t match[2*(MATCH_MAX+1)]; int nmatch, nmatch_prev, vsize_last; char *vlast; @@ -1955,36 +1955,30 @@ retry2: mp->atmode = mode=='@'; mp->pattern = oldpat; } - else if(d) + else { -#if SHOPT_MULTIBYTE Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp; - /* - * We know from above that if we are not performing @-expansion - * then we assigned `d` the value of `mp->ifs`, here we check - * whether or not we have a valid string of IFS characters to - * write as it is possible for `d` to be set to `mp->ifs` and - * yet `mp->ifsp` to be NULL. + * We're joining fields into one; write the output field separator, which may be multi-byte. + * For "$@" it's a space, for "$*" it's the 1st char of IFS (space if unset, none if empty). */ - if(mode != '@' && mp->ifsp) + if(mode == '@' || !mp->ifsp) /* if expanding $@ or if IFS is unset... */ + sfputc(sfio_ptr, ' '); + else if(mp->ifs) /* else if IFS is non-empty... */ { - /* - * Handle multi-byte characters being used for the internal - * field separator (IFS). - */ - int i; - for(i = 0; i < mbsize(mp->ifsp); i++) - sfputc(sfio_ptr,mp->ifsp[i]); + if(!mbwide() || mp->ifs < 128) /* if single-byte char... */ + sfputc(sfio_ptr, mp->ifs); + else + { + if(!ofs_size) /* only calculate this once per expansion */ + { + ofs_size = mbsize(mp->ifsp); + if(ofs_size<0) /* invalid mb char: fall back to using first byte */ + ofs_size = 1; + } + sfwrite(sfio_ptr, mp->ifsp, ofs_size); + } } - else - sfputc(sfio_ptr,d); -#else - if(mp->sp) - sfputc(mp->sp,d); - else - sfputc(stkp,d); -#endif } } if(arrmax) diff --git a/src/cmd/ksh93/tests/variables.sh b/src/cmd/ksh93/tests/variables.sh index 96266856f..e500adf22 100755 --- a/src/cmd/ksh93/tests/variables.sh +++ b/src/cmd/ksh93/tests/variables.sh @@ -20,7 +20,7 @@ function err_exit { print -u2 -n "\t" - print -u2 -r ${Command}[$1]: "${@:2}" + print -u2 -r "${Command}[$1]: ${@:2}" let Errors+=1 } alias err_exit='err_exit $LINENO' @@ -167,10 +167,17 @@ COUNT=0 if (( COUNT != 1 || ACCESS!=2 )) then err_exit " set discipline failure COUNT=$COUNT ACCESS=$ACCESS" fi + +save_LANG=$LANG LANG=C > /dev/null 2>&1 if [[ $LANG != C ]] then err_exit "C locale not working" fi +LANG=$save_LANG +if [[ $LANG != "$save_LANG" ]] +then err_exit "$save_LANG locale not working" +fi + unset RANDOM unset -n foo foo=junk @@ -205,8 +212,7 @@ do false if [[ $i != [@*] && ${foo#?} != "$bar" ]] then err_exit "\${$i#?} not correct" fi - command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives synta -x error" + command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives syntax error" if [[ $i != @([@*]) && ${#foo} != "$bar" ]] then err_exit "\${#$i} not correct" fi @@ -436,16 +442,18 @@ case $(unset IFS; set -- $v; print $#) in esac # Multi-byte characters should work with $IFS -( - LC_ALL=C.UTF-8 # The multi-byte tests are pointless without UTF-8 - +if [[ ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} =~ [Uu][Tt][Ff]-?8 ]] # The multi-byte tests are pointless without UTF-8 +then # Test the following characters: # Lowercase accented e (two bytes) # Roman sestertius sign (four bytes) for delim in é 𐆘; do - IFS="$delim" + IFS=$delim set : : - [ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)" + expect=:$delim: + actual=$* + [[ $actual == "$expect" ]] || err_exit "IFS failed with multi-byte character $delim" \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" read -r first second third <<< "one${delim}two${delim}three" [[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)" @@ -453,9 +461,11 @@ esac [[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $three)" # Ensure subshells don't get corrupted when IFS becomes a multi-byte character - expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")" - output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)" - [[ $output == $expected_output ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)" + IFS=$' \t\n' + expect=$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend") + actual=$(set : :; IFS=$delim; echo "$*"; trap "echo end" EXIT; trap) + [[ $actual == "$expect" ]] || err_exit "IFS in subshell failed with multi-byte character $delim" \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" done # Multibyte characters with the same initial byte shouldn't be parsed as the same @@ -466,7 +476,16 @@ esac set -- $v v="${#},${1-},${2-},${3-}" [[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character" -) +fi + +# Ensure fallback to first byte if IFS doesn't start with a valid multibyte character +# (however, this test should pass regardless of the locale) +IFS=$'\x[A0]a' +set : : +expect=$':\x[A0]:' +actual=$* +[[ $actual == "$expect" ]] || err_exit "IFS failed with invalid multi-byte character" \ + "(expected $(printf %q "$expect"), got $(printf %q "$actual"))" # ^^^ end: IFS tests ^^^ # restore default split: