Write the "$*"/"$@" field separator in one code path; fall back on invalid multibyte IFS

src/cmd/ksh93/sh/macro.c:
  When fields are joined into one word, the separator written is
    - a space when expanding "$@" or when mp->ifsp is NULL (IFS unset);
    - nothing when mp->ifs is 0 (IFS empty);
    - the single byte mp->ifs when mbwide() is false or that byte is < 128;
    - otherwise mbsize(mp->ifsp) bytes of IFS, or 1 byte if mbsize() returns < 0.
  The size is computed at most once per expansion and cached in ofs_size, which
  replaces the old local 'd'. The separate #if/#else SHOPT_MULTIBYTE branches go.

src/cmd/ksh93/tests/variables.sh:
  - err_exit prints its prefix and message as one quoted string.
  - LANG is saved before the C-locale check and restored (and re-checked) after.
  - A message split across two lines ("synta" / "x error") is rejoined.
  - The multibyte IFS tests run only if LC_ALL/LC_CTYPE/LANG names a UTF-8
    locale, instead of forcing LC_ALL=C.UTF-8 in a subshell; failures are
    reported with printf %q.
  - New test: IFS starting with the byte \x[A0] must join "$*" with that byte.
  - Review fix: the failure message for the 'third' field printed $three, which
    the preceding 'read -r first second third' never sets; it now prints $third.

NOTE(review): this copy was rebuilt from a whitespace-collapsed paste. Line
breaks follow the hunk-header counts; leading indentation was restored with
tabs from the nesting and should be confirmed against the tree. If context
fails to match, apply with 'git apply --ignore-whitespace' or 'patch -l'.
The post-image blob id on the variables.sh index line predates the $third fix.

diff --git a/src/cmd/ksh93/sh/macro.c b/src/cmd/ksh93/sh/macro.c
index b785435bf..038619554 100644
--- a/src/cmd/ksh93/sh/macro.c
+++ b/src/cmd/ksh93/sh/macro.c
@@ -71,7 +71,7 @@ typedef struct _mac_
 	char *ifsp; /* pointer to IFS value */
 	int fields; /* number of fields */
 	short quoted; /* set when word has quotes */
-	unsigned char ifs; /* first char of IFS */
+	unsigned char ifs; /* first byte of IFS */
 	char atmode; /* when processing $@ */
 	char quote; /* set within double quoted contexts */
 	char lit; /* set within single quotes */
@@ -1818,7 +1818,7 @@ retry1:
 retry2:
 	if(v && (!nulflg || *v ) && c!='+')
 	{
-		register int d = (mode=='@'?' ':mp->ifs);
+		int ofs_size = 0;
 		regoff_t match[2*(MATCH_MAX+1)];
 		int nmatch, nmatch_prev, vsize_last;
 		char *vlast;
@@ -1955,36 +1955,30 @@ retry2:
 				mp->atmode = mode=='@';
 				mp->pattern = oldpat;
 			}
-			else if(d)
+			else
 			{
-#if SHOPT_MULTIBYTE
 				Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp;
-
 				/*
-				 * We know from above that if we are not performing @-expansion
-				 * then we assigned `d` the value of `mp->ifs`, here we check
-				 * whether or not we have a valid string of IFS characters to
-				 * write as it is possible for `d` to be set to `mp->ifs` and
-				 * yet `mp->ifsp` to be NULL.
+				 * We're joining fields into one; write the output field separator, which may be multi-byte.
+				 * For "$@" it's a space, for "$*" it's the 1st char of IFS (space if unset, none if empty).
 				 */
-				if(mode != '@' && mp->ifsp)
+				if(mode == '@' || !mp->ifsp) /* if expanding $@ or if IFS is unset... */
+					sfputc(sfio_ptr, ' ');
+				else if(mp->ifs) /* else if IFS is non-empty... */
 				{
-					/*
-					 * Handle multi-byte characters being used for the internal
-					 * field separator (IFS).
-					 */
-					int i;
-					for(i = 0; i < mbsize(mp->ifsp); i++)
-						sfputc(sfio_ptr,mp->ifsp[i]);
+					if(!mbwide() || mp->ifs < 128) /* if single-byte char... */
+						sfputc(sfio_ptr, mp->ifs);
+					else
+					{
+						if(!ofs_size) /* only calculate this once per expansion */
+						{
+							ofs_size = mbsize(mp->ifsp);
+							if(ofs_size<0) /* invalid mb char: fall back to using first byte */
+								ofs_size = 1;
+						}
+						sfwrite(sfio_ptr, mp->ifsp, ofs_size);
+					}
 				}
-				else
-					sfputc(sfio_ptr,d);
-#else
-				if(mp->sp)
-					sfputc(mp->sp,d);
-				else
-					sfputc(stkp,d);
-#endif
 			}
 		}
 		if(arrmax)
diff --git a/src/cmd/ksh93/tests/variables.sh b/src/cmd/ksh93/tests/variables.sh
index 96266856f..e500adf22 100755
--- a/src/cmd/ksh93/tests/variables.sh
+++ b/src/cmd/ksh93/tests/variables.sh
@@ -20,7 +20,7 @@
 function err_exit
 {
 	print -u2 -n "\t"
-	print -u2 -r ${Command}[$1]: "${@:2}"
+	print -u2 -r "${Command}[$1]: ${@:2}"
 	let Errors+=1
 }
 alias err_exit='err_exit $LINENO'
@@ -167,10 +167,17 @@ COUNT=0
 if (( COUNT != 1 || ACCESS!=2 ))
 then err_exit " set discipline failure COUNT=$COUNT ACCESS=$ACCESS"
 fi
+
+save_LANG=$LANG
 LANG=C > /dev/null 2>&1
 if [[ $LANG != C ]]
 then err_exit "C locale not working"
 fi
+LANG=$save_LANG
+if [[ $LANG != "$save_LANG" ]]
+then err_exit "$save_LANG locale not working"
+fi
+
 unset RANDOM
 unset -n foo
 foo=junk
@@ -205,8 +212,7 @@ do false
 	if [[ $i != [@*] && ${foo#?} != "$bar" ]]
 	then err_exit "\${$i#?} not correct"
 	fi
-	command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives synta
-x error"
+	command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives syntax error"
 	if [[ $i != @([@*]) && ${#foo} != "$bar" ]]
 	then err_exit "\${#$i} not correct"
 	fi
@@ -436,16 +442,18 @@ case $(unset IFS; set -- $v; print $#) in
 esac
 
 # Multi-byte characters should work with $IFS
-(
-	LC_ALL=C.UTF-8 # The multi-byte tests are pointless without UTF-8
-
+if [[ ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} =~ [Uu][Tt][Ff]-?8 ]] # The multi-byte tests are pointless without UTF-8
+then
 	# Test the following characters:
 	# Lowercase accented e (two bytes)
 	# Roman sestertius sign (four bytes)
 	for delim in é 𐆘; do
-		IFS="$delim"
+		IFS=$delim
 		set : :
-		[ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)"
+		expect=:$delim:
+		actual=$*
+		[[ $actual == "$expect" ]] || err_exit "IFS failed with multi-byte character $delim" \
+			"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
 
 		read -r first second third <<< "one${delim}two${delim}three"
 		[[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)"
@@ -453,9 +461,11 @@ esac
-		[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $three)"
+		[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $third)"
 
 		# Ensure subshells don't get corrupted when IFS becomes a multi-byte character
-		expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")"
-		output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)"
-		[[ $output == $expected_output ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)"
+		IFS=$' \t\n'
+		expect=$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")
+		actual=$(set : :; IFS=$delim; echo "$*"; trap "echo end" EXIT; trap)
+		[[ $actual == "$expect" ]] || err_exit "IFS in subshell failed with multi-byte character $delim" \
+			"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
 	done
 
 	# Multibyte characters with the same initial byte shouldn't be parsed as the same
@@ -466,7 +476,16 @@ esac
 	set -- $v
 	v="${#},${1-},${2-},${3-}"
 	[[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character"
-)
+fi
+
+# Ensure fallback to first byte if IFS doesn't start with a valid multibyte character
+# (however, this test should pass regardless of the locale)
+IFS=$'\x[A0]a'
+set : :
+expect=$':\x[A0]:'
+actual=$*
+[[ $actual == "$expect" ]] || err_exit "IFS failed with invalid multi-byte character" \
+	"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
 
 # ^^^ end: IFS tests ^^^
 # restore default split: