1
0
Fork 0
mirror of git://git.code.sf.net/p/cdesktopenv/code synced 2025-03-09 15:50:02 +00:00

Refactor "$*" multibyte handling (re: 8b5f11dc)

The first of the two multibyte fixes from 8b5f11dc (which was for
using the first character of IFS as an output field separator when
expanding "$*" and similar) had a minor backwards-compatibility
problem: if $IFS started with a byte sequence that was not a valid
UTF-8 character, then IFS was treated as empty in UTF-8 locales, so
the fields were joined without any separator. The expected
behaviour is to fall back to using the first byte of IFS, as ksh
used to do (and as bash and zsh do).

The new code handling this was also a bit kludgy and inefficient,
repeating the mbsize() calculation for every byte of the separator
character and for every field joined by the expansion.

src/cmd/ksh93/sh/macro.c: varsub():
- Rewrite code for joining fields for $* in a quoted or scalar
  context and $@ in a scalar context, eliminating a confusing 'd'
  variable and concentrating the routine in one block.
- When expanding $* with a multibyte separator (first character
  of $IFS), only calculate the size in bytes once per expansion.
- If $IFS starts with a byte sequence that represents an invalid
  multibyte character, fall back to using the first byte.

src/cmd/ksh93/tests/variables.sh:
- Tweak some regression tests, including one that overwrote $LANG.
- Add test for invalid multibyte character behaviour as per above.
This commit is contained in:
Martijn Dekker 2020-08-29 21:52:29 +01:00
parent 8f813bb0a3
commit 7c5d39fa04
2 changed files with 51 additions and 38 deletions

View file

@ -71,7 +71,7 @@ typedef struct _mac_
char *ifsp; /* pointer to IFS value */ char *ifsp; /* pointer to IFS value */
int fields; /* number of fields */ int fields; /* number of fields */
short quoted; /* set when word has quotes */ short quoted; /* set when word has quotes */
unsigned char ifs; /* first char of IFS */ unsigned char ifs; /* first byte of IFS */
char atmode; /* when processing $@ */ char atmode; /* when processing $@ */
char quote; /* set within double quoted contexts */ char quote; /* set within double quoted contexts */
char lit; /* set within single quotes */ char lit; /* set within single quotes */
@ -1818,7 +1818,7 @@ retry1:
retry2: retry2:
if(v && (!nulflg || *v ) && c!='+') if(v && (!nulflg || *v ) && c!='+')
{ {
register int d = (mode=='@'?' ':mp->ifs); int ofs_size = 0;
regoff_t match[2*(MATCH_MAX+1)]; regoff_t match[2*(MATCH_MAX+1)];
int nmatch, nmatch_prev, vsize_last; int nmatch, nmatch_prev, vsize_last;
char *vlast; char *vlast;
@ -1955,36 +1955,30 @@ retry2:
mp->atmode = mode=='@'; mp->atmode = mode=='@';
mp->pattern = oldpat; mp->pattern = oldpat;
} }
else if(d) else
{ {
#if SHOPT_MULTIBYTE
Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp; Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp;
/* /*
* We know from above that if we are not performing @-expansion * We're joining fields into one; write the output field separator, which may be multi-byte.
* then we assigned `d` the value of `mp->ifs`, here we check * For "$@" it's a space, for "$*" it's the 1st char of IFS (space if unset, none if empty).
* whether or not we have a valid string of IFS characters to
* write as it is possible for `d` to be set to `mp->ifs` and
* yet `mp->ifsp` to be NULL.
*/ */
if(mode != '@' && mp->ifsp) if(mode == '@' || !mp->ifsp) /* if expanding $@ or if IFS is unset... */
sfputc(sfio_ptr, ' ');
else if(mp->ifs) /* else if IFS is non-empty... */
{ {
/* if(!mbwide() || mp->ifs < 128) /* if single-byte char... */
* Handle multi-byte characters being used for the internal sfputc(sfio_ptr, mp->ifs);
* field separator (IFS). else
*/ {
int i; if(!ofs_size) /* only calculate this once per expansion */
for(i = 0; i < mbsize(mp->ifsp); i++) {
sfputc(sfio_ptr,mp->ifsp[i]); ofs_size = mbsize(mp->ifsp);
if(ofs_size<0) /* invalid mb char: fall back to using first byte */
ofs_size = 1;
}
sfwrite(sfio_ptr, mp->ifsp, ofs_size);
}
} }
else
sfputc(sfio_ptr,d);
#else
if(mp->sp)
sfputc(mp->sp,d);
else
sfputc(stkp,d);
#endif
} }
} }
if(arrmax) if(arrmax)

View file

@ -20,7 +20,7 @@
function err_exit function err_exit
{ {
print -u2 -n "\t" print -u2 -n "\t"
print -u2 -r ${Command}[$1]: "${@:2}" print -u2 -r "${Command}[$1]: ${@:2}"
let Errors+=1 let Errors+=1
} }
alias err_exit='err_exit $LINENO' alias err_exit='err_exit $LINENO'
@ -167,10 +167,17 @@ COUNT=0
if (( COUNT != 1 || ACCESS!=2 )) if (( COUNT != 1 || ACCESS!=2 ))
then err_exit " set discipline failure COUNT=$COUNT ACCESS=$ACCESS" then err_exit " set discipline failure COUNT=$COUNT ACCESS=$ACCESS"
fi fi
save_LANG=$LANG
LANG=C > /dev/null 2>&1 LANG=C > /dev/null 2>&1
if [[ $LANG != C ]] if [[ $LANG != C ]]
then err_exit "C locale not working" then err_exit "C locale not working"
fi fi
LANG=$save_LANG
if [[ $LANG != "$save_LANG" ]]
then err_exit "$save_LANG locale not working"
fi
unset RANDOM unset RANDOM
unset -n foo unset -n foo
foo=junk foo=junk
@ -205,8 +212,7 @@ do false
if [[ $i != [@*] && ${foo#?} != "$bar" ]] if [[ $i != [@*] && ${foo#?} != "$bar" ]]
then err_exit "\${$i#?} not correct" then err_exit "\${$i#?} not correct"
fi fi
command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives synta command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives syntax error"
x error"
if [[ $i != @([@*]) && ${#foo} != "$bar" ]] if [[ $i != @([@*]) && ${#foo} != "$bar" ]]
then err_exit "\${#$i} not correct" then err_exit "\${#$i} not correct"
fi fi
@ -436,16 +442,18 @@ case $(unset IFS; set -- $v; print $#) in
esac esac
# Multi-byte characters should work with $IFS # Multi-byte characters should work with $IFS
( if [[ ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} =~ [Uu][Tt][Ff]-?8 ]] # The multi-byte tests are pointless without UTF-8
LC_ALL=C.UTF-8 # The multi-byte tests are pointless without UTF-8 then
# Test the following characters: # Test the following characters:
# Lowercase accented e (two bytes) # Lowercase accented e (two bytes)
# Roman sestertius sign (four bytes) # Roman sestertius sign (four bytes)
for delim in é 𐆘; do for delim in é 𐆘; do
IFS="$delim" IFS=$delim
set : : set : :
[ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)" expect=:$delim:
actual=$*
[[ $actual == "$expect" ]] || err_exit "IFS failed with multi-byte character $delim" \
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
read -r first second third <<< "one${delim}two${delim}three" read -r first second third <<< "one${delim}two${delim}three"
[[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)" [[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)"
@ -453,9 +461,11 @@ esac
[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $three)" [[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $three)"
# Ensure subshells don't get corrupted when IFS becomes a multi-byte character # Ensure subshells don't get corrupted when IFS becomes a multi-byte character
expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")" IFS=$' \t\n'
output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)" expect=$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")
[[ $output == $expected_output ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)" actual=$(set : :; IFS=$delim; echo "$*"; trap "echo end" EXIT; trap)
[[ $actual == "$expect" ]] || err_exit "IFS in subshell failed with multi-byte character $delim" \
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
done done
# Multibyte characters with the same initial byte shouldn't be parsed as the same # Multibyte characters with the same initial byte shouldn't be parsed as the same
@ -466,7 +476,16 @@ esac
set -- $v set -- $v
v="${#},${1-},${2-},${3-}" v="${#},${1-},${2-},${3-}"
[[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character" [[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character"
) fi
# Ensure fallback to first byte if IFS doesn't start with a valid multibyte character
# (however, this test should pass regardless of the locale)
IFS=$'\x[A0]a'
set : :
expect=$':\x[A0]:'
actual=$*
[[ $actual == "$expect" ]] || err_exit "IFS failed with invalid multi-byte character" \
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
# ^^^ end: IFS tests ^^^ # ^^^ end: IFS tests ^^^
# restore default split: # restore default split: