mirror of
git://git.code.sf.net/p/cdesktopenv/code
synced 2025-03-09 15:50:02 +00:00
Refactor "$*" multibyte handling (re: 8b5f11dc
)
The first of the two multibyte fixes from 8b5f11dc
(which was for
using the first character of IFS as an output field separator when
expanding "$*" and similar) had a minor backwards compatibility
problem: if $IFS started with a byte sequence that is not a valid
UTF-8 character, then it treated IFS as empty in UTF-8 locales, so
the fields would be joined without any separator. The expected
behaviour would be for it to fall back to using the first byte of
IFS as it used to (and as bash and zsh do).
The new code handling this was also a bit kludgy and inefficient,
repeating the mbsize() calculation for every byte of the separator
character and for every field joined by the expansion.
src/cmd/ksh93/sh/macro.c: varsub():
- Rewrite code for joining fields for $* in a quoted or scalar
context and $@ in a scalar context, eliminating a confusing 'd'
variable and concentrating the routine in one block.
- When expanding $* with a multibyte separator (first character
of $IFS), only calculate the size in bytes once per expansion.
- If $IFS starts with a byte sequence that represents an invalid
multibyte character, fall back to using the first byte.
src/cmd/ksh93/tests/variables.sh:
- Tweak some regression tests, including one that overwrote $LANG.
- Add test for invalid multibyte character behaviour as per above.
This commit is contained in:
parent
8f813bb0a3
commit
7c5d39fa04
2 changed files with 51 additions and 38 deletions
|
@ -71,7 +71,7 @@ typedef struct _mac_
|
|||
char *ifsp; /* pointer to IFS value */
|
||||
int fields; /* number of fields */
|
||||
short quoted; /* set when word has quotes */
|
||||
unsigned char ifs; /* first char of IFS */
|
||||
unsigned char ifs; /* first byte of IFS */
|
||||
char atmode; /* when processing $@ */
|
||||
char quote; /* set within double quoted contexts */
|
||||
char lit; /* set within single quotes */
|
||||
|
@ -1818,7 +1818,7 @@ retry1:
|
|||
retry2:
|
||||
if(v && (!nulflg || *v ) && c!='+')
|
||||
{
|
||||
register int d = (mode=='@'?' ':mp->ifs);
|
||||
int ofs_size = 0;
|
||||
regoff_t match[2*(MATCH_MAX+1)];
|
||||
int nmatch, nmatch_prev, vsize_last;
|
||||
char *vlast;
|
||||
|
@ -1955,36 +1955,30 @@ retry2:
|
|||
mp->atmode = mode=='@';
|
||||
mp->pattern = oldpat;
|
||||
}
|
||||
else if(d)
|
||||
else
|
||||
{
|
||||
#if SHOPT_MULTIBYTE
|
||||
Sfio_t *sfio_ptr = (mp->sp) ? mp->sp : stkp;
|
||||
|
||||
/*
|
||||
* We know from above that if we are not performing @-expansion
|
||||
* then we assigned `d` the value of `mp->ifs`, here we check
|
||||
* whether or not we have a valid string of IFS characters to
|
||||
* write as it is possible for `d` to be set to `mp->ifs` and
|
||||
* yet `mp->ifsp` to be NULL.
|
||||
* We're joining fields into one; write the output field separator, which may be multi-byte.
|
||||
* For "$@" it's a space, for "$*" it's the 1st char of IFS (space if unset, none if empty).
|
||||
*/
|
||||
if(mode != '@' && mp->ifsp)
|
||||
if(mode == '@' || !mp->ifsp) /* if expanding $@ or if IFS is unset... */
|
||||
sfputc(sfio_ptr, ' ');
|
||||
else if(mp->ifs) /* else if IFS is non-empty... */
|
||||
{
|
||||
/*
|
||||
* Handle multi-byte characters being used for the internal
|
||||
* field separator (IFS).
|
||||
*/
|
||||
int i;
|
||||
for(i = 0; i < mbsize(mp->ifsp); i++)
|
||||
sfputc(sfio_ptr,mp->ifsp[i]);
|
||||
if(!mbwide() || mp->ifs < 128) /* if single-byte char... */
|
||||
sfputc(sfio_ptr, mp->ifs);
|
||||
else
|
||||
{
|
||||
if(!ofs_size) /* only calculate this once per expansion */
|
||||
{
|
||||
ofs_size = mbsize(mp->ifsp);
|
||||
if(ofs_size<0) /* invalid mb char: fall back to using first byte */
|
||||
ofs_size = 1;
|
||||
}
|
||||
sfwrite(sfio_ptr, mp->ifsp, ofs_size);
|
||||
}
|
||||
}
|
||||
else
|
||||
sfputc(sfio_ptr,d);
|
||||
#else
|
||||
if(mp->sp)
|
||||
sfputc(mp->sp,d);
|
||||
else
|
||||
sfputc(stkp,d);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
if(arrmax)
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
function err_exit
|
||||
{
|
||||
print -u2 -n "\t"
|
||||
print -u2 -r ${Command}[$1]: "${@:2}"
|
||||
print -u2 -r "${Command}[$1]: ${@:2}"
|
||||
let Errors+=1
|
||||
}
|
||||
alias err_exit='err_exit $LINENO'
|
||||
|
@ -167,10 +167,17 @@ COUNT=0
|
|||
if (( COUNT != 1 || ACCESS!=2 ))
|
||||
then err_exit " set discipline failure COUNT=$COUNT ACCESS=$ACCESS"
|
||||
fi
|
||||
|
||||
save_LANG=$LANG
|
||||
LANG=C > /dev/null 2>&1
|
||||
if [[ $LANG != C ]]
|
||||
then err_exit "C locale not working"
|
||||
fi
|
||||
LANG=$save_LANG
|
||||
if [[ $LANG != "$save_LANG" ]]
|
||||
then err_exit "$save_LANG locale not working"
|
||||
fi
|
||||
|
||||
unset RANDOM
|
||||
unset -n foo
|
||||
foo=junk
|
||||
|
@ -205,8 +212,7 @@ do false
|
|||
if [[ $i != [@*] && ${foo#?} != "$bar" ]]
|
||||
then err_exit "\${$i#?} not correct"
|
||||
fi
|
||||
command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives synta
|
||||
x error"
|
||||
command eval foo='$'{$i} bar='$'{#$i} || err_exit "\${#$i} gives syntax error"
|
||||
if [[ $i != @([@*]) && ${#foo} != "$bar" ]]
|
||||
then err_exit "\${#$i} not correct"
|
||||
fi
|
||||
|
@ -436,16 +442,18 @@ case $(unset IFS; set -- $v; print $#) in
|
|||
esac
|
||||
|
||||
# Multi-byte characters should work with $IFS
|
||||
(
|
||||
LC_ALL=C.UTF-8 # The multi-byte tests are pointless without UTF-8
|
||||
|
||||
if [[ ${LC_ALL:-${LC_CTYPE:-${LANG:-}}} =~ [Uu][Tt][Ff]-?8 ]] # The multi-byte tests are pointless without UTF-8
|
||||
then
|
||||
# Test the following characters:
|
||||
# Lowercase accented e (two bytes)
|
||||
# Roman sestertius sign (four bytes)
|
||||
for delim in é 𐆘; do
|
||||
IFS="$delim"
|
||||
IFS=$delim
|
||||
set : :
|
||||
[ "$*" == ":$delim:" ] || err_exit "IFS failed with multi-byte character $delim (expected :$delim:, got $*)"
|
||||
expect=:$delim:
|
||||
actual=$*
|
||||
[[ $actual == "$expect" ]] || err_exit "IFS failed with multi-byte character $delim" \
|
||||
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
|
||||
|
||||
read -r first second third <<< "one${delim}two${delim}three"
|
||||
[[ $first == one ]] || err_exit "IFS failed with multi-byte character $delim (expected one, got $first)"
|
||||
|
@ -453,9 +461,11 @@ esac
|
|||
[[ $third == three ]] || err_exit "IFS failed with multi-byte character $delim (expected three, got $three)"
|
||||
|
||||
# Ensure subshells don't get corrupted when IFS becomes a multi-byte character
|
||||
expected_output="$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")"
|
||||
output="$(LANG=C.UTF-8; IFS=$delim; set : :; echo "$*"; trap "echo end" EXIT; trap)"
|
||||
[[ $output == $expected_output ]] || err_exit "IFS in subshell failed with multi-byte character $delim (expected $expected_output, got $output)"
|
||||
IFS=$' \t\n'
|
||||
expect=$(printf ":$delim:\\ntrap -- 'echo end' EXIT\\nend")
|
||||
actual=$(set : :; IFS=$delim; echo "$*"; trap "echo end" EXIT; trap)
|
||||
[[ $actual == "$expect" ]] || err_exit "IFS in subshell failed with multi-byte character $delim" \
|
||||
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
|
||||
done
|
||||
|
||||
# Multibyte characters with the same initial byte shouldn't be parsed as the same
|
||||
|
@ -466,7 +476,16 @@ esac
|
|||
set -- $v
|
||||
v="${#},${1-},${2-},${3-}"
|
||||
[[ $v == '1,abc§def ghi§jkl,,' ]] || err_exit "IFS treats £ (C2 A3) and § (C2 A7) as the same character"
|
||||
)
|
||||
fi
|
||||
|
||||
# Ensure fallback to first byte if IFS doesn't start with a valid multibyte character
|
||||
# (however, this test should pass regardless of the locale)
|
||||
IFS=$'\x[A0]a'
|
||||
set : :
|
||||
expect=$':\x[A0]:'
|
||||
actual=$*
|
||||
[[ $actual == "$expect" ]] || err_exit "IFS failed with invalid multi-byte character" \
|
||||
"(expected $(printf %q "$expect"), got $(printf %q "$actual"))"
|
||||
|
||||
# ^^^ end: IFS tests ^^^
|
||||
# restore default split:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue