DtSearch: Migrate to UTF-8.

2025-03-09 15:50:02 +00:00 · 2021-11-09 02:20:45 +08:00 · 2021-11-09 02:20:45 +08:00 · 3f950009bc
commit 3f950009bc
parent e38f1ae3cf
2 changed files with 237 additions and 160 deletions
--- a/cde/lib/DtSearch/jpn.c
+++ b/cde/lib/DtSearch/jpn.c
@ -129,8 +129,6 @@
 #include <sys/stat.h>

 #define PROGNAME	"JPN"
-#define SS2_CHAR	0x8E	/* Single Shift char for Code Set 2 */
-#define SS3_CHAR	0x8F	/* Single Shift char for Code Set 3 */
 #define EXT_KATAKANA	".ktk"
 #define EXT_KANJI	".knj"
 #define SUBSTRBUFSZ	100
@ -277,23 +275,7 @@ static char	*display_jstate (int js)
 */
 static int	read_jchar (void)
 {
-    /* Jstates table for EUC Set 1 (JIS 0208) */
-    static int  jstates_set1 [] = {
-	JS_DISCARD,	JS_DISCARD,	JS_DISCARD,	/* 0 - 2 */
-	JS_ROMAN,	JS_DISCARD,	JS_KATAKANA,	/* 3 - 5 */
-	JS_GREEK,	JS_CYRILLIC,	JS_DISCARD	/* 6 - 8 */
-    };
-
-    /* Jstates table for EUC Set 3 (JIS 0212).
-     * Row 5 is presumed to be katakana because
-     * of four new unapproved katakana characters.
-     */
-    static int  jstates_set3 [] = {
-	JS_DISCARD,	JS_DISCARD,	JS_DISCARD,	/* 0 - 2 */
-	JS_DISCARD,	JS_DISCARD,	JS_KATAKANA,	/* 3 - 5 */
-	JS_GREEK,	JS_CYRILLIC,	JS_DISCARD,	/* 6 - 8 */
-	JS_ALPHA,	JS_ALPHA,	JS_ALPHA	/* 9 - 11 */
-    };
+    char str1[8], str2[8];

    if (readchar_arg) {
 	jchar[0] = readchar (readchar_arg);
@ -305,75 +287,78 @@ static int	read_jchar (void)
 	return (jstate = JS_ETX);
    readcount++;

-    /* Set 1 (JIS 0208) */
-    if (jchar[0] >= 0xA1 && jchar[0] <= 0xFE) {
-	jcharlen = 2;
-	if (jchar[0] > 0xA8)
-	    jstate = JS_KANJI;
-	else
-	    jstate = jstates_set1 [(jchar[0] & 0x7F) - 32];
-	if ((jchar[1] = readchar (NULL)))
-	    readcount++;
-	else
-	    jstate = JS_ETX;
-	return jstate;
-    } 
-
-    /* Set 0 (ASCII) */
-    if (jchar[0] < 0x80) {
-	jcharlen = 1;
-	return (jstate = JS_ASCII);
+    for (jcharlen = 1; jcharlen < MB_CUR_MAX; ++jcharlen) {
+	jchar[jcharlen] = 0;
+	if (mblen ((char *) jchar, MB_CUR_MAX) != -1) break;
+	if ((jchar[jcharlen] = readchar (NULL))) readcount++;
    }

-    /* Set 3 (JIS 0212) */
-    if (jchar[0] == SS3_CHAR) {
-	jcharlen = 3;
-	/*
-	 * Hop over the single shift char to get the first JIS byte.
-	 * Make sure first JIS byte is in proper
-	 * range to avoid indexing outside of table.
-	 */
-	if ((jchar[1] = readchar (NULL)) == 0)
-	    return (jstate = JS_ETX);
-	readcount++;
-	if (jchar[1] < 0xA1)
-	    return (jstate = JS_DISCARD);
-	if (jchar[1] > 0xAA)
-	    jstate = JS_KANJI;
-	else
-	    jstate = jstates_set3 [(*jchar & 0x7F) - 32];
-
-	if ((jchar[2] = readchar (NULL)) == 0)
-	    return (jstate = JS_ETX);
-	readcount++;
-	/* JS_ALPHA chars ('miscellaneous alphabetic chars' of
-	 * rows 9 - 11) are compatible with several other jstates,
-	 * so adjust as necessary.
-	 */
-	if (jstate == JS_ALPHA  &&
-		((last_jstate & JS_ALPHA_COMPATIBLE) != 0))
-	    jstate = last_jstate;
-	else if (last_jstate == JS_ALPHA  &&
-		((jstate & JS_ALPHA_COMPATIBLE) != 0))
-	    last_jstate = jstate;
-	return jstate;
-    }
-
-    /* Set 2 (half-width katakana) */
-    if (jchar[0] == SS2_CHAR) {
-	jcharlen = 2;
-	jstate = JS_HALFKATA;
-	if ((jchar[1] = readchar (NULL)))
-	    readcount++;
-	else
-	    jstate = JS_ETX;
-	return jstate;
-    }
-
-    /* If first jchar doesn't match expected EUC coding,
+    /* If jchar is an invalid multibyte sequence,
     * discard it until we get back into sync.
     */
-    jcharlen = 1;
+    if (jcharlen == MB_CUR_MAX) return (jstate = JS_DISCARD);
+
+    if (jcharlen == 1) {
+	if (jchar[0] < 0x80) {
+	    jcharlen = 1;
+	    jstate = JS_ASCII;
+	}
+	else jstate = JS_DISCARD;
+
+	return jstate;
+    }
+
+    if (jcharlen == 2) {
+	str1[0] = 0xC2;
+	str1[1] = 0x80;
+	str1[2] = 0;
+
+	str2[0] = 0xDF;
+	str2[1] = 0xBF;
+	str2[2] = 0;
+
+	if (strcmp ((char *) jchar, str1) >= 0 &&
+	    strcmp ((char *) jchar, str2) <= 0)
+	    jstate = JS_ROMAN;
+	else jstate = JS_DISCARD;
+
+	return jstate;
+    }
+
+    if (jcharlen == 3) {
+	str1[0] = 0xE4;
+	str1[1] = 0xB8;
+	str1[2] = 0x80;
+	str1[3] = 0;
+
+	str2[0] = 0xE9;
+	str2[1] = 0xBF;
+	str2[2] = 0xBF;
+	str2[3] = 0;
+
+	if (strcmp ((char *) jchar, str1) >= 0 &&
+	    strcmp ((char *) jchar, str2) <= 0)
+	    jstate = JS_KANJI;
+	else {
+	    str1[0] = 0xEF;
+	    str1[1] = 0xBD;
+	    str1[2] = 0xA6;
+	    str1[3] = 0;
+
+	    str2[0] = 0xEF;
+	    str2[1] = 0xBE;
+	    str2[2] = 0x9F;
+	    str2[3] = 0;
+
+	    if (strcmp ((char *) jchar, str1) >= 0 &&
+		strcmp ((char *) jchar, str2) <= 0)
+		jstate = JS_HALFKATA;
+	    else jstate = JS_DISCARD;
+	}
+
+	return jstate;
+    }
+
    return (jstate = JS_DISCARD);
 } /* read_jchar() */

@ -401,7 +386,7 @@ static UCHAR	*kanji_compounder (void)
    static UCHAR	*mysubstrp =	NULL;
    static UCHAR	*mysubstrend =	NULL;
    static UCHAR	*op, *ss;
-    static int		i;
+    static int		i, j, mbl;

    if (is_new_substring) {
 	is_new_substring = FALSE;
@ -420,7 +405,7 @@ static UCHAR	*kanji_compounder (void)
 	    return NULL;
 	if (++clen > MAX_KANJI_CLEN) {
 	    clen = 1;
-	    mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
+	    mysubstrp += mblen ((char *)mysubstrp, MB_CUR_MAX);
 	}
    }

@ -437,15 +422,13 @@ static UCHAR	*kanji_compounder (void)
 	    /* Are there enough jchars left in substring? */
 	    if (ss >= mysubstrend) {
 		clen = 1;
-		mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
+		mysubstrp += mblen ((char *)mysubstrp, MB_CUR_MAX);
 		i = 0;		/* indicates assembly failure */
 		break;		/* breaks the for loop */
 	    }
 	    /* Assemble one jchar into outbuf */
-	    if (*ss == SS3_CHAR)
-		*op++ = *ss++;
-	    *op++ = *ss++;
-	    *op++ = *ss++;
+	    mbl = mblen ((char *)ss, MB_CUR_MAX);
+	    for (j = 0; j < mbl; ++j) *op++ = *ss++;
 	}
 	/* Did word assembly succeed? */
 	if (i >= clen) {
@ -498,7 +481,7 @@ static UCHAR	*search_kanjitree (void)

 	/* Return first substr jchar as next token */
 	last_node = NULL;	/* NULL = tree not searched yet */
-	jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
+	jcharlen = mblen ((char *)substrp, MB_CUR_MAX);
 	strncpy ((char*)outbuf, (char*)substrp, jcharlen);
 	outbuf [jcharlen] = 0;
 	if (offsetp)
@ -524,7 +507,7 @@ EXHAUSTED_TREE:
 	    all_done = TRUE;
 	    return NULL;
 	}
-	jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
+	jcharlen = mblen ((char *)substrp, MB_CUR_MAX);
 	strncpy ((char*)outbuf, (char*)substrp, jcharlen);
 	outbuf [jcharlen] = 0;
 	if (offsetp)
@ -735,6 +718,8 @@ ENTIRE_SUBSTR_IS_WORD:
 	"%s Program Error: Unknown jstate %d.\n") ,
 	PROGNAME"246", last_jstate);
    DtSearchExit (46);
+
+    return NULL;
 } /* parse_substring() */


--- a/cde/lib/DtSearch/lang.c
+++ b/cde/lib/DtSearch/lang.c
@ -124,6 +124,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
+#include <wchar.h>
 #include <sys/stat.h>

 #define X_INCLUDE_STRING_H
@ -170,8 +171,9 @@ int		debugging_search_wordtree =	FALSE;
 int             debugging_teskey =		FALSE;
 int             debugging_paice =		FALSE;
 static int	*paice_charmap;
-static UCHAR	paicebuf [DtSrMAXWIDTH_HWORD + 2];
+static char	paicebuf [DtSrMAXWIDTH_HWORD + 2];
 static int	paicelen;
+static int	paicewcsl;
 static int	word_is_intact;

 /* Language strings correspond to DtSrLa.. constants.  */
@ -230,7 +232,7 @@ static char	*language_name (DtSrINT16 langno)
 * Returns TRUE if successful search, else FALSE.
 * See also search_wordtree_jpn() in jpn.c
 */
-static int	search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
+static int	search_wordtree (WORDTREE *wordtree, char *wordstring)
 {
    static int     	direction;
    static WORDTREE	*node;
@ -240,7 +242,7 @@ static int	search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
 	    wordstring);
    /* MAIN SEARCH LOOP: binary tree search */
    for (node = wordtree;  node != NULL;  ) {
-	if ((direction = strcmp ((char *) wordstring, node->word)) == 0) {
+	if ((direction = strcmp (wordstring, node->word)) == 0) {
 	    if (debugging_search_wordtree)
 		fprintf (aa_stderr, "  HIT!\n");
 	    return TRUE;
@ -260,6 +262,61 @@ static int	search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
 }  /* search_wordtree() */


+static int euro_mbtowc (wchar_t *pwc, const char *p, const char *s)
+{
+    int len;
+
+    mbtowc (NULL, NULL, 0);
+    len = mbtowc (pwc, p, 1);
+
+    if (len < 0 && p > s) {
+	mbtowc (NULL, NULL, 0);
+	len = mbtowc (pwc, p - 1, 2);
+    }
+
+    if (len < 0 || *pwc > 0xFF) *pwc = 0x100;
+
+    return len;
+}
+
+
+static char *euro_wctomb (int c, char *outp, int len)
+{
+    wchar_t wc = c & 0xFF;
+    if (len > 1) wctomb (outp, wc);
+    else *outp = wc;
+    return outp + len;
+}
+
+
+static int euro_readchar (READCFP cofunction, void *cofunction_arg, char *outp,
+    wchar_t *pwc)
+{
+    int len = 1;
+
+    *pwc = 0;
+
+    *outp = cofunction (cofunction_arg);
+
+    if (!(*outp)) goto done;
+
+    mbtowc (NULL, NULL, 0);
+
+    if (mbtowc (pwc, outp, 1) >= 0) goto done;
+
+    *(outp + 1) = cofunction (NULL);
+    mbtowc (NULL, NULL, 0);
+    len = mbtowc (pwc, outp, 2);
+
+    if (len < 0 || *pwc > 0xFF) *pwc = 0x100;
+
+    len = 2;
+
+done:
+    return len;
+}
+
+
 /************************************************/
 /*						*/
 /*                 teskey_parser		*/
@ -312,10 +369,10 @@ char	*teskey_parser (PARG *parg)
    static READCFP	cofunction;
    static void		*cofunction_arg;
    static DBLK		*dblk =		NULL;
-    static UCHAR	*outbuf =	NULL;
+    static char		*outbuf =	NULL;
    static size_t	outbufsz =	0;
-    static UCHAR	*endmaxword;	/* end largest possible output word */
-    static UCHAR	*outp;		/* next loc in outbuf */
+    static char		*endmaxword;	/* end largest possible output word */
+    static char		*outp;		/* next loc in outbuf */
    static int		*charmap;
    static int		minwordsz, maxwordsz;
    static int		wordlen;
@ -324,6 +381,8 @@ char	*teskey_parser (PARG *parg)
    static long		*offsetp, readcount, candidate_offset;
    static int		is_hiliting;
    static int		add_msgs;
+    static int		len;
+    static wchar_t	wc;

    /* If first call for current text block... */
    if (parg) {
@ -376,8 +435,12 @@ char	*teskey_parser (PARG *parg)
 READ_ANOTHER_WORD:
    outp = outbuf;
    tpstate = BETW_WORDS;
-    while ((*outp = cofunction (cofunction_arg))) {
-	readcount++;
+    for (;;) {
+	len = euro_readchar (cofunction, cofunction_arg, outp, &wc);
+
+	if (!wc) break;
+
+	readcount += len;
 	cofunction_arg = NULL;

 	/*------------- BETW_WORDS State ------------
@ -387,15 +450,14 @@ READ_ANOTHER_WORD:
 	    /*
 	     * Discard nonconcordable chars between words.
 	     */
-	    if ((charmap[*outp] & NON_CONCORD) != 0)
+	    if ((charmap[wc] & NON_CONCORD) != 0)
 		continue;
 	    /*
 	     * Fully concordable char is definite start of new word.
 	     * Convert to uppercase and go get next char.
 	     */
-	    if ((charmap[*outp] & CONCORDABLE) != 0) {
-		*outp = charmap[*outp] & 0x00ff;
-		outp++;
+	    if ((charmap[wc] & CONCORDABLE) != 0) {
+		outp = euro_wctomb (charmap[wc], outp, len);
 		candidate_offset = readcount;
 		tpstate = IN_WORD;
 		continue;
@ -407,18 +469,18 @@ READ_ANOTHER_WORD:
 	     * to uppercase and go get next char.
 	     * Otherwise discard just like non_concord.
 	     */
-	    outp++;
-	    if ((*outp = cofunction(NULL)))
-		readcount++;
-	    if ((charmap[*outp] & CONCORDABLE) != 0) {
-		*outp = charmap[*outp] & 0x00ff;
-		outp++;
-		candidate_offset = readcount - 1;
+	    outp += len;
+	    len = euro_readchar (cofunction, NULL, outp, &wc);
+	    readcount += len;
+
+	    if ((charmap[wc] & CONCORDABLE) != 0) {
+		outp = euro_wctomb (charmap[wc], outp, len);
+		candidate_offset = readcount - len;
 		tpstate = IN_WORD;
 		continue;
 	    }
 	    else {
-		outp--;
+		outp -= len;
 		continue;
 	    }
 	} /* endif BETW_WORDS */
@ -431,10 +493,9 @@ READ_ANOTHER_WORD:
 	 * Non_concords treatment depends on next char.
 	 */
 	else if (tpstate == IN_WORD) {
-	    if ((charmap[*outp] & CONCORDABLE) != 0) {
+	    if ((charmap[wc] & CONCORDABLE) != 0) {
 		if (outp < endmaxword) {
-		    *outp = charmap[*outp] & 0x00ff;
-		    outp++;
+		    outp = euro_wctomb (charmap[wc], outp, len);
 		}
 		else {
 		    tpstate = TOO_LONG;
@ -457,18 +518,18 @@ READ_ANOTHER_WORD:
 		}
 		continue;
 	    }
-	    if ((charmap[*outp] & NON_CONCORD) != 0) {
+	    if ((charmap[wc] & NON_CONCORD) != 0) {
 		*outp = '\0';
 		break;
 	    }
 	    /* Must be opt_concord... */
-	    outp++;
-	    if ((*outp = cofunction(NULL)))
-		readcount++;
-	    if ((charmap[*outp] & CONCORDABLE) != 0) {
+	    outp += len;
+	    len = euro_readchar (cofunction, NULL, outp, &wc);
+	    readcount += len;
+
+	    if ((charmap[wc] & CONCORDABLE) != 0) {
 		if (outp < endmaxword) {
-		    *outp = charmap[*outp] & 0x00ff;	/* uppercase */
-		    outp++;
+		    outp = euro_wctomb (charmap[wc], outp, len);
 		}
 		else {
 		    tpstate = TOO_LONG;
@ -482,7 +543,8 @@ READ_ANOTHER_WORD:
 		continue;
 	    }
 	    else {	/* next char NOT concordable...*/
-		*(--outp) = '\0';
+		outp -= len;
+		*outp = '\0';
 		break;
 	    }
 	} /* endif IN_WORD */
@ -494,7 +556,7 @@ READ_ANOTHER_WORD:
 	 * can get between words again with a clear non_concord.
 	 */
 	else if (tpstate == TOO_LONG) {
-	    if ((charmap[*outp] & NON_CONCORD) != 0) {
+	    if ((charmap[wc] & NON_CONCORD) != 0) {
 		outp = outbuf;
 		tpstate = BETW_WORDS;
 	    }
@ -524,7 +586,7 @@ READ_ANOTHER_WORD:
 	return NULL;
    }

-    wordlen = strlen ((char *) outbuf);
+    wordlen = strlen (outbuf);
    candidate_offset--;	/* token offset is one less than number of reads */
    if (debugging_teskey)
 	fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
@ -586,7 +648,7 @@ GOOD_WORD:
 	fprintf (aa_stderr, ", ...good word\n");
    if (offsetp)
 	*offsetp = candidate_offset;
-    return (char *) outbuf;
+    return outbuf;
 } /* teskey_parser() */


@ -602,10 +664,13 @@ GOOD_WORD:
 */
 int	is_concordable (char *word, int *charmap)
 {
-    UCHAR	*cptr;
-    for (cptr = (UCHAR *)word;  *cptr != 0;  cptr++)
-	if ((charmap[*cptr] & NON_CONCORD) != 0)
+    char	*cptr;
+    wchar_t	wc;
+    for (cptr = word;  *cptr != 0;  cptr++) {
+	euro_mbtowc (&wc, cptr, word);
+	if ((charmap[wc] & NON_CONCORD) != 0)
 	    break;
+    }
    return (*cptr == 0);
 } /* is_concordable() */

@ -1087,6 +1152,8 @@ static int      load_paice_suffixes (DBLK *dblk, DBLK *dblist)
    int		must_be_intact, is_last_rule;
    UCHAR	remove_count;
    int		lineno, errcount;
+    int		len;
+    wchar_t	wc;
    _Xstrtokparams	strtok_buf;

    dblk->stem_extra = NULL;	/* just to be sure */
@ -1183,9 +1250,11 @@ static int      load_paice_suffixes (DBLK *dblk, DBLK *dblist)
 	if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
 	    continue;

-	for (cptr = suffix;  cptr;  cptr++)
-	    if ((dblk->charmap[*cptr] & NUMERAL) == 0)
+	for (cptr = suffix;  cptr;  cptr++) {
+	    euro_mbtowc (&wc, (char *)cptr, (char *)suffix);
+	    if ((dblk->charmap[wc] & NUMERAL) == 0)
 		break;
+	}
 	if (*cptr == '\0')
 	    continue;

@ -1252,8 +1321,11 @@ BAD_RULE:
 	prule->remove_count =	remove_count;
 	prule->is_last_rule =	is_last_rule;
 	if (apndstr) {
-	    prule->apndstr =	(UCHAR *) strdup ((char*)apndstr);
-	    prule->aplen =	strlen ((char*)apndstr);
+	    len = mbstowcs (NULL, (char *)apndstr, 0);
+	    if (len != -1) {
+		prule->apndstr =	(UCHAR *) strdup ((char*)apndstr);
+		prule->aplen =		len;
+	    }
 	}

 	prule_link = &rules_table[suffix[0]];
@ -1292,7 +1364,8 @@ BAD_RULE:
 */
 static int	is_matching_rule (PRULE *rule)
 {
-    static UCHAR	*ptr;
+    static char		*ptr;
+    static wchar_t	wc;
    static int		i, j;

    if (debugging_paice)
@ -1328,7 +1401,7 @@ static int	is_matching_rule (PRULE *rule)
     * Used to algorithmically test remaining stem length
     * after tentative application of rule.
     */
-    i = paicelen - (rule->remove_count - rule->aplen);
+    i = paicewcsl - (rule->remove_count - rule->aplen);

    if (i <= 1) {
 	if (debugging_paice)
@ -1337,7 +1410,11 @@ static int	is_matching_rule (PRULE *rule)
    }

    if (i == 2) {
-	if (IS_VOWEL (paicebuf[0])) {
+	euro_mbtowc (&wc, paicebuf, paicebuf);
+
+	if (!IS_VOWEL (wc)) euro_mbtowc (&wc, paicebuf + 1, paicebuf);
+
+	if (IS_VOWEL (wc)) {
 	    if (debugging_paice)
 		fputs (", and short vowel stem valid.\n", aa_stderr);
 	    return TRUE;
@ -1355,13 +1432,15 @@ static int	is_matching_rule (PRULE *rule)
     * Otherwise it's not.
     */
    for (j=0;  j<i;  j++) {
-	if (IS_VOWEL (paicebuf[j])) {
+	euro_mbtowc (&wc, &paicebuf[j], paicebuf);
+
+	if (IS_VOWEL (wc)) {
 GOOD_STEM:
 	    if (debugging_paice)
 		fputs (", and remaining stem valid.\n", aa_stderr);
 	    return TRUE;
 	}
-	if (j > 0  &&  paicebuf[j] == 'Y')
+	if (j > 0  && wc == L'Y')
 	    goto GOOD_STEM;
    }

@ -1389,7 +1468,8 @@ GOOD_STEM:
 */
 static char	*paice_stemmer (char *wordin, DBLK *dblk)
 {
-    UCHAR	finalc;
+    wchar_t	finalwc;
+    int		len;
    PRULE	*rule, **rules_table;

    if (wordin == NULL)
@ -1409,24 +1489,31 @@ static char	*paice_stemmer (char *wordin, DBLK *dblk)
     * prefix ^O that identifies a stem.  (But this
     * stemmer doesn't actually insert the ^O now.)
     */
-    strncpy ((char*)paicebuf, wordin, DtSrMAXWIDTH_HWORD);
-    paicebuf [DtSrMAXWIDTH_HWORD - 2] = 0;
+    strncpy (paicebuf, wordin, DtSrMAXWIDTH_HWORD);
+
+    if (mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 2], 1) == -1 &&
+	mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 3], 2) != -1)
+	paicebuf[DtSrMAXWIDTH_HWORD - 3] = 0;
+    else paicebuf[DtSrMAXWIDTH_HWORD - 2] = 0;
+
    paice_charmap =	dblk->charmap;
    word_is_intact = 	TRUE;

    for (;;) { /*-------- Main Stemming Loop ---------*/

-	paicelen = strlen ((char*)paicebuf);
-	finalc = *(paicebuf + paicelen - 1);
+	paicelen = strlen (paicebuf);
+	paicewcsl = mbstowcs (NULL, paicebuf, 0);
+	len = euro_mbtowc (&finalwc, paicebuf + paicelen - 1, paicebuf);
+
 	if (debugging_paice) {
-	    fprintf (aa_stderr,
-		"paice: '%s', rules list '%c' for database '%s'\n",
-		paicebuf, finalc, dblk->name);
+	    fwprintf (aa_stderr,
+		L"paice: '%s', rules list '%lc' for database '%s'\n",
+		paicebuf, finalwc, dblk->name);
 	    fflush (aa_stderr);
 	}

 	/* Look for a matching rule */
-	if ((rule = rules_table [finalc]) == NULL) {
+	if ((rule = rules_table [finalwc]) == NULL) {
 	    if (debugging_paice)
 		fputs ("  list is null, stop.\n", aa_stderr);
 	    break;
@ -1438,8 +1525,8 @@ static char	*paice_stemmer (char *wordin, DBLK *dblk)
 	}
 	if (rule == NULL) {
 	    if (debugging_paice)
-		fprintf (aa_stderr, "  rules list '%c' is exhausted, stop.\n",
-		    finalc);
+		fwprintf (aa_stderr,
+		    L"  rules list '%lc' is exhausted, stop.\n", finalwc);
 	    break;
 	}

@ -1452,10 +1539,11 @@ static char	*paice_stemmer (char *wordin, DBLK *dblk)
 	    break;
 	}

-	paicebuf [paicelen - rule->remove_count] = 0;
+	paicebuf [paicelen - len * rule->remove_count] = 0;
 	if (rule->aplen)
-	    strcat ((char*)paicebuf, (char*)rule->apndstr);
-	paicelen = strlen ((char*)paicebuf);
+	    strcat (paicebuf, (char*)rule->apndstr);
+	paicelen = strlen (paicebuf);
+	paicewcsl = mbstowcs (NULL, paicebuf, 0);
 	word_is_intact = FALSE;	 /* we've removed at least 1 suffix */
 	if (debugging_paice)
 	    fprintf (aa_stderr, "--> '%s'", paicebuf);
@ -1478,7 +1566,7 @@ static char	*paice_stemmer (char *wordin, DBLK *dblk)
 	fprintf (aa_stderr, "  final stem: '%s'\n", paicebuf);
 	fflush (aa_stderr);
    }
-    return (char *) paicebuf;
+    return paicebuf;
 } /* paice_stemmer() */


@ -1498,9 +1586,9 @@ char	*null_stemmer (char *word, DBLK *dblk)
 	return "";
    if (*word == '\0')
 	return "";
-    strncpy ((char *)paicebuf, word, DtSrMAXWIDTH_HWORD);
+    strncpy (paicebuf, word, DtSrMAXWIDTH_HWORD);
    paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
-    return (char *) paicebuf;
+    return paicebuf;
 } /* null_stemmer() */


@ -1514,11 +1602,15 @@ char	*null_stemmer (char *word, DBLK *dblk)
 */
 static char	*euro_lstrupr (char *string, DBLK *dblk)
 {
-    static int		*charmap;
-    static UCHAR	*s;
+    static int		*charmap, len;
+    static char		*s;
+    static wchar_t	wc;
    charmap = dblk->charmap;
-    for (s=(UCHAR *)string;  *s;  s++)
-	*s = charmap[*s] & 0xff;
+    for (s = string;  *s;  s++) {
+	len = euro_mbtowc (&wc, s, string);
+	*s = charmap[wc] & 0xFF;
+	if (len > 1) wctomb (s - 1, *s);
+    }
    return string;
 }