From 3f950009bc353922ca255990120890ae986f0bf0 Mon Sep 17 00:00:00 2001 From: Liang Chang Date: Tue, 9 Nov 2021 02:20:45 +0800 Subject: [PATCH] DtSearch: Migrate to UTF-8. --- cde/lib/DtSearch/jpn.c | 171 ++++++++++++++---------------- cde/lib/DtSearch/lang.c | 226 ++++++++++++++++++++++++++++------------ 2 files changed, 237 insertions(+), 160 deletions(-) diff --git a/cde/lib/DtSearch/jpn.c b/cde/lib/DtSearch/jpn.c index 08629f723..0faaf2fd7 100644 --- a/cde/lib/DtSearch/jpn.c +++ b/cde/lib/DtSearch/jpn.c @@ -129,8 +129,6 @@ #include #define PROGNAME "JPN" -#define SS2_CHAR 0x8E /* Single Shift char for Code Set 2 */ -#define SS3_CHAR 0x8F /* Single Shift char for Code Set 3 */ #define EXT_KATAKANA ".ktk" #define EXT_KANJI ".knj" #define SUBSTRBUFSZ 100 @@ -277,23 +275,7 @@ static char *display_jstate (int js) */ static int read_jchar (void) { - /* Jstates table for EUC Set 1 (JIS 0208) */ - static int jstates_set1 [] = { - JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */ - JS_ROMAN, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */ - JS_GREEK, JS_CYRILLIC, JS_DISCARD /* 6 - 8 */ - }; - - /* Jstates table for EUC Set 3 (JIS 0212). - * Row 5 is presumed to be katakana because - * of four new unapproved katakana characters. - */ - static int jstates_set3 [] = { - JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */ - JS_DISCARD, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */ - JS_GREEK, JS_CYRILLIC, JS_DISCARD, /* 6 - 8 */ - JS_ALPHA, JS_ALPHA, JS_ALPHA /* 9 - 11 */ - }; + char str1[8], str2[8]; if (readchar_arg) { jchar[0] = readchar (readchar_arg); @@ -305,75 +287,78 @@ static int read_jchar (void) return (jstate = JS_ETX); readcount++; - /* Set 1 (JIS 0208) */ - if (jchar[0] >= 0xA1 && jchar[0] <= 0xFE) { - jcharlen = 2; - if (jchar[0] > 0xA8) - jstate = JS_KANJI; - else - jstate = jstates_set1 [(jchar[0] & 0x7F) - 32]; - if ((jchar[1] = readchar (NULL))) - readcount++; - else - jstate = JS_ETX; - return jstate; - } - - /* Set 0 (ASCII) */ - if (jchar[0] < 0x80) { - jcharlen = 1; - return (jstate = JS_ASCII); + for (jcharlen = 1; jcharlen < MB_CUR_MAX; ++jcharlen) { + jchar[jcharlen] = 0; + if (mblen ((char *) jchar, MB_CUR_MAX) != -1) break; + if ((jchar[jcharlen] = readchar (NULL))) readcount++; } - /* Set 3 (JIS 0212) */ - if (jchar[0] == SS3_CHAR) { - jcharlen = 3; - /* - * Hop over the single shift char to get the first JIS byte. - * Make sure first JIS byte is in proper - * range to avoid indexing outside of table. - */ - if ((jchar[1] = readchar (NULL)) == 0) - return (jstate = JS_ETX); - readcount++; - if (jchar[1] < 0xA1) - return (jstate = JS_DISCARD); - if (jchar[1] > 0xAA) - jstate = JS_KANJI; - else - jstate = jstates_set3 [(*jchar & 0x7F) - 32]; - - if ((jchar[2] = readchar (NULL)) == 0) - return (jstate = JS_ETX); - readcount++; - /* JS_ALPHA chars ('miscellaneous alphabetic chars' of - * rows 9 - 11) are compatible with several other jstates, - * so adjust as necessary. - */ - if (jstate == JS_ALPHA && - ((last_jstate & JS_ALPHA_COMPATIBLE) != 0)) - jstate = last_jstate; - else if (last_jstate == JS_ALPHA && - ((jstate & JS_ALPHA_COMPATIBLE) != 0)) - last_jstate = jstate; - return jstate; - } - - /* Set 2 (half-width katakana) */ - if (jchar[0] == SS2_CHAR) { - jcharlen = 2; - jstate = JS_HALFKATA; - if ((jchar[1] = readchar (NULL))) - readcount++; - else - jstate = JS_ETX; - return jstate; - } - - /* If first jchar doesn't match expected EUC coding, + /* If jchar is an invalid multibyte sequence, * discard it until we get back into sync. */ - jcharlen = 1; + if (jcharlen == MB_CUR_MAX) return (jstate = JS_DISCARD); + + if (jcharlen == 1) { + if (jchar[0] < 0x80) { + jcharlen = 1; + jstate = JS_ASCII; + } + else jstate = JS_DISCARD; + + return jstate; + } + + if (jcharlen == 2) { + str1[0] = 0xC2; + str1[1] = 0x80; + str1[2] = 0; + + str2[0] = 0xDF; + str2[1] = 0xBF; + str2[2] = 0; + + if (strcmp ((char *) jchar, str1) >= 0 && + strcmp ((char *) jchar, str2) <= 0) + jstate = JS_ROMAN; + else jstate = JS_DISCARD; + + return jstate; + } + + if (jcharlen == 3) { + str1[0] = 0xE4; + str1[1] = 0xB8; + str1[2] = 0x80; + str1[3] = 0; + + str2[0] = 0xE9; + str2[1] = 0xBF; + str2[2] = 0xBF; + str2[3] = 0; + + if (strcmp ((char *) jchar, str1) >= 0 && + strcmp ((char *) jchar, str2) <= 0) + jstate = JS_KANJI; + else { + str1[0] = 0xEF; + str1[1] = 0xBD; + str1[2] = 0xA6; + str1[3] = 0; + + str2[0] = 0xEF; + str2[1] = 0xBE; + str2[2] = 0x9F; + str2[3] = 0; + + if (strcmp ((char *) jchar, str1) >= 0 && + strcmp ((char *) jchar, str2) <= 0) + jstate = JS_HALFKATA; + else jstate = JS_DISCARD; + } + + return jstate; + } + return (jstate = JS_DISCARD); } /* read_jchar() */ @@ -401,7 +386,7 @@ static UCHAR *kanji_compounder (void) static UCHAR *mysubstrp = NULL; static UCHAR *mysubstrend = NULL; static UCHAR *op, *ss; - static int i; + static int i, j, mbl; if (is_new_substring) { is_new_substring = FALSE; @@ -420,7 +405,7 @@ static UCHAR *kanji_compounder (void) return NULL; if (++clen > MAX_KANJI_CLEN) { clen = 1; - mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2; + mysubstrp += mblen ((char *)mysubstrp, MB_CUR_MAX); } } @@ -437,15 +422,13 @@ static UCHAR *kanji_compounder (void) /* Are there enough jchars left in substring? */ if (ss >= mysubstrend) { clen = 1; - mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2; + mysubstrp += mblen ((char *)mysubstrp, MB_CUR_MAX); i = 0; /* indicates assembly failure */ break; /* breaks the for loop */ } /* Assemble one jchar into outbuf */ - if (*ss == SS3_CHAR) - *op++ = *ss++; - *op++ = *ss++; - *op++ = *ss++; + mbl = mblen ((char *)ss, MB_CUR_MAX); + for (j = 0; j < mbl; ++j) *op++ = *ss++; } /* Did word assembly succeed? */ if (i >= clen) { @@ -498,7 +481,7 @@ static UCHAR *search_kanjitree (void) /* Return first substr jchar as next token */ last_node = NULL; /* NULL = tree not searched yet */ - jcharlen = (*substrp == SS3_CHAR)? 3 : 2; + jcharlen = mblen ((char *)substrp, MB_CUR_MAX); strncpy ((char*)outbuf, (char*)substrp, jcharlen); outbuf [jcharlen] = 0; if (offsetp) @@ -524,7 +507,7 @@ EXHAUSTED_TREE: all_done = TRUE; return NULL; } - jcharlen = (*substrp == SS3_CHAR)? 3 : 2; + jcharlen = mblen ((char *)substrp, MB_CUR_MAX); strncpy ((char*)outbuf, (char*)substrp, jcharlen); outbuf [jcharlen] = 0; if (offsetp) @@ -735,6 +718,8 @@ ENTIRE_SUBSTR_IS_WORD: "%s Program Error: Unknown jstate %d.\n") , PROGNAME"246", last_jstate); DtSearchExit (46); + + return NULL; } /* parse_substring() */ diff --git a/cde/lib/DtSearch/lang.c b/cde/lib/DtSearch/lang.c index 5b2ff3f5c..10ce6db0a 100644 --- a/cde/lib/DtSearch/lang.c +++ b/cde/lib/DtSearch/lang.c @@ -124,6 +124,7 @@ #include #include #include +#include #include #define X_INCLUDE_STRING_H @@ -170,8 +171,9 @@ int debugging_search_wordtree = FALSE; int debugging_teskey = FALSE; int debugging_paice = FALSE; static int *paice_charmap; -static UCHAR paicebuf [DtSrMAXWIDTH_HWORD + 2]; +static char paicebuf [DtSrMAXWIDTH_HWORD + 2]; static int paicelen; +static int paicewcsl; static int word_is_intact; /* Language strings correspond to DtSrLa.. constants. */ @@ -230,7 +232,7 @@ static char *language_name (DtSrINT16 langno) * Returns TRUE if successful search, else FALSE. * See also search_wordtree_jpn() in jpn.c */ -static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring) +static int search_wordtree (WORDTREE *wordtree, char *wordstring) { static int direction; static WORDTREE *node; @@ -240,7 +242,7 @@ static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring) wordstring); /* MAIN SEARCH LOOP: binary tree search */ for (node = wordtree; node != NULL; ) { - if ((direction = strcmp ((char *) wordstring, node->word)) == 0) { + if ((direction = strcmp (wordstring, node->word)) == 0) { if (debugging_search_wordtree) fprintf (aa_stderr, " HIT!\n"); return TRUE; @@ -260,6 +262,61 @@ static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring) } /* search_wordtree() */ +static int euro_mbtowc (wchar_t *pwc, const char *p, const char *s) +{ + int len; + + mbtowc (NULL, NULL, 0); + len = mbtowc (pwc, p, 1); + + if (len < 0 && p > s) { + mbtowc (NULL, NULL, 0); + len = mbtowc (pwc, p - 1, 2); + } + + if (len < 0 || *pwc > 0xFF) *pwc = 0x100; + + return len; +} + + +static char *euro_wctomb (int c, char *outp, int len) +{ + wchar_t wc = c & 0xFF; + if (len > 1) wctomb (outp, wc); + else *outp = wc; + return outp + len; +} + + +static int euro_readchar (READCFP cofunction, void *cofunction_arg, char *outp, + wchar_t *pwc) +{ + int len = 1; + + *pwc = 0; + + *outp = cofunction (cofunction_arg); + + if (!(*outp)) goto done; + + mbtowc (NULL, NULL, 0); + + if (mbtowc (pwc, outp, 1) >= 0) goto done; + + *(outp + 1) = cofunction (NULL); + mbtowc (NULL, NULL, 0); + len = mbtowc (pwc, outp, 2); + + if (len < 0 || *pwc > 0xFF) *pwc = 0x100; + + len = 2; + +done: + return len; +} + + /************************************************/ /* */ /* teskey_parser */ @@ -312,10 +369,10 @@ char *teskey_parser (PARG *parg) static READCFP cofunction; static void *cofunction_arg; static DBLK *dblk = NULL; - static UCHAR *outbuf = NULL; + static char *outbuf = NULL; static size_t outbufsz = 0; - static UCHAR *endmaxword; /* end largest possible output word */ - static UCHAR *outp; /* next loc in outbuf */ + static char *endmaxword; /* end largest possible output word */ + static char *outp; /* next loc in outbuf */ static int *charmap; static int minwordsz, maxwordsz; static int wordlen; @@ -324,6 +381,8 @@ char *teskey_parser (PARG *parg) static long *offsetp, readcount, candidate_offset; static int is_hiliting; static int add_msgs; + static int len; + static wchar_t wc; /* If first call for current text block... */ if (parg) { @@ -376,8 +435,12 @@ char *teskey_parser (PARG *parg) READ_ANOTHER_WORD: outp = outbuf; tpstate = BETW_WORDS; - while ((*outp = cofunction (cofunction_arg))) { - readcount++; + for (;;) { + len = euro_readchar (cofunction, cofunction_arg, outp, &wc); + + if (!wc) break; + + readcount += len; cofunction_arg = NULL; /*------------- BETW_WORDS State ------------ @@ -387,15 +450,14 @@ READ_ANOTHER_WORD: /* * Discard nonconcordable chars between words. */ - if ((charmap[*outp] & NON_CONCORD) != 0) + if ((charmap[wc] & NON_CONCORD) != 0) continue; /* * Fully concordable char is definite start of new word. * Convert to uppercase and go get next char. */ - if ((charmap[*outp] & CONCORDABLE) != 0) { - *outp = charmap[*outp] & 0x00ff; - outp++; + if ((charmap[wc] & CONCORDABLE) != 0) { + outp = euro_wctomb (charmap[wc], outp, len); candidate_offset = readcount; tpstate = IN_WORD; continue; @@ -407,18 +469,18 @@ READ_ANOTHER_WORD: * to uppercase and go get next char. * Otherwise discard just like non_concord. */ - outp++; - if ((*outp = cofunction(NULL))) - readcount++; - if ((charmap[*outp] & CONCORDABLE) != 0) { - *outp = charmap[*outp] & 0x00ff; - outp++; - candidate_offset = readcount - 1; + outp += len; + len = euro_readchar (cofunction, NULL, outp, &wc); + readcount += len; + + if ((charmap[wc] & CONCORDABLE) != 0) { + outp = euro_wctomb (charmap[wc], outp, len); + candidate_offset = readcount - len; tpstate = IN_WORD; continue; } else { - outp--; + outp -= len; continue; } } /* endif BETW_WORDS */ @@ -431,10 +493,9 @@ READ_ANOTHER_WORD: * Non_concords treatment depends on next char. */ else if (tpstate == IN_WORD) { - if ((charmap[*outp] & CONCORDABLE) != 0) { + if ((charmap[wc] & CONCORDABLE) != 0) { if (outp < endmaxword) { - *outp = charmap[*outp] & 0x00ff; - outp++; + outp = euro_wctomb (charmap[wc], outp, len); } else { tpstate = TOO_LONG; @@ -457,18 +518,18 @@ READ_ANOTHER_WORD: } continue; } - if ((charmap[*outp] & NON_CONCORD) != 0) { + if ((charmap[wc] & NON_CONCORD) != 0) { *outp = '\0'; break; } /* Must be opt_concord... */ - outp++; - if ((*outp = cofunction(NULL))) - readcount++; - if ((charmap[*outp] & CONCORDABLE) != 0) { + outp += len; + len = euro_readchar (cofunction, NULL, outp, &wc); + readcount += len; + + if ((charmap[wc] & CONCORDABLE) != 0) { if (outp < endmaxword) { - *outp = charmap[*outp] & 0x00ff; /* uppercase */ - outp++; + outp = euro_wctomb (charmap[wc], outp, len); } else { tpstate = TOO_LONG; @@ -482,7 +543,8 @@ READ_ANOTHER_WORD: continue; } else { /* next char NOT concordable...*/ - *(--outp) = '\0'; + outp -= len; + *outp = '\0'; break; } } /* endif IN_WORD */ @@ -494,7 +556,7 @@ READ_ANOTHER_WORD: * can get between words again with a clear non_concord. */ else if (tpstate == TOO_LONG) { - if ((charmap[*outp] & NON_CONCORD) != 0) { + if ((charmap[wc] & NON_CONCORD) != 0) { outp = outbuf; tpstate = BETW_WORDS; } @@ -524,7 +586,7 @@ READ_ANOTHER_WORD: return NULL; } - wordlen = strlen ((char *) outbuf); + wordlen = strlen (outbuf); candidate_offset--; /* token offset is one less than number of reads */ if (debugging_teskey) fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"", @@ -586,7 +648,7 @@ GOOD_WORD: fprintf (aa_stderr, ", ...good word\n"); if (offsetp) *offsetp = candidate_offset; - return (char *) outbuf; + return outbuf; } /* teskey_parser() */ @@ -602,10 +664,13 @@ GOOD_WORD: */ int is_concordable (char *word, int *charmap) { - UCHAR *cptr; - for (cptr = (UCHAR *)word; *cptr != 0; cptr++) - if ((charmap[*cptr] & NON_CONCORD) != 0) + char *cptr; + wchar_t wc; + for (cptr = word; *cptr != 0; cptr++) { + euro_mbtowc (&wc, cptr, word); + if ((charmap[wc] & NON_CONCORD) != 0) break; + } return (*cptr == 0); } /* is_concordable() */ @@ -1087,6 +1152,8 @@ static int load_paice_suffixes (DBLK *dblk, DBLK *dblist) int must_be_intact, is_last_rule; UCHAR remove_count; int lineno, errcount; + int len; + wchar_t wc; _Xstrtokparams strtok_buf; dblk->stem_extra = NULL; /* just to be sure */ @@ -1183,9 +1250,11 @@ static int load_paice_suffixes (DBLK *dblk, DBLK *dblist) if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL) continue; - for (cptr = suffix; cptr; cptr++) - if ((dblk->charmap[*cptr] & NUMERAL) == 0) + for (cptr = suffix; cptr; cptr++) { + euro_mbtowc (&wc, (char *)cptr, (char *)suffix); + if ((dblk->charmap[wc] & NUMERAL) == 0) break; + } if (*cptr == '\0') continue; @@ -1252,8 +1321,11 @@ BAD_RULE: prule->remove_count = remove_count; prule->is_last_rule = is_last_rule; if (apndstr) { - prule->apndstr = (UCHAR *) strdup ((char*)apndstr); - prule->aplen = strlen ((char*)apndstr); + len = mbstowcs (NULL, (char *)apndstr, 0); + if (len != -1) { + prule->apndstr = (UCHAR *) strdup ((char*)apndstr); + prule->aplen = len; + } } prule_link = &rules_table[suffix[0]]; @@ -1292,7 +1364,8 @@ BAD_RULE: */ static int is_matching_rule (PRULE *rule) { - static UCHAR *ptr; + static char *ptr; + static wchar_t wc; static int i, j; if (debugging_paice) @@ -1328,7 +1401,7 @@ static int is_matching_rule (PRULE *rule) * Used to algorithmically test remaining stem length * after tentative application of rule. */ - i = paicelen - (rule->remove_count - rule->aplen); + i = paicewcsl - (rule->remove_count - rule->aplen); if (i <= 1) { if (debugging_paice) @@ -1337,7 +1410,11 @@ static int is_matching_rule (PRULE *rule) } if (i == 2) { - if (IS_VOWEL (paicebuf[0])) { + euro_mbtowc (&wc, paicebuf, paicebuf); + + if (!IS_VOWEL (wc)) euro_mbtowc (&wc, paicebuf + 1, paicebuf); + + if (IS_VOWEL (wc)) { if (debugging_paice) fputs (", and short vowel stem valid.\n", aa_stderr); return TRUE; @@ -1355,13 +1432,15 @@ static int is_matching_rule (PRULE *rule) * Otherwise it's not. */ for (j=0; j 0 && paicebuf[j] == 'Y') + if (j > 0 && wc == L'Y') goto GOOD_STEM; } @@ -1389,7 +1468,8 @@ GOOD_STEM: */ static char *paice_stemmer (char *wordin, DBLK *dblk) { - UCHAR finalc; + wchar_t finalwc; + int len; PRULE *rule, **rules_table; if (wordin == NULL) @@ -1409,24 +1489,31 @@ static char *paice_stemmer (char *wordin, DBLK *dblk) * prefix ^O that identifies a stem. (But this * stemmer doesn't actually insert the ^O now.) */ - strncpy ((char*)paicebuf, wordin, DtSrMAXWIDTH_HWORD); - paicebuf [DtSrMAXWIDTH_HWORD - 2] = 0; + strncpy (paicebuf, wordin, DtSrMAXWIDTH_HWORD); + + if (mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 2], 1) == -1 && + mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 3], 2) != -1) + paicebuf[DtSrMAXWIDTH_HWORD - 3] = 0; + else paicebuf[DtSrMAXWIDTH_HWORD - 2] = 0; + paice_charmap = dblk->charmap; word_is_intact = TRUE; for (;;) { /*-------- Main Stemming Loop ---------*/ - paicelen = strlen ((char*)paicebuf); - finalc = *(paicebuf + paicelen - 1); + paicelen = strlen (paicebuf); + paicewcsl = mbstowcs (NULL, paicebuf, 0); + len = euro_mbtowc (&finalwc, paicebuf + paicelen - 1, paicebuf); + if (debugging_paice) { - fprintf (aa_stderr, - "paice: '%s', rules list '%c' for database '%s'\n", - paicebuf, finalc, dblk->name); + fwprintf (aa_stderr, + L"paice: '%s', rules list '%lc' for database '%s'\n", + paicebuf, finalwc, dblk->name); fflush (aa_stderr); } /* Look for a matching rule */ - if ((rule = rules_table [finalc]) == NULL) { + if ((rule = rules_table [finalwc]) == NULL) { if (debugging_paice) fputs (" list is null, stop.\n", aa_stderr); break; @@ -1438,8 +1525,8 @@ static char *paice_stemmer (char *wordin, DBLK *dblk) } if (rule == NULL) { if (debugging_paice) - fprintf (aa_stderr, " rules list '%c' is exhausted, stop.\n", - finalc); + fwprintf (aa_stderr, + L" rules list '%lc' is exhausted, stop.\n", finalwc); break; } @@ -1452,10 +1539,11 @@ static char *paice_stemmer (char *wordin, DBLK *dblk) break; } - paicebuf [paicelen - rule->remove_count] = 0; + paicebuf [paicelen - len * rule->remove_count] = 0; if (rule->aplen) - strcat ((char*)paicebuf, (char*)rule->apndstr); - paicelen = strlen ((char*)paicebuf); + strcat (paicebuf, (char*)rule->apndstr); + paicelen = strlen (paicebuf); + paicewcsl = mbstowcs (NULL, paicebuf, 0); word_is_intact = FALSE; /* we've removed at least 1 suffix */ if (debugging_paice) fprintf (aa_stderr, "--> '%s'", paicebuf); @@ -1478,7 +1566,7 @@ static char *paice_stemmer (char *wordin, DBLK *dblk) fprintf (aa_stderr, " final stem: '%s'\n", paicebuf); fflush (aa_stderr); } - return (char *) paicebuf; + return paicebuf; } /* paice_stemmer() */ @@ -1498,9 +1586,9 @@ char *null_stemmer (char *word, DBLK *dblk) return ""; if (*word == '\0') return ""; - strncpy ((char *)paicebuf, word, DtSrMAXWIDTH_HWORD); + strncpy (paicebuf, word, DtSrMAXWIDTH_HWORD); paicebuf [DtSrMAXWIDTH_HWORD-1] = 0; - return (char *) paicebuf; + return paicebuf; } /* null_stemmer() */ @@ -1514,11 +1602,15 @@ char *null_stemmer (char *word, DBLK *dblk) */ static char *euro_lstrupr (char *string, DBLK *dblk) { - static int *charmap; - static UCHAR *s; + static int *charmap, len; + static char *s; + static wchar_t wc; charmap = dblk->charmap; - for (s=(UCHAR *)string; *s; s++) - *s = charmap[*s] & 0xff; + for (s = string; *s; s++) { + len = euro_mbtowc (&wc, s, string); + *s = charmap[wc] & 0xFF; + if (len > 1) wctomb (s - 1, *s); + } return string; }