mirror of
git://git.code.sf.net/p/cdesktopenv/code
synced 2025-02-13 11:42:21 +00:00
DtSearch: Migrate to UTF-8.
This commit is contained in:
parent
e38f1ae3cf
commit
3f950009bc
2 changed files with 237 additions and 160 deletions
|
@ -129,8 +129,6 @@
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
|
||||||
#define PROGNAME "JPN"
|
#define PROGNAME "JPN"
|
||||||
#define SS2_CHAR 0x8E /* Single Shift char for Code Set 2 */
|
|
||||||
#define SS3_CHAR 0x8F /* Single Shift char for Code Set 3 */
|
|
||||||
#define EXT_KATAKANA ".ktk"
|
#define EXT_KATAKANA ".ktk"
|
||||||
#define EXT_KANJI ".knj"
|
#define EXT_KANJI ".knj"
|
||||||
#define SUBSTRBUFSZ 100
|
#define SUBSTRBUFSZ 100
|
||||||
|
@ -277,23 +275,7 @@ static char *display_jstate (int js)
|
||||||
*/
|
*/
|
||||||
static int read_jchar (void)
|
static int read_jchar (void)
|
||||||
{
|
{
|
||||||
/* Jstates table for EUC Set 1 (JIS 0208) */
|
char str1[8], str2[8];
|
||||||
static int jstates_set1 [] = {
|
|
||||||
JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
|
|
||||||
JS_ROMAN, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
|
|
||||||
JS_GREEK, JS_CYRILLIC, JS_DISCARD /* 6 - 8 */
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Jstates table for EUC Set 3 (JIS 0212).
|
|
||||||
* Row 5 is presumed to be katakana because
|
|
||||||
* of four new unapproved katakana characters.
|
|
||||||
*/
|
|
||||||
static int jstates_set3 [] = {
|
|
||||||
JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
|
|
||||||
JS_DISCARD, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
|
|
||||||
JS_GREEK, JS_CYRILLIC, JS_DISCARD, /* 6 - 8 */
|
|
||||||
JS_ALPHA, JS_ALPHA, JS_ALPHA /* 9 - 11 */
|
|
||||||
};
|
|
||||||
|
|
||||||
if (readchar_arg) {
|
if (readchar_arg) {
|
||||||
jchar[0] = readchar (readchar_arg);
|
jchar[0] = readchar (readchar_arg);
|
||||||
|
@ -305,75 +287,78 @@ static int read_jchar (void)
|
||||||
return (jstate = JS_ETX);
|
return (jstate = JS_ETX);
|
||||||
readcount++;
|
readcount++;
|
||||||
|
|
||||||
/* Set 1 (JIS 0208) */
|
for (jcharlen = 1; jcharlen < MB_CUR_MAX; ++jcharlen) {
|
||||||
if (jchar[0] >= 0xA1 && jchar[0] <= 0xFE) {
|
jchar[jcharlen] = 0;
|
||||||
jcharlen = 2;
|
if (mblen ((char *) jchar, MB_CUR_MAX) != -1) break;
|
||||||
if (jchar[0] > 0xA8)
|
if ((jchar[jcharlen] = readchar (NULL))) readcount++;
|
||||||
jstate = JS_KANJI;
|
|
||||||
else
|
|
||||||
jstate = jstates_set1 [(jchar[0] & 0x7F) - 32];
|
|
||||||
if ((jchar[1] = readchar (NULL)))
|
|
||||||
readcount++;
|
|
||||||
else
|
|
||||||
jstate = JS_ETX;
|
|
||||||
return jstate;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Set 0 (ASCII) */
|
|
||||||
if (jchar[0] < 0x80) {
|
|
||||||
jcharlen = 1;
|
|
||||||
return (jstate = JS_ASCII);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set 3 (JIS 0212) */
|
/* If jchar is an invalid multibyte sequence,
|
||||||
if (jchar[0] == SS3_CHAR) {
|
|
||||||
jcharlen = 3;
|
|
||||||
/*
|
|
||||||
* Hop over the single shift char to get the first JIS byte.
|
|
||||||
* Make sure first JIS byte is in proper
|
|
||||||
* range to avoid indexing outside of table.
|
|
||||||
*/
|
|
||||||
if ((jchar[1] = readchar (NULL)) == 0)
|
|
||||||
return (jstate = JS_ETX);
|
|
||||||
readcount++;
|
|
||||||
if (jchar[1] < 0xA1)
|
|
||||||
return (jstate = JS_DISCARD);
|
|
||||||
if (jchar[1] > 0xAA)
|
|
||||||
jstate = JS_KANJI;
|
|
||||||
else
|
|
||||||
jstate = jstates_set3 [(*jchar & 0x7F) - 32];
|
|
||||||
|
|
||||||
if ((jchar[2] = readchar (NULL)) == 0)
|
|
||||||
return (jstate = JS_ETX);
|
|
||||||
readcount++;
|
|
||||||
/* JS_ALPHA chars ('miscellaneous alphabetic chars' of
|
|
||||||
* rows 9 - 11) are compatible with several other jstates,
|
|
||||||
* so adjust as necessary.
|
|
||||||
*/
|
|
||||||
if (jstate == JS_ALPHA &&
|
|
||||||
((last_jstate & JS_ALPHA_COMPATIBLE) != 0))
|
|
||||||
jstate = last_jstate;
|
|
||||||
else if (last_jstate == JS_ALPHA &&
|
|
||||||
((jstate & JS_ALPHA_COMPATIBLE) != 0))
|
|
||||||
last_jstate = jstate;
|
|
||||||
return jstate;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Set 2 (half-width katakana) */
|
|
||||||
if (jchar[0] == SS2_CHAR) {
|
|
||||||
jcharlen = 2;
|
|
||||||
jstate = JS_HALFKATA;
|
|
||||||
if ((jchar[1] = readchar (NULL)))
|
|
||||||
readcount++;
|
|
||||||
else
|
|
||||||
jstate = JS_ETX;
|
|
||||||
return jstate;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If first jchar doesn't match expected EUC coding,
|
|
||||||
* discard it until we get back into sync.
|
* discard it until we get back into sync.
|
||||||
*/
|
*/
|
||||||
jcharlen = 1;
|
if (jcharlen == MB_CUR_MAX) return (jstate = JS_DISCARD);
|
||||||
|
|
||||||
|
if (jcharlen == 1) {
|
||||||
|
if (jchar[0] < 0x80) {
|
||||||
|
jcharlen = 1;
|
||||||
|
jstate = JS_ASCII;
|
||||||
|
}
|
||||||
|
else jstate = JS_DISCARD;
|
||||||
|
|
||||||
|
return jstate;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (jcharlen == 2) {
|
||||||
|
str1[0] = 0xC2;
|
||||||
|
str1[1] = 0x80;
|
||||||
|
str1[2] = 0;
|
||||||
|
|
||||||
|
str2[0] = 0xDF;
|
||||||
|
str2[1] = 0xBF;
|
||||||
|
str2[2] = 0;
|
||||||
|
|
||||||
|
if (strcmp ((char *) jchar, str1) >= 0 &&
|
||||||
|
strcmp ((char *) jchar, str2) <= 0)
|
||||||
|
jstate = JS_ROMAN;
|
||||||
|
else jstate = JS_DISCARD;
|
||||||
|
|
||||||
|
return jstate;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (jcharlen == 3) {
|
||||||
|
str1[0] = 0xE4;
|
||||||
|
str1[1] = 0xB8;
|
||||||
|
str1[2] = 0x80;
|
||||||
|
str1[3] = 0;
|
||||||
|
|
||||||
|
str2[0] = 0xE9;
|
||||||
|
str2[1] = 0xBF;
|
||||||
|
str2[2] = 0xBF;
|
||||||
|
str2[3] = 0;
|
||||||
|
|
||||||
|
if (strcmp ((char *) jchar, str1) >= 0 &&
|
||||||
|
strcmp ((char *) jchar, str2) <= 0)
|
||||||
|
jstate = JS_KANJI;
|
||||||
|
else {
|
||||||
|
str1[0] = 0xEF;
|
||||||
|
str1[1] = 0xBD;
|
||||||
|
str1[2] = 0xA6;
|
||||||
|
str1[3] = 0;
|
||||||
|
|
||||||
|
str2[0] = 0xEF;
|
||||||
|
str2[1] = 0xBE;
|
||||||
|
str2[2] = 0x9F;
|
||||||
|
str2[3] = 0;
|
||||||
|
|
||||||
|
if (strcmp ((char *) jchar, str1) >= 0 &&
|
||||||
|
strcmp ((char *) jchar, str2) <= 0)
|
||||||
|
jstate = JS_HALFKATA;
|
||||||
|
else jstate = JS_DISCARD;
|
||||||
|
}
|
||||||
|
|
||||||
|
return jstate;
|
||||||
|
}
|
||||||
|
|
||||||
return (jstate = JS_DISCARD);
|
return (jstate = JS_DISCARD);
|
||||||
} /* read_jchar() */
|
} /* read_jchar() */
|
||||||
|
|
||||||
|
@ -401,7 +386,7 @@ static UCHAR *kanji_compounder (void)
|
||||||
static UCHAR *mysubstrp = NULL;
|
static UCHAR *mysubstrp = NULL;
|
||||||
static UCHAR *mysubstrend = NULL;
|
static UCHAR *mysubstrend = NULL;
|
||||||
static UCHAR *op, *ss;
|
static UCHAR *op, *ss;
|
||||||
static int i;
|
static int i, j, mbl;
|
||||||
|
|
||||||
if (is_new_substring) {
|
if (is_new_substring) {
|
||||||
is_new_substring = FALSE;
|
is_new_substring = FALSE;
|
||||||
|
@ -420,7 +405,7 @@ static UCHAR *kanji_compounder (void)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (++clen > MAX_KANJI_CLEN) {
|
if (++clen > MAX_KANJI_CLEN) {
|
||||||
clen = 1;
|
clen = 1;
|
||||||
mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
|
mysubstrp += mblen ((char *)mysubstrp, MB_CUR_MAX);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -437,15 +422,13 @@ static UCHAR *kanji_compounder (void)
|
||||||
/* Are there enough jchars left in substring? */
|
/* Are there enough jchars left in substring? */
|
||||||
if (ss >= mysubstrend) {
|
if (ss >= mysubstrend) {
|
||||||
clen = 1;
|
clen = 1;
|
||||||
mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
|
mysubstrp += mblen ((char *)mysubstrp, MB_CUR_MAX);
|
||||||
i = 0; /* indicates assembly failure */
|
i = 0; /* indicates assembly failure */
|
||||||
break; /* breaks the for loop */
|
break; /* breaks the for loop */
|
||||||
}
|
}
|
||||||
/* Assemble one jchar into outbuf */
|
/* Assemble one jchar into outbuf */
|
||||||
if (*ss == SS3_CHAR)
|
mbl = mblen ((char *)ss, MB_CUR_MAX);
|
||||||
*op++ = *ss++;
|
for (j = 0; j < mbl; ++j) *op++ = *ss++;
|
||||||
*op++ = *ss++;
|
|
||||||
*op++ = *ss++;
|
|
||||||
}
|
}
|
||||||
/* Did word assembly succeed? */
|
/* Did word assembly succeed? */
|
||||||
if (i >= clen) {
|
if (i >= clen) {
|
||||||
|
@ -498,7 +481,7 @@ static UCHAR *search_kanjitree (void)
|
||||||
|
|
||||||
/* Return first substr jchar as next token */
|
/* Return first substr jchar as next token */
|
||||||
last_node = NULL; /* NULL = tree not searched yet */
|
last_node = NULL; /* NULL = tree not searched yet */
|
||||||
jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
|
jcharlen = mblen ((char *)substrp, MB_CUR_MAX);
|
||||||
strncpy ((char*)outbuf, (char*)substrp, jcharlen);
|
strncpy ((char*)outbuf, (char*)substrp, jcharlen);
|
||||||
outbuf [jcharlen] = 0;
|
outbuf [jcharlen] = 0;
|
||||||
if (offsetp)
|
if (offsetp)
|
||||||
|
@ -524,7 +507,7 @@ EXHAUSTED_TREE:
|
||||||
all_done = TRUE;
|
all_done = TRUE;
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
|
jcharlen = mblen ((char *)substrp, MB_CUR_MAX);
|
||||||
strncpy ((char*)outbuf, (char*)substrp, jcharlen);
|
strncpy ((char*)outbuf, (char*)substrp, jcharlen);
|
||||||
outbuf [jcharlen] = 0;
|
outbuf [jcharlen] = 0;
|
||||||
if (offsetp)
|
if (offsetp)
|
||||||
|
@ -735,6 +718,8 @@ ENTIRE_SUBSTR_IS_WORD:
|
||||||
"%s Program Error: Unknown jstate %d.\n") ,
|
"%s Program Error: Unknown jstate %d.\n") ,
|
||||||
PROGNAME"246", last_jstate);
|
PROGNAME"246", last_jstate);
|
||||||
DtSearchExit (46);
|
DtSearchExit (46);
|
||||||
|
|
||||||
|
return NULL;
|
||||||
} /* parse_substring() */
|
} /* parse_substring() */
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -124,6 +124,7 @@
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
#include <wchar.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
|
||||||
#define X_INCLUDE_STRING_H
|
#define X_INCLUDE_STRING_H
|
||||||
|
@ -170,8 +171,9 @@ int debugging_search_wordtree = FALSE;
|
||||||
int debugging_teskey = FALSE;
|
int debugging_teskey = FALSE;
|
||||||
int debugging_paice = FALSE;
|
int debugging_paice = FALSE;
|
||||||
static int *paice_charmap;
|
static int *paice_charmap;
|
||||||
static UCHAR paicebuf [DtSrMAXWIDTH_HWORD + 2];
|
static char paicebuf [DtSrMAXWIDTH_HWORD + 2];
|
||||||
static int paicelen;
|
static int paicelen;
|
||||||
|
static int paicewcsl;
|
||||||
static int word_is_intact;
|
static int word_is_intact;
|
||||||
|
|
||||||
/* Language strings correspond to DtSrLa.. constants. */
|
/* Language strings correspond to DtSrLa.. constants. */
|
||||||
|
@ -230,7 +232,7 @@ static char *language_name (DtSrINT16 langno)
|
||||||
* Returns TRUE if successful search, else FALSE.
|
* Returns TRUE if successful search, else FALSE.
|
||||||
* See also search_wordtree_jpn() in jpn.c
|
* See also search_wordtree_jpn() in jpn.c
|
||||||
*/
|
*/
|
||||||
static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
|
static int search_wordtree (WORDTREE *wordtree, char *wordstring)
|
||||||
{
|
{
|
||||||
static int direction;
|
static int direction;
|
||||||
static WORDTREE *node;
|
static WORDTREE *node;
|
||||||
|
@ -240,7 +242,7 @@ static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
|
||||||
wordstring);
|
wordstring);
|
||||||
/* MAIN SEARCH LOOP: binary tree search */
|
/* MAIN SEARCH LOOP: binary tree search */
|
||||||
for (node = wordtree; node != NULL; ) {
|
for (node = wordtree; node != NULL; ) {
|
||||||
if ((direction = strcmp ((char *) wordstring, node->word)) == 0) {
|
if ((direction = strcmp (wordstring, node->word)) == 0) {
|
||||||
if (debugging_search_wordtree)
|
if (debugging_search_wordtree)
|
||||||
fprintf (aa_stderr, " HIT!\n");
|
fprintf (aa_stderr, " HIT!\n");
|
||||||
return TRUE;
|
return TRUE;
|
||||||
|
@ -260,6 +262,61 @@ static int search_wordtree (WORDTREE *wordtree, UCHAR *wordstring)
|
||||||
} /* search_wordtree() */
|
} /* search_wordtree() */
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * euro_mbtowc() - decode the character that ENDS at byte 'p'.
 *
 * Intended for callers that walk a multibyte string one byte at a
 * time.  The byte at 'p' is first tried as a complete one-byte
 * character.  If that fails and 'p' is not the first byte of the
 * string 's', the two bytes starting at 'p - 1' are tried instead.
 *
 *   pwc  receives the decoded value, or the sentinel 0x100 when
 *        decoding failed or the value does not fit in 0x00..0xFF.
 *        Must not be NULL.
 *   p    byte being examined.
 *   s    start of the string containing 'p'; never read before it.
 *
 * Returns the result of the last mbtowc() call: the number of bytes
 * decoded (0 for the terminating null byte), or a negative value
 * on failure.
 *
 * Behaviour worth knowing: because the retry starts at 'p - 1', a
 * lead byte that directly follows a single-byte character makes the
 * retry decode that PREVIOUS character (return value 1).  A lead
 * byte at the very start of the string yields 0x100.
 *
 * NOTE(review): callers use *pwc as an index into their character
 * map, so 0x100 must be a valid index there - confirm table sizes.
 */
static int euro_mbtowc (wchar_t *pwc, const char *p, const char *s)
{
    int nbytes;

    /* Attempt 1: the byte at p on its own, from a clean shift state. */
    mbtowc (NULL, NULL, 0);
    nbytes = mbtowc (pwc, p, 1);

    /* Attempt 2: p may be the tail of a two-byte sequence. */
    if (nbytes < 0 && p > s) {
        mbtowc (NULL, NULL, 0);
        nbytes = mbtowc (pwc, p - 1, 2);
    }

    /* Anything undecodable or outside 0x00..0xFF becomes the sentinel. */
    if (nbytes < 0)
        *pwc = 0x100;
    else if (*pwc > 0xFF)
        *pwc = 0x100;

    return nbytes;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * euro_wctomb() - store a character map result into an output buffer.
 *
 *   c     character map entry; only the low 8 bits (the mapped
 *         character value) are used.
 *   outp  where to store the character.  When len > 1 the slot must
 *         have room for the multibyte encoding of the value.
 *   len   number of bytes the source character occupied.
 *
 * When len <= 1 the value is stored as a single byte and the
 * function returns outp + len.
 *
 * When len > 1 the value is encoded with wctomb() and the function
 * returns a pointer just past the bytes actually written.  This
 * matters when the mapped character encodes shorter than the source
 * character: advancing by 'len' would leave the stale second source
 * byte embedded in the output.  If wctomb() cannot encode the value,
 * the slot is left untouched and outp + len is returned.
 */
static char *euro_wctomb (int c, char *outp, int len)
{
    wchar_t wc = c & 0xFF;
    int written;

    if (len <= 1) {
        *outp = wc;
        return outp + len;
    }

    written = wctomb (outp, wc);

    /* Encoding failed: keep the bytes already in the slot. */
    if (written < 1)
        return outp + len;

    return outp + written;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * euro_readchar() - read one character from the text co-function.
 *
 *   cofunction      byte reader; called with cofunction_arg for the
 *                   first byte and with NULL for a second byte.
 *   cofunction_arg  argument for the first read.
 *   outp            receives the raw byte(s) read; must have room
 *                   for two bytes.
 *   pwc             receives the decoded value:
 *                     0      end of text,
 *                     0x100  undecodable, or value above 0xFF,
 *                     else   the character value (0x01..0xFF).
 *
 * Returns the number of bytes stored at outp (1 or 2).
 *
 * At end of text *outp is 0, *pwc is 0 and 1 is returned.  The same
 * result is reported when the text ends immediately after a lead
 * byte; otherwise the terminating 0 would be consumed as if it were
 * a continuation byte and the caller would never see end of text.
 *
 * NOTE(review): at most one byte is read after a lead byte, so
 * sequences longer than two bytes are reported as 0x100 and their
 * remaining bytes are picked up by the next call.
 * NOTE(review): callers use *pwc as an index into their character
 * map, so 0x100 must be a valid index there - confirm table sizes.
 */
static int euro_readchar (READCFP cofunction, void *cofunction_arg, char *outp,
                          wchar_t *pwc)
{
    int len;

    *pwc = 0;

    *outp = cofunction (cofunction_arg);

    /* End of text: *pwc is already 0. */
    if (!(*outp))
        return 1;

    /* A byte that decodes on its own is a complete character. */
    mbtowc (NULL, NULL, 0);
    if (mbtowc (pwc, outp, 1) >= 0)
        return 1;

    /* Otherwise treat it as a lead byte and fetch one more byte. */
    *(outp + 1) = cofunction (NULL);

    if (!(*(outp + 1))) {
        /* Text ended in the middle of a sequence: report end of text. */
        *outp = 0;
        *pwc = 0;
        return 1;
    }

    mbtowc (NULL, NULL, 0);
    len = mbtowc (pwc, outp, 2);

    if (len < 0 || *pwc > 0xFF)
        *pwc = 0x100;

    /* Two bytes were consumed whether or not they decoded. */
    return 2;
}
|
||||||
|
|
||||||
|
|
||||||
/************************************************/
|
/************************************************/
|
||||||
/* */
|
/* */
|
||||||
/* teskey_parser */
|
/* teskey_parser */
|
||||||
|
@ -312,10 +369,10 @@ char *teskey_parser (PARG *parg)
|
||||||
static READCFP cofunction;
|
static READCFP cofunction;
|
||||||
static void *cofunction_arg;
|
static void *cofunction_arg;
|
||||||
static DBLK *dblk = NULL;
|
static DBLK *dblk = NULL;
|
||||||
static UCHAR *outbuf = NULL;
|
static char *outbuf = NULL;
|
||||||
static size_t outbufsz = 0;
|
static size_t outbufsz = 0;
|
||||||
static UCHAR *endmaxword; /* end largest possible output word */
|
static char *endmaxword; /* end largest possible output word */
|
||||||
static UCHAR *outp; /* next loc in outbuf */
|
static char *outp; /* next loc in outbuf */
|
||||||
static int *charmap;
|
static int *charmap;
|
||||||
static int minwordsz, maxwordsz;
|
static int minwordsz, maxwordsz;
|
||||||
static int wordlen;
|
static int wordlen;
|
||||||
|
@ -324,6 +381,8 @@ char *teskey_parser (PARG *parg)
|
||||||
static long *offsetp, readcount, candidate_offset;
|
static long *offsetp, readcount, candidate_offset;
|
||||||
static int is_hiliting;
|
static int is_hiliting;
|
||||||
static int add_msgs;
|
static int add_msgs;
|
||||||
|
static int len;
|
||||||
|
static wchar_t wc;
|
||||||
|
|
||||||
/* If first call for current text block... */
|
/* If first call for current text block... */
|
||||||
if (parg) {
|
if (parg) {
|
||||||
|
@ -376,8 +435,12 @@ char *teskey_parser (PARG *parg)
|
||||||
READ_ANOTHER_WORD:
|
READ_ANOTHER_WORD:
|
||||||
outp = outbuf;
|
outp = outbuf;
|
||||||
tpstate = BETW_WORDS;
|
tpstate = BETW_WORDS;
|
||||||
while ((*outp = cofunction (cofunction_arg))) {
|
for (;;) {
|
||||||
readcount++;
|
len = euro_readchar (cofunction, cofunction_arg, outp, &wc);
|
||||||
|
|
||||||
|
if (!wc) break;
|
||||||
|
|
||||||
|
readcount += len;
|
||||||
cofunction_arg = NULL;
|
cofunction_arg = NULL;
|
||||||
|
|
||||||
/*------------- BETW_WORDS State ------------
|
/*------------- BETW_WORDS State ------------
|
||||||
|
@ -387,15 +450,14 @@ READ_ANOTHER_WORD:
|
||||||
/*
|
/*
|
||||||
* Discard nonconcordable chars between words.
|
* Discard nonconcordable chars between words.
|
||||||
*/
|
*/
|
||||||
if ((charmap[*outp] & NON_CONCORD) != 0)
|
if ((charmap[wc] & NON_CONCORD) != 0)
|
||||||
continue;
|
continue;
|
||||||
/*
|
/*
|
||||||
* Fully concordable char is definite start of new word.
|
* Fully concordable char is definite start of new word.
|
||||||
* Convert to uppercase and go get next char.
|
* Convert to uppercase and go get next char.
|
||||||
*/
|
*/
|
||||||
if ((charmap[*outp] & CONCORDABLE) != 0) {
|
if ((charmap[wc] & CONCORDABLE) != 0) {
|
||||||
*outp = charmap[*outp] & 0x00ff;
|
outp = euro_wctomb (charmap[wc], outp, len);
|
||||||
outp++;
|
|
||||||
candidate_offset = readcount;
|
candidate_offset = readcount;
|
||||||
tpstate = IN_WORD;
|
tpstate = IN_WORD;
|
||||||
continue;
|
continue;
|
||||||
|
@ -407,18 +469,18 @@ READ_ANOTHER_WORD:
|
||||||
* to uppercase and go get next char.
|
* to uppercase and go get next char.
|
||||||
* Otherwise discard just like non_concord.
|
* Otherwise discard just like non_concord.
|
||||||
*/
|
*/
|
||||||
outp++;
|
outp += len;
|
||||||
if ((*outp = cofunction(NULL)))
|
len = euro_readchar (cofunction, NULL, outp, &wc);
|
||||||
readcount++;
|
readcount += len;
|
||||||
if ((charmap[*outp] & CONCORDABLE) != 0) {
|
|
||||||
*outp = charmap[*outp] & 0x00ff;
|
if ((charmap[wc] & CONCORDABLE) != 0) {
|
||||||
outp++;
|
outp = euro_wctomb (charmap[wc], outp, len);
|
||||||
candidate_offset = readcount - 1;
|
candidate_offset = readcount - len;
|
||||||
tpstate = IN_WORD;
|
tpstate = IN_WORD;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
outp--;
|
outp -= len;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} /* endif BETW_WORDS */
|
} /* endif BETW_WORDS */
|
||||||
|
@ -431,10 +493,9 @@ READ_ANOTHER_WORD:
|
||||||
* Non_concords treatment depends on next char.
|
* Non_concords treatment depends on next char.
|
||||||
*/
|
*/
|
||||||
else if (tpstate == IN_WORD) {
|
else if (tpstate == IN_WORD) {
|
||||||
if ((charmap[*outp] & CONCORDABLE) != 0) {
|
if ((charmap[wc] & CONCORDABLE) != 0) {
|
||||||
if (outp < endmaxword) {
|
if (outp < endmaxword) {
|
||||||
*outp = charmap[*outp] & 0x00ff;
|
outp = euro_wctomb (charmap[wc], outp, len);
|
||||||
outp++;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
tpstate = TOO_LONG;
|
tpstate = TOO_LONG;
|
||||||
|
@ -457,18 +518,18 @@ READ_ANOTHER_WORD:
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if ((charmap[*outp] & NON_CONCORD) != 0) {
|
if ((charmap[wc] & NON_CONCORD) != 0) {
|
||||||
*outp = '\0';
|
*outp = '\0';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* Must be opt_concord... */
|
/* Must be opt_concord... */
|
||||||
outp++;
|
outp += len;
|
||||||
if ((*outp = cofunction(NULL)))
|
len = euro_readchar (cofunction, NULL, outp, &wc);
|
||||||
readcount++;
|
readcount += len;
|
||||||
if ((charmap[*outp] & CONCORDABLE) != 0) {
|
|
||||||
|
if ((charmap[wc] & CONCORDABLE) != 0) {
|
||||||
if (outp < endmaxword) {
|
if (outp < endmaxword) {
|
||||||
*outp = charmap[*outp] & 0x00ff; /* uppercase */
|
outp = euro_wctomb (charmap[wc], outp, len);
|
||||||
outp++;
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
tpstate = TOO_LONG;
|
tpstate = TOO_LONG;
|
||||||
|
@ -482,7 +543,8 @@ READ_ANOTHER_WORD:
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
else { /* next char NOT concordable...*/
|
else { /* next char NOT concordable...*/
|
||||||
*(--outp) = '\0';
|
outp -= len;
|
||||||
|
*outp = '\0';
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} /* endif IN_WORD */
|
} /* endif IN_WORD */
|
||||||
|
@ -494,7 +556,7 @@ READ_ANOTHER_WORD:
|
||||||
* can get between words again with a clear non_concord.
|
* can get between words again with a clear non_concord.
|
||||||
*/
|
*/
|
||||||
else if (tpstate == TOO_LONG) {
|
else if (tpstate == TOO_LONG) {
|
||||||
if ((charmap[*outp] & NON_CONCORD) != 0) {
|
if ((charmap[wc] & NON_CONCORD) != 0) {
|
||||||
outp = outbuf;
|
outp = outbuf;
|
||||||
tpstate = BETW_WORDS;
|
tpstate = BETW_WORDS;
|
||||||
}
|
}
|
||||||
|
@ -524,7 +586,7 @@ READ_ANOTHER_WORD:
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
wordlen = strlen ((char *) outbuf);
|
wordlen = strlen (outbuf);
|
||||||
candidate_offset--; /* token offset is one less than number of reads */
|
candidate_offset--; /* token offset is one less than number of reads */
|
||||||
if (debugging_teskey)
|
if (debugging_teskey)
|
||||||
fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
|
fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
|
||||||
|
@ -586,7 +648,7 @@ GOOD_WORD:
|
||||||
fprintf (aa_stderr, ", ...good word\n");
|
fprintf (aa_stderr, ", ...good word\n");
|
||||||
if (offsetp)
|
if (offsetp)
|
||||||
*offsetp = candidate_offset;
|
*offsetp = candidate_offset;
|
||||||
return (char *) outbuf;
|
return outbuf;
|
||||||
} /* teskey_parser() */
|
} /* teskey_parser() */
|
||||||
|
|
||||||
|
|
||||||
|
@ -602,10 +664,13 @@ GOOD_WORD:
|
||||||
*/
|
*/
|
||||||
int is_concordable (char *word, int *charmap)
|
int is_concordable (char *word, int *charmap)
|
||||||
{
|
{
|
||||||
UCHAR *cptr;
|
char *cptr;
|
||||||
for (cptr = (UCHAR *)word; *cptr != 0; cptr++)
|
wchar_t wc;
|
||||||
if ((charmap[*cptr] & NON_CONCORD) != 0)
|
for (cptr = word; *cptr != 0; cptr++) {
|
||||||
|
euro_mbtowc (&wc, cptr, word);
|
||||||
|
if ((charmap[wc] & NON_CONCORD) != 0)
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
return (*cptr == 0);
|
return (*cptr == 0);
|
||||||
} /* is_concordable() */
|
} /* is_concordable() */
|
||||||
|
|
||||||
|
@ -1087,6 +1152,8 @@ static int load_paice_suffixes (DBLK *dblk, DBLK *dblist)
|
||||||
int must_be_intact, is_last_rule;
|
int must_be_intact, is_last_rule;
|
||||||
UCHAR remove_count;
|
UCHAR remove_count;
|
||||||
int lineno, errcount;
|
int lineno, errcount;
|
||||||
|
int len;
|
||||||
|
wchar_t wc;
|
||||||
_Xstrtokparams strtok_buf;
|
_Xstrtokparams strtok_buf;
|
||||||
|
|
||||||
dblk->stem_extra = NULL; /* just to be sure */
|
dblk->stem_extra = NULL; /* just to be sure */
|
||||||
|
@ -1183,9 +1250,11 @@ static int load_paice_suffixes (DBLK *dblk, DBLK *dblist)
|
||||||
if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
|
if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
for (cptr = suffix; cptr; cptr++)
|
for (cptr = suffix; cptr; cptr++) {
|
||||||
if ((dblk->charmap[*cptr] & NUMERAL) == 0)
|
euro_mbtowc (&wc, (char *)cptr, (char *)suffix);
|
||||||
|
if ((dblk->charmap[wc] & NUMERAL) == 0)
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
if (*cptr == '\0')
|
if (*cptr == '\0')
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
@ -1252,8 +1321,11 @@ BAD_RULE:
|
||||||
prule->remove_count = remove_count;
|
prule->remove_count = remove_count;
|
||||||
prule->is_last_rule = is_last_rule;
|
prule->is_last_rule = is_last_rule;
|
||||||
if (apndstr) {
|
if (apndstr) {
|
||||||
prule->apndstr = (UCHAR *) strdup ((char*)apndstr);
|
len = mbstowcs (NULL, (char *)apndstr, 0);
|
||||||
prule->aplen = strlen ((char*)apndstr);
|
if (len != -1) {
|
||||||
|
prule->apndstr = (UCHAR *) strdup ((char*)apndstr);
|
||||||
|
prule->aplen = len;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
prule_link = &rules_table[suffix[0]];
|
prule_link = &rules_table[suffix[0]];
|
||||||
|
@ -1292,7 +1364,8 @@ BAD_RULE:
|
||||||
*/
|
*/
|
||||||
static int is_matching_rule (PRULE *rule)
|
static int is_matching_rule (PRULE *rule)
|
||||||
{
|
{
|
||||||
static UCHAR *ptr;
|
static char *ptr;
|
||||||
|
static wchar_t wc;
|
||||||
static int i, j;
|
static int i, j;
|
||||||
|
|
||||||
if (debugging_paice)
|
if (debugging_paice)
|
||||||
|
@ -1328,7 +1401,7 @@ static int is_matching_rule (PRULE *rule)
|
||||||
* Used to algorithmically test remaining stem length
|
* Used to algorithmically test remaining stem length
|
||||||
* after tentative application of rule.
|
* after tentative application of rule.
|
||||||
*/
|
*/
|
||||||
i = paicelen - (rule->remove_count - rule->aplen);
|
i = paicewcsl - (rule->remove_count - rule->aplen);
|
||||||
|
|
||||||
if (i <= 1) {
|
if (i <= 1) {
|
||||||
if (debugging_paice)
|
if (debugging_paice)
|
||||||
|
@ -1337,7 +1410,11 @@ static int is_matching_rule (PRULE *rule)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == 2) {
|
if (i == 2) {
|
||||||
if (IS_VOWEL (paicebuf[0])) {
|
euro_mbtowc (&wc, paicebuf, paicebuf);
|
||||||
|
|
||||||
|
if (!IS_VOWEL (wc)) euro_mbtowc (&wc, paicebuf + 1, paicebuf);
|
||||||
|
|
||||||
|
if (IS_VOWEL (wc)) {
|
||||||
if (debugging_paice)
|
if (debugging_paice)
|
||||||
fputs (", and short vowel stem valid.\n", aa_stderr);
|
fputs (", and short vowel stem valid.\n", aa_stderr);
|
||||||
return TRUE;
|
return TRUE;
|
||||||
|
@ -1355,13 +1432,15 @@ static int is_matching_rule (PRULE *rule)
|
||||||
* Otherwise it's not.
|
* Otherwise it's not.
|
||||||
*/
|
*/
|
||||||
for (j=0; j<i; j++) {
|
for (j=0; j<i; j++) {
|
||||||
if (IS_VOWEL (paicebuf[j])) {
|
euro_mbtowc (&wc, &paicebuf[j], paicebuf);
|
||||||
|
|
||||||
|
if (IS_VOWEL (wc)) {
|
||||||
GOOD_STEM:
|
GOOD_STEM:
|
||||||
if (debugging_paice)
|
if (debugging_paice)
|
||||||
fputs (", and remaining stem valid.\n", aa_stderr);
|
fputs (", and remaining stem valid.\n", aa_stderr);
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
if (j > 0 && paicebuf[j] == 'Y')
|
if (j > 0 && wc == L'Y')
|
||||||
goto GOOD_STEM;
|
goto GOOD_STEM;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1389,7 +1468,8 @@ GOOD_STEM:
|
||||||
*/
|
*/
|
||||||
static char *paice_stemmer (char *wordin, DBLK *dblk)
|
static char *paice_stemmer (char *wordin, DBLK *dblk)
|
||||||
{
|
{
|
||||||
UCHAR finalc;
|
wchar_t finalwc;
|
||||||
|
int len;
|
||||||
PRULE *rule, **rules_table;
|
PRULE *rule, **rules_table;
|
||||||
|
|
||||||
if (wordin == NULL)
|
if (wordin == NULL)
|
||||||
|
@ -1409,24 +1489,31 @@ static char *paice_stemmer (char *wordin, DBLK *dblk)
|
||||||
* prefix ^O that identifies a stem. (But this
|
* prefix ^O that identifies a stem. (But this
|
||||||
* stemmer doesn't actually insert the ^O now.)
|
* stemmer doesn't actually insert the ^O now.)
|
||||||
*/
|
*/
|
||||||
strncpy ((char*)paicebuf, wordin, DtSrMAXWIDTH_HWORD);
|
strncpy (paicebuf, wordin, DtSrMAXWIDTH_HWORD);
|
||||||
paicebuf [DtSrMAXWIDTH_HWORD - 2] = 0;
|
|
||||||
|
if (mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 2], 1) == -1 &&
|
||||||
|
mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 3], 2) != -1)
|
||||||
|
paicebuf[DtSrMAXWIDTH_HWORD - 3] = 0;
|
||||||
|
else paicebuf[DtSrMAXWIDTH_HWORD - 2] = 0;
|
||||||
|
|
||||||
paice_charmap = dblk->charmap;
|
paice_charmap = dblk->charmap;
|
||||||
word_is_intact = TRUE;
|
word_is_intact = TRUE;
|
||||||
|
|
||||||
for (;;) { /*-------- Main Stemming Loop ---------*/
|
for (;;) { /*-------- Main Stemming Loop ---------*/
|
||||||
|
|
||||||
paicelen = strlen ((char*)paicebuf);
|
paicelen = strlen (paicebuf);
|
||||||
finalc = *(paicebuf + paicelen - 1);
|
paicewcsl = mbstowcs (NULL, paicebuf, 0);
|
||||||
|
len = euro_mbtowc (&finalwc, paicebuf + paicelen - 1, paicebuf);
|
||||||
|
|
||||||
if (debugging_paice) {
|
if (debugging_paice) {
|
||||||
fprintf (aa_stderr,
|
fwprintf (aa_stderr,
|
||||||
"paice: '%s', rules list '%c' for database '%s'\n",
|
L"paice: '%s', rules list '%lc' for database '%s'\n",
|
||||||
paicebuf, finalc, dblk->name);
|
paicebuf, finalwc, dblk->name);
|
||||||
fflush (aa_stderr);
|
fflush (aa_stderr);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Look for a matching rule */
|
/* Look for a matching rule */
|
||||||
if ((rule = rules_table [finalc]) == NULL) {
|
if ((rule = rules_table [finalwc]) == NULL) {
|
||||||
if (debugging_paice)
|
if (debugging_paice)
|
||||||
fputs (" list is null, stop.\n", aa_stderr);
|
fputs (" list is null, stop.\n", aa_stderr);
|
||||||
break;
|
break;
|
||||||
|
@ -1438,8 +1525,8 @@ static char *paice_stemmer (char *wordin, DBLK *dblk)
|
||||||
}
|
}
|
||||||
if (rule == NULL) {
|
if (rule == NULL) {
|
||||||
if (debugging_paice)
|
if (debugging_paice)
|
||||||
fprintf (aa_stderr, " rules list '%c' is exhausted, stop.\n",
|
fwprintf (aa_stderr,
|
||||||
finalc);
|
L" rules list '%lc' is exhausted, stop.\n", finalwc);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1452,10 +1539,11 @@ static char *paice_stemmer (char *wordin, DBLK *dblk)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
paicebuf [paicelen - rule->remove_count] = 0;
|
paicebuf [paicelen - len * rule->remove_count] = 0;
|
||||||
if (rule->aplen)
|
if (rule->aplen)
|
||||||
strcat ((char*)paicebuf, (char*)rule->apndstr);
|
strcat (paicebuf, (char*)rule->apndstr);
|
||||||
paicelen = strlen ((char*)paicebuf);
|
paicelen = strlen (paicebuf);
|
||||||
|
paicewcsl = mbstowcs (NULL, paicebuf, 0);
|
||||||
word_is_intact = FALSE; /* we've removed at least 1 suffix */
|
word_is_intact = FALSE; /* we've removed at least 1 suffix */
|
||||||
if (debugging_paice)
|
if (debugging_paice)
|
||||||
fprintf (aa_stderr, "--> '%s'", paicebuf);
|
fprintf (aa_stderr, "--> '%s'", paicebuf);
|
||||||
|
@ -1478,7 +1566,7 @@ static char *paice_stemmer (char *wordin, DBLK *dblk)
|
||||||
fprintf (aa_stderr, " final stem: '%s'\n", paicebuf);
|
fprintf (aa_stderr, " final stem: '%s'\n", paicebuf);
|
||||||
fflush (aa_stderr);
|
fflush (aa_stderr);
|
||||||
}
|
}
|
||||||
return (char *) paicebuf;
|
return paicebuf;
|
||||||
} /* paice_stemmer() */
|
} /* paice_stemmer() */
|
||||||
|
|
||||||
|
|
||||||
|
@ -1498,9 +1586,9 @@ char *null_stemmer (char *word, DBLK *dblk)
|
||||||
return "";
|
return "";
|
||||||
if (*word == '\0')
|
if (*word == '\0')
|
||||||
return "";
|
return "";
|
||||||
strncpy ((char *)paicebuf, word, DtSrMAXWIDTH_HWORD);
|
strncpy (paicebuf, word, DtSrMAXWIDTH_HWORD);
|
||||||
paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
|
paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
|
||||||
return (char *) paicebuf;
|
return paicebuf;
|
||||||
} /* null_stemmer() */
|
} /* null_stemmer() */
|
||||||
|
|
||||||
|
|
||||||
|
@ -1514,11 +1602,15 @@ char *null_stemmer (char *word, DBLK *dblk)
|
||||||
*/
|
*/
|
||||||
static char *euro_lstrupr (char *string, DBLK *dblk)
|
static char *euro_lstrupr (char *string, DBLK *dblk)
|
||||||
{
|
{
|
||||||
static int *charmap;
|
static int *charmap, len;
|
||||||
static UCHAR *s;
|
static char *s;
|
||||||
|
static wchar_t wc;
|
||||||
charmap = dblk->charmap;
|
charmap = dblk->charmap;
|
||||||
for (s=(UCHAR *)string; *s; s++)
|
for (s = string; *s; s++) {
|
||||||
*s = charmap[*s] & 0xff;
|
len = euro_mbtowc (&wc, s, string);
|
||||||
|
*s = charmap[wc] & 0xFF;
|
||||||
|
if (len > 1) wctomb (s - 1, *s);
|
||||||
|
}
|
||||||
return string;
|
return string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue