1
0
Fork 0
mirror of git://git.code.sf.net/p/cdesktopenv/code synced 2025-03-09 15:50:02 +00:00
cde/cde/lib/DtSearch/lang.c
2022-01-26 19:50:11 +08:00

1770 lines
50 KiB
C

/*
* CDE - Common Desktop Environment
*
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
*
* These libraries and programs are free software; you can
* redistribute them and/or modify them under the terms of the GNU
* Lesser General Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* These libraries and programs are distributed in the hope that
* they will be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with these libraries and programs; if not, write
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
* Floor, Boston, MA 02110-1301 USA
*/
/*
* COMPONENT_NAME: austext
*
* FUNCTIONS: euro_lstrupr
* free_wordtree
* is_concordable
* language_name
* load_include_list
* load_language
* load_paice_suffixes
* load_stop_list
* load_wordtree
* null_lstrupr
* null_stemmer
* paice_stemmer
* search_wordtree
* teskey_parser
* unload_language
*
* ORIGINS: 27
*
*
* (C) COPYRIGHT International Business Machines Corp. 1995,1996
* All Rights Reserved
* Licensed Materials - Property of IBM
* US Government Users Restricted Rights - Use, duplication or
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
*/
/******************** LANG.C ********************
* $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
* July 1995.
* Includes load_language(), unload_language(), and functions and data for
* parsing and stemming European languages in DtSearch/AusText.
* Incorporates p/o socrates.c, p/o proctext.c, parser.c
* delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
* Related to similar semantic modules repackaged into semantic.c.
* Paice suffix removal algorithm from C. Paice, 1990,
* "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
*
* $Log$
* Revision 2.13 1996/03/25 18:55:26 miker
* Changed FILENAME_MAX to _POSIX_PATH_MAX.
*
* Revision 2.12 1996/03/25 17:00:19 miker
* Cleanup compiler warning.
*
* Revision 2.11 1996/03/13 22:58:13 miker
* Changed char to UCHAR several places.
*
* Revision 2.10 1996/03/05 16:49:58 miker
* Move COMMENT_CHARS to SearchP.h.
*
* Revision 2.9 1996/03/05 16:31:20 miker
* Added test of PA_MSGS for yacc-based boolean queries.
* Made comment chars in linguistic files independent of locale.
* Changed several char ptrs to unsigned char so parser will
* work when compiled under default signed char compilers.
* Simplified several statements with LHS *var++ for same reason.
*
* Revision 2.8 1996/02/05 16:16:05 miker
* Restore prolog.
*
* Revision 2.7 1996/02/05 16:10:54 miker
* load_paice_suffixes: discard .sfx lines beginning with all numeric
* first token for compatibility with older file formats.
*
* Revision 2.6 1996/02/01 19:11:43 miker
* AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
* Moved charmaps to new module langmap.c. Removed hard coded
* paice stemmer values--now dynamic from .sfx file.
*
* Revision 2.5 1995/10/26 14:55:28 miker
* Added prolog.
*
* Revision 2.4 1995/10/19 20:54:36 miker
* Increased msg buf sizes to accommodate larger database file names.
*
* Revision 2.3 1995/10/06 14:39:45 miker
* Bug fix: coredump loading multiple databases
* on Solaris.
*
* Revision 2.2 1995/10/03 21:39:10 miker
* Changed teskey_parser, paice_stemmer, and null_stemmer
* to return number of words parsed/stemmed, not just boolean.
*
* Revision 2.1 1995/09/22 21:00:19 miker
* Freeze DtSearch 0.1, AusText 2.1.8
*
* Revision 1.3 1995/09/19 22:08:28 miker
* Added support for loading and parsing Japanese language DtSrLaJPN.
*
* Revision 1.2 1995/09/05 21:34:52 miker
* Fixed bug: search engine wouldn't parse words of exactly
* 3 or 15 chars.
*
* Revision 1.1 1995/08/31 21:03:44 miker
* Initial revision
*/
#include "SearchP.h"
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <wchar.h>
#include <sys/stat.h>
#define X_INCLUDE_STRING_H
#define XOS_USE_NO_LOCKING
#include <X11/Xos_r.h>
/* Module identifier. It is string-concatenated with a message number
 * (eg PROGNAME"801") to tag diagnostics written by this file.
 */
#define PROGNAME "LANG"
#define EXT_SUFFIX ".sfx" /* standard paice suffix file format */
/* NOTE(review): OUTBUFSZ is not referenced in the visible part of this file. */
#define OUTBUFSZ 6140
/* Token delimiters: space, tab, newline. Presumably used when tokenizing
 * .sfx file lines -- confirm in load_paice_suffixes().
 */
#define SFX_DELIMS " \t\n"
/* Message catalog set numbers, passed as the set argument of CATGETS() */
#define MS_misc 1
#define MS_lang 15
/* Nonzero if the paice_charmap entry for character c has the VOWEL bit set.
 * The argument is cast to UCHAR so it is a safe array index even where
 * plain char is signed.
 */
#define IS_VOWEL(c) ((paice_charmap [(UCHAR)c] & VOWEL) != 0)
/************************************************/
/* */
/* PRULE */
/* */
/************************************************/
/* One Paice suffix removal rule, as read from a .sfx file.
 * Rules are chained into singly linked lists through 'link'.
 * free_paice_rules() releases 'suffix', 'apndstr', and the node itself
 * with free(), so all three are expected to be heap allocations.
 */
typedef struct prule_t {
struct prule_t *link; /* Ptr to next list node */
UCHAR *suffix; /* Applicable suffix string, backwards */
UCHAR suflen; /* Length of suffix */
char must_be_intact; /* Optional '*'. Rule only applies
* to intact words */
UCHAR remove_count; /* Number of suffix chars to remove */
UCHAR aplen; /* Length of apndstr */
UCHAR *apndstr; /* Optional append string (may be NULL) */
char is_last_rule; /* '$' terminate or '>' continue algorithm */
} PRULE;
/* Prototypes for functions defined outside this file.
 * ensure_end_slash() is called here on a path string just before a file
 * name is appended to it; presumably it appends a trailing '/' when one
 * is missing -- confirm against its definition.
 */
char *ensure_end_slash (char *pathstr);
void unload_jpn_language (DBLK *dblk);
/************************************************/
/* */
/* GLOBALS */
/* */
/************************************************/
/* Debug trace switches. When nonzero the corresponding functions
 * write progress messages to aa_stderr.
 */
int debugging_loadlang = FALSE; /* stop/include/suffix file loading */
int debugging_loadword = FALSE; /* per-word trace in load_wordtree() */
int debugging_search_wordtree = FALSE; /* per-node trace in search_wordtree() */
int debugging_teskey = FALSE; /* per-token trace in teskey_parser() */
int debugging_paice = FALSE; /* presumably paice stemmer trace; not used in the visible part of the file */
/* Character map indexed by IS_VOWEL(); the remaining statics are
 * presumably paice stemmer working storage (their uses are outside
 * the visible part of this file -- TODO confirm).
 */
static int *paice_charmap;
static char paicebuf [DtSrMAXWIDTH_HWORD + 2];
static int paicelen;
static int paicewcsl;
static int word_is_intact;
/* Base file names of the default per-language linguistic files.
 * Indexed by language number; entries correspond to the DtSrLa..
 * constants. The table is terminated by a NULL entry.
 */
static char *lang_fnames[] = {
"eng", /* 0 */
"eng", /* 1 ('eng2' same files as 'eng') */
"esp", /* 2 */
"fra", /* 3 */
"ita", /* 4 */
"deu", /* 5 */
"jpn", /* 6 */
"jpn", /* 7 ('jpn2' same files as 'jpn') */
NULL
};
/************************************************/
/* */
/* language_name */
/* */
/************************************************/
/* Maps a language number to a printable language name.
 * Negative numbers yield "INVALID!", numbers above DtSrLaLAST
 * yield "(Custom Language)", everything else is looked up in
 * a table indexed by the DtSrLa.. constants.
 */
static char *language_name (DtSrINT16 langno)
{
    static char *language_names[] = {
        "English-ASCII",	/* 0 = DtSrLaENG */
        "English-Latin1",	/* 1 = DtSrLaENG2 */
        "Spanish",		/* 2 = DtSrLaESP */
        "French",		/* 3 = DtSrLaFRA */
        "Italian",		/* 4 = DtSrLaITA */
        "German",		/* 5 = DtSrLaDEU */
        "Japanese-comp",	/* 6 = DtSrLaJPN */
        "Japanese-.knj"		/* 7 = DtSrLaJPN2 */
    };

    /* Built-in language: direct table lookup */
    if (langno >= 0 && langno <= DtSrLaLAST)
        return language_names [langno];

    /* Out of table range on one side or the other */
    return (langno < 0) ? "INVALID!" : "(Custom Language)";
} /* language_name() */
/************************************************/
/* */
/* search_wordtree */
/* */
/************************************************/
/* Looks up a word in a binary WORDTREE (include list or stop list).
 * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
 * 'wordstring' is compared with strcmp() against each node's word,
 * so it is presumed to be a clean, uppercase, \0 terminated token.
 * Working variables are kept static, as in the original
 * implementation ("for speed").
 * Returns TRUE if the word is in the tree, else FALSE.
 * See also search_wordtree_jpn() in jpn.c
 */
static int search_wordtree (WORDTREE *wordtree, char *wordstring)
{
    static int direction;
    static WORDTREE *node;

    if (debugging_search_wordtree)
        fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
            wordstring);

    /* Walk down from the root until a match or a NULL link */
    node = wordtree;
    while (node != NULL) {
        direction = strcmp (wordstring, node->word);
        if (direction == 0) {
            if (debugging_search_wordtree)
                fprintf (aa_stderr, " HIT!\n");
            return TRUE;
        }

        /* Trace which branch is taken from this node */
        if (debugging_search_wordtree)
            fprintf (aa_stderr, " %c '%s'\n",
                (direction < 0) ? 'L' : 'R', (char *) node->word);

        /* Smaller words live on the left, larger on the right */
        node = (direction < 0) ? node->llink : node->rlink;
    }

    if (debugging_search_wordtree)
        fprintf (aa_stderr, " MISS.\n");
    return FALSE;
} /* search_wordtree() */
/* Converts the character at position 'p' of the string starting at 's'
 * to a wide character in *pwc.
 *  - p before s: nothing is converted.
 *  - byte at p in 0x00..0x7F: returned as is, length 1.
 *  - any other byte: the two bytes starting at p - 1 are handed to
 *    mbtowc() (after resetting its shift state); not possible at p == s.
 * Returns the length reported for the character, or a negative value
 * on failure. On failure, or when the result exceeds 0xFF, *pwc is
 * set to the out-of-range marker 0x100.
 */
static int euro_mbtowc (wchar_t *pwc, const char *p, const char *s)
{
    int nbytes = -1;

    if (p >= s) {
        if (*p >= 0 && *p <= 0x7F) {
            /* plain 7-bit character */
            *pwc = *p;
            nbytes = 1;
        }
        else if (p != s) {
            /* reset conversion state, then decode the pair ending at p */
            mbtowc (NULL, NULL, 0);
            nbytes = mbtowc (pwc, p - 1, 2);
        }
    }

    /* *pwc is only examined when a conversion actually succeeded */
    if (nbytes < 0 || *pwc > 0xFF)
        *pwc = 0x100;
    return nbytes;
}
/* Stores the low 8 bits of 'c' at 'outp'. When 'len' is greater
 * than one the value is written with wctomb(), otherwise it is
 * stored directly as a single byte.
 * Returns outp advanced by 'len', whatever number of bytes
 * was actually written.
 */
static char *euro_wctomb (int c, char *outp, int len)
{
    wchar_t low8 = c & 0xFF;

    if (len <= 1)
        *outp = low8;
    else
        wctomb (outp, low8);
    return outp + len;
}
/* Reads one character from the reader cofunction into 'outp' and
 * returns its wide value in *pwc.
 * The first byte is read with 'cofunction_arg'; every further byte
 * is read by calling the cofunction with NULL.
 *  - A first byte in 0x00..0x7F is the whole character.
 *  - Otherwise a second byte is read and the pair is decoded with mbtowc().
 *  - If that fails, *pwc becomes 0x100 and more bytes are read until
 *    mblen() accepts the sequence or MB_CUR_MAX bytes have been read.
 * Wide values above 0xFF are replaced by the marker 0x100.
 * Returns the number of bytes stored at 'outp'.
 */
static int euro_readchar (READCFP cofunction, void *cofunction_arg, char *outp,
    wchar_t *pwc)
{
    int nread = 1;

    outp[0] = cofunction (cofunction_arg);
    *pwc = outp[0];

    if (*pwc < 0 || *pwc > 0x7F) {
        /* not 7-bit: fetch a second byte and try a 2-byte decode */
        outp[1] = cofunction (NULL);
        nread = 2;
        mbtowc (NULL, NULL, 0);
        if (mbtowc (pwc, outp, 2) < 0) {
            /* undecodable so far: keep reading until the sequence
             * is complete or the locale maximum is reached */
            *pwc = 0x100;
            while (nread < MB_CUR_MAX) {
                outp[nread] = cofunction (NULL);
                nread++;
                mblen (NULL, 0);
                if (mblen (outp, nread) >= 0)
                    break;
            }
        }
    }

    if (*pwc > 0xFF)
        *pwc = 0x100;
    return nread;
}
/************************************************/
/* */
/* teskey_parser */
/* */
/************************************************/
/* 1989.
* Teskey_parser() is derived from the former Socrates() in socrates.c.
* Returns next teskey-parsed word token from a character stream.
* Called from (1) dtsrindex, where readchar_ftext() cofunction
* reads the .fzk file document 'stream', or (2) search engine
* query parsers, where readchar_string() cofunction 'reads'
* from the query string.
* (The word hiliting parser does not directly call teskey_parser; it has
* its own simplified equivalent to the parsing algorithms herein.)
*
* First call passes args in PARG structure. This resets end of
* text block (ETX) flag, resets 'offset' counter to zero, etc.
* Subsequent calls should pass NULL, and parser returns
* next token in block, until reader cofunction reads ETX,
* ie special ETX char ('\0'). Subsequent calls to parser
* return NULL meaning "no tokens left in current stream".
* Reader cofunctions tolerate repeated calls after
* the first ETX, still returning '\0'.
*
* This parser presumes all incoming text is unformatted.
* Since parser accesses streams a char at a time it does
* not require periodic line feeds or anything else.
*
* Parser also returns offset information: number of bytes
* since beginning of text block.
*
* Variables are static for speeeeeeed.
*
* OUTPUT FORMAT: NULL or a static C string containing a single
* parsed word token. Word buffer reused at next call.
* Each word is translated as follows:
* All alphas TO UPPERCASE.
* Teskey algorithm used to find word boundaries.
* Always keeps include-list words.
* Throws away stoplist words, very short words, and very long words.
* All intervening nonconcordables discarded.
*
* There is a slight mod to the published Teskey algorithm.
* Words can begin with optionally concordable chars
* but not end with them. For example if '-' is optionally
* concordable, '-foo-' will be parsed into '-foo'.
*/
char *teskey_parser (PARG *parg)
{
    /* Parser state persists between calls: the first call for a text
     * block passes a PARG, continuation calls pass NULL.
     */
    static READCFP cofunction;
    static void *cofunction_arg;
    static DBLK *dblk = NULL;
    static char *outbuf = NULL;
    static size_t outbufsz = 0;
    static char *endmaxword;	/* end largest possible output word */
    static char *outp;		/* next loc in outbuf */
    static int *charmap;
    static int minwordsz, maxwordsz;
    static int wordlen;
    static enum {BETW_WORDS, IN_WORD, TOO_LONG}
        tpstate;
    static long *offsetp, readcount, candidate_offset;
    static int is_hiliting;
    static int add_msgs;
    static int len, opt_len;
    static wchar_t wc;
    /* Text quoted in user messages. Saved at the first call because
     * 'parg' is NULL on continuation calls and must not be
     * dereferenced there. Never NULL.
     */
    static const char *msg_text = "";

    /* If first call for current text block... */
    if (parg) {
        dblk = parg->dblk;
        minwordsz = dblk->dbrec.or_minwordsz;
        maxwordsz = dblk->dbrec.or_maxwordsz;
        charmap = dblk->charmap;
        offsetp = parg->offsetp;
        is_hiliting = (parg->flags & PA_HILITING);
        add_msgs = (parg->flags & PA_MSGS);
        msg_text = (parg->string != NULL) ?
            (const char *) parg->string : "";
        if (charmap == NULL) {
            fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 4,
                "%s dblk not initialized.\n"),
                PROGNAME"801");
            DtSearchExit (55);
        }
        /* Select the reader cofunction: string first, then file */
        if (parg->string) {
            cofunction_arg = parg->string;
            cofunction = (READCFP) readchar_string;
        }
        else if (parg->ftext) {
            cofunction_arg = parg;
            cofunction = (READCFP) readchar_ftext;
        }
        else {
            fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 5,
                "%s Program Error: parg contains neither file nor string.\n"),
                PROGNAME"327");
            DtSearchExit (27);
        }
        /* (Re)allocate the output buffer if this database allows
         * longer words than any seen so far. The extra bytes leave
         * room for the character being examined past 'endmaxword'.
         */
        if (outbufsz <= maxwordsz) {
            if (outbuf)
                free (outbuf);
            outbufsz = maxwordsz + 8;
            outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
        }
        endmaxword = outbuf + maxwordsz;
        if (debugging_teskey)
            fprintf (aa_stderr,
                "teskey: start of text block, maxwsz=%d outbufsz=%lu\n",
                maxwordsz, (unsigned long) outbufsz);
        readcount = 0L;
    }

    /* CANDIDATE WORD LOOP: Read text chars into outbuf.
     * Exit loop when outbuf contains one candidate token or at ETX.
     * Each char is read directly into *outp, so an ETX read at the
     * current position also terminates the string in outbuf.
     */
READ_ANOTHER_WORD:
    outp = outbuf;
    tpstate = BETW_WORDS;
    for (;;) {
        len = euro_readchar (cofunction, cofunction_arg, outp, &wc);
        if (!wc) break;
        readcount += len;
        cofunction_arg = NULL;

        /*------------- BETW_WORDS State ------------
         * Reader is between word tokens.
         */
        if (tpstate == BETW_WORDS) {
            /*
             * Discard nonconcordable chars between words.
             */
            if ((charmap[wc] & NON_CONCORD) != 0)
                continue;
            /*
             * Fully concordable char is definite start of new word.
             * Convert to uppercase and go get next char.
             */
            if ((charmap[wc] & CONCORDABLE) != 0) {
                outp = euro_wctomb (charmap[wc], outp, len);
                candidate_offset = readcount;
                tpstate = IN_WORD;
                continue;
            }
            /*
             * Must be optionally concordable. It can only
             * start a new word if next char is concordable.
             * If so, convert a fully concordable char
             * to uppercase and go get next char.
             * Otherwise discard just like non_concord.
             */
            outp += len;
            opt_len = euro_readchar (cofunction, NULL, outp, &wc);
            if (wc) readcount += opt_len;
            if ((charmap[wc] & CONCORDABLE) != 0) {
                outp = euro_wctomb (charmap[wc], outp, opt_len);
                candidate_offset = readcount - opt_len;
                tpstate = IN_WORD;
                continue;
            }
            else {
                outp -= len;
                continue;
            }
        } /* endif BETW_WORDS */

        /*------------- IN_WORD State ------------
         * Reader is in middle of a word.
         * Convert all concordables to uppercase and append.
         * Terminate word at first non_concord.
         * Non_concords treatment depends on next char.
         */
        else if (tpstate == IN_WORD) {
            if ((charmap[wc] & CONCORDABLE) != 0) {
                if (outp < endmaxword) {
                    outp = euro_wctomb (charmap[wc], outp, len);
                }
                else {
                    tpstate = TOO_LONG;
                    if (debugging_teskey)
                        fprintf (aa_stderr,
                            "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
                            candidate_offset-1, outbuf);
                    if (add_msgs) {
                        char msgbuf [DtSrMAXWIDTH_HWORD + 100];
                        /* bounded: label and word text are not
                         * limited to the buffer size */
                        snprintf (msgbuf, sizeof(msgbuf),
                            CATGETS(dtsearch_catd, MS_lang, 8,
                            "%s '%.*s...' is larger\n"
                            "than the maximum word size of database '%s'.") ,
                            PROGNAME"449", maxwordsz,
                            msg_text, dblk->label);
                        DtSearchAddMessage (msgbuf);
                        return NULL;
                    }
                    outbuf[0] = 0;
                    outp = outbuf;
                }
                continue;
            }
            if ((charmap[wc] & NON_CONCORD) != 0) {
                *outp = '\0';
                break;
            }
            /* Must be opt_concord... */
            outp += len;
            opt_len = euro_readchar (cofunction, NULL, outp, &wc);
            if (wc) readcount += opt_len;
            if ((charmap[wc] & CONCORDABLE) != 0) {
                if (outp < endmaxword) {
                    outp = euro_wctomb (charmap[wc], outp, opt_len);
                }
                else {
                    tpstate = TOO_LONG;
                    if (debugging_teskey)
                        fprintf (aa_stderr,
                            "teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
                            candidate_offset-1, outbuf);
                    outbuf[0] = 0;
                    outp = outbuf;
                }
                continue;
            }
            else { /* next char NOT concordable...*/
                outp -= len;
                *outp = '\0';
                break;
            }
        } /* endif IN_WORD */

        /*------------- TOO_LONG State ------------
         * Reader is in middle of a word that exceeds max word size.
         * Discard all concordables and opt_concords until we
         * can get between words again with a clear non_concord.
         */
        else if (tpstate == TOO_LONG) {
            if ((charmap[wc] & NON_CONCORD) != 0) {
                outp = outbuf;
                tpstate = BETW_WORDS;
            }
            continue;
        }

        /*------------- UNKNOWN State ------------*/
        else {
            fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 10,
                "%s Program Error: Unknown parser state.\n"),
                PROGNAME"306");
            DtSearchExit (26);
        }
    } /* end read loop for next CANDIDATE WORD */

    /*---------- TEST FOR ETX -------------*/
    if (outbuf[0] == 0) {
        if (debugging_teskey)
            fprintf (aa_stderr, "teskey: etx\n");
        if (add_msgs) {
            char msgbuf [200];
            snprintf (msgbuf, sizeof(msgbuf),
                CATGETS(dtsearch_catd, MS_lang, 12,
                "%s '%.120s' is not a valid word in database '%s'.") ,
                PROGNAME"506", msg_text, dblk->label);
            DtSearchAddMessage (msgbuf);
        }
        return NULL;
    }
    wordlen = strlen (outbuf);
    candidate_offset--; /* token offset is one less than number of reads */
    if (debugging_teskey)
        fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
            candidate_offset, outbuf);
    if (is_hiliting) {
        if (debugging_teskey)
            fprintf (aa_stderr, ", (hiliting, skip tree searches)");
        goto GOOD_WORD;
    }

    /*--------- INCLUDE LIST ----------
     * Search before testing for stoplist or minimum word length.
     */
    if (dblk->inclist != NULL) {
        if (search_wordtree (dblk->inclist, outbuf)) {
            if (debugging_teskey)
                fprintf (aa_stderr, ", (INCLUDE LIST)");
            goto GOOD_WORD;
        }
    }

    /*--------- TOO SHORT -----------*/
    if (wordlen < minwordsz) {
        if (debugging_teskey)
            fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
        if (add_msgs) {
            char msgbuf [200];
            snprintf (msgbuf, sizeof(msgbuf),
                CATGETS(dtsearch_catd, MS_lang, 17,
                "%s '%s' is less than the\n"
                "minimum word size of database '%s'.") ,
                PROGNAME"543", msg_text, dblk->label);
            DtSearchAddMessage (msgbuf);
            return NULL;
        }
        goto READ_ANOTHER_WORD;
    }

    /*----------- STOP LIST -------------*/
    if (dblk->stoplist != NULL) {
        if (search_wordtree (dblk->stoplist, outbuf)) {
            if (debugging_teskey)
                fprintf (aa_stderr, ", (STOP LIST)\n");
            if (add_msgs) {
                char msgbuf [200];
                snprintf (msgbuf, sizeof(msgbuf),
                    CATGETS(dtsearch_catd, MS_lang, 19,
                    "%s The word '%s' is not indexed in database '%s'.") ,
                    PROGNAME"558", msg_text, dblk->label);
                DtSearchAddMessage (msgbuf);
                return NULL;
            }
            goto READ_ANOTHER_WORD;
        }
    }

GOOD_WORD:
    /* Word is correctly parsed and passes all dblk filters. */
    if (debugging_teskey)
        fprintf (aa_stderr, ", ...good word\n");
    if (offsetp)
        *offsetp = candidate_offset;
    return outbuf;
} /* teskey_parser() */
/************************************************/
/* */
/* is_concordable */
/* */
/************************************************/
/* Checks a word token against a teskey character map.
 * Each position of 'word' is converted with euro_mbtowc() and looked
 * up in 'charmap'; the scan stops at the first character flagged
 * NON_CONCORD. Used in validating word files.
 * Returns nonzero (TRUE) if the end of the word was reached without
 * meeting a nonconcordable character, else zero (FALSE).
 */
int is_concordable (char *word, int *charmap)
{
    char *scan = word;
    wchar_t wc;

    while (*scan != 0) {
        euro_mbtowc (&wc, scan, word);
        if ((charmap[wc] & NON_CONCORD) != 0)
            return 0;	/* stopped before the terminator */
        scan++;
    }
    return 1;		/* whole word scanned */
} /* is_concordable() */
/************************************************/
/* */
/* load_wordtree */
/* */
/************************************************/
/* Called by load_stop_list(), load_include_list(), etc,
 * to read an appropriate word list file into binary tree structures.
 *
 * INPUT FILE FORMAT: One word per line, all chars teskey concordable.
 * Preferred order is frequency of occurrence in the corpus
 * to make searches efficient. Otherwise the words should at least
 * be in random order or an order that will approximate a binary search.
 * If first char is any of COMMENT_CHARS, line is ignored as comments.
 * Ascii spaces, tabs, or newline delimits the first word token--
 * anything else on the line is ignored as comments, including
 * whatever part of an over-long line does not fit the read buffer.
 * Optionally characters in word token will be checked for teskey
 * concordability.
 *
 * ARGUMENTS:
 *   treetop         address of the tree's root pointer; new nodes are
 *                   linked below it. Each node and its word string
 *                   are allocated together in one block.
 *   dblk            supplies lstrupr() and the charmap for validation.
 *   fname           name of the word file to read.
 *   do_teskey_test  if nonzero, reject words with nonconcordable chars.
 *
 * Duplicate words produce a message but are not fatal.
 * After the first invalid word no further nodes are added, but the
 * rest of the file is still validated.
 *
 * RETURNS 0 if file successfully loaded, returns 1 if file missing,
 * returns 2 and messages in global msglist if file has fatal errors.
 */
int load_wordtree (
    WORDTREE **treetop,
    DBLK *dblk,
    char *fname,
    int do_teskey_test)
{
    int i;
    int errcount;
    int is_duplicate;
    long linecount = 0;
    char *token;
    char readbuf [256];
    char sprintbuf [_POSIX_PATH_MAX + 1024];
    FILE *fileid;
    WORDTREE *new;
    WORDTREE **this_link;
    _Xstrtokparams strtok_buf;

    if (debugging_loadlang)
        fprintf (aa_stderr, PROGNAME"1071 "
            "load_wordtree: db=%s fname='%s'\n",
            NULLORSTR(dblk->name), NULLORSTR(fname));

    if ((fileid = fopen (fname, "rt")) == NULL) {
        /* Not being able to find the file is not an error.
         * We indicate that with the return code.
         * But any other error (like permissions) is fatal.
         */
        if (errno == ENOENT) {
            if (debugging_loadlang)
                fputs (" ...file not found.\n", aa_stderr);
            return 1;
        }
        else {
            /* fname is caller supplied, so bound the formatting */
            snprintf (sprintbuf, sizeof(sprintbuf),
                CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
                PROGNAME"362", fname, strerror(errno));
            DtSearchAddMessage (sprintbuf);
            return 2;
        }
    }

    /*--------- Main Read Loop ----------*/
    errcount = 0;
    while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
        linecount++;
        /*
         * If the physical line did not fit in readbuf, discard the
         * remainder now. Otherwise the next fgets() would return the
         * tail of this line and it would be parsed as a separate word.
         */
        if (strchr (readbuf, '\n') == NULL) {
            int c;
            while ((c = fgetc (fileid)) != EOF && c != '\n')
                ;
        }
        /*
         * Ignore comment lines beginning with punctuation char.
         * Ignore empty lines (strtok returns NULL, no tokens).
         * Otherwise first or only word on line is the desired word.
         */
        if (strchr (COMMENT_CHARS, readbuf[0]))
            continue;
        if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
            continue;
        dblk->lstrupr (token, dblk);
        if (debugging_loadword)
            fprintf (aa_stderr, " WORD: '%s' ", token);

        /* If requested confirm all chars are teskey-concordable. */
        if (do_teskey_test)
            if (!is_concordable (token, dblk->charmap)) {
                snprintf (sprintbuf, sizeof(sprintbuf),
                    CATGETS(dtsearch_catd, MS_misc, 400,
                    "%s: %s, line %ld: Invalid chars in word '%s'."),
                    PROGNAME"400", fname, linecount, token);
                DtSearchAddMessage (sprintbuf);
                errcount++;
                continue;
            }

        /* Unless we've already detected some errors,
         * allocate a new node and load its data fields.
         */
        if (errcount)
            continue;
        i = strlen (token);
        /* node and word string share one allocation */
        new = austext_malloc (sizeof(WORDTREE) + i + 4,
            PROGNAME"104", NULL);
        new->llink = NULL;
        new->rlink = NULL;
        new->len = i;
        new->word = (void *) (new + 1);
        strcpy (new->word, token);

        /* Descend binary tree and insert in correct alphabetical place */
        is_duplicate = FALSE;
        for (this_link = treetop; *this_link != NULL; ) {
            i = strcmp (new->word, (*this_link)->word);
            /* test for duplicate word */
            if (i == 0) {
                snprintf (sprintbuf, sizeof(sprintbuf),
                    CATGETS(dtsearch_catd, MS_misc, 423,
                    "%s Word '%s' in '%s' is a duplicate."),
                    PROGNAME"423", token, fname);
                DtSearchAddMessage (sprintbuf);
                /* duplicates aren't fatal, just ignore the word */
                is_duplicate = TRUE;
                break; /* no point in continuing descent */
            }
            /* Descend tree to find correct insertion point */
            if (debugging_loadword)
                fputc(((i < 0)? 'L' : 'R'), aa_stderr);
            this_link = (WORDTREE **) ((i < 0) ?
                &(*this_link)->llink : &(*this_link)->rlink);
        } /* end forloop to find tree insertion point */

        /* Don't link anything if error found while descending tree */
        if (is_duplicate) {
            if (debugging_loadword)
                fputs (" duplicate!\n", aa_stderr);
            free (new);
            continue;
        }

        /* Insert new node at current location in tree */
        *this_link = new;
        if (debugging_loadword)
            fputs(" .\n", aa_stderr);
    } /* end of read loop */
    fclose (fileid);

    if (errcount) {
        if (debugging_loadlang)
            fprintf (aa_stderr,
                PROGNAME"1186 load word file '%s' failed.\n", fname);
        return 2;
    }
    else {
        if (debugging_loadlang)
            fprintf (aa_stderr,
                PROGNAME"1193 load word file '%s' successful.\n", fname);
        return 0;
    }
} /* load_wordtree() */
/************************************************/
/* */
/* free_wordtree */
/* */
/************************************************/
/* Formerly free_bintree() in msgutil.c.
 * Frees storage for all nodes in a WORDTREE and
 * sets its top-of-list pointer to NULL.
 * Works only for node structures where all memory
 * was allocated in a single call to malloc(): only the node
 * pointer is passed to free(), and the node's 'word' field is
 * overwritten to serve as the traversal TAG.
 * Uses link inversion traversal (eg, Data Structure Techniques,
 * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
 * at preorder visit, and node is freed at postorder visit.
 * While descending, the link just followed is temporarily pointed
 * back at the parent, so no recursion or explicit stack is used.
 * Does nothing if the tree is already empty.
 */
static void free_wordtree (WORDTREE ** wordtree_head)
{
WORDTREE *next;
WORDTREE *prev = NULL; /* parent on the inverted path; NULL above the root */
WORDTREE *pres = *wordtree_head; /* node currently being visited */
if (*wordtree_head == NULL)
return;
DESCEND_LEFT:
pres->word = (void *) 0; /* preorder visit: TAG = 0 */
next = pres->llink;
if (next != NULL) {
pres->llink = prev; /* invert left link to point at parent */
prev = pres;
pres = next;
goto DESCEND_LEFT;
}
DESCEND_RIGHT:
next = pres->rlink;
if (next != NULL) {
pres->word = (void *) 1; /* TAG = 1 */
pres->rlink = prev; /* invert right link to point at parent */
prev = pres;
pres = next;
goto DESCEND_LEFT;
}
POSTORDER_VISIT:
/* Both subtrees of 'pres' are done (or absent): release the node.
 * Only 'prev', a different node, is examined afterwards.
 */
free (pres);
if (prev == NULL) { /* end of algorithm? */
*wordtree_head = NULL;
return;
}
if (prev->word == (void *) 0) { /* go up left leg */
/* TAG 0: we came from prev's left subtree; its inverted
 * llink holds the grandparent. Its right subtree is still to do.
 */
next = prev->llink;
pres = prev;
prev = next;
goto DESCEND_RIGHT;
}
else { /* go up right leg */
/* TAG 1: we came from prev's right subtree; its inverted
 * rlink holds the grandparent. prev itself is now finished.
 */
next = prev->rlink;
prev->word = (void *) 0; /* restore TAG = 0 */
pres = prev;
prev = next;
goto POSTORDER_VISIT;
}
} /* free_wordtree() */
/************************************************/
/* */
/* load_include_list */
/* */
/************************************************/
/* Builds include list by reading include file
 * into a binary tree structure at dblk->inclist.
 * Unlike stoplists, include-lists are optional.
 * Also unlike stoplists, there are no language default include-lists.
 *
 * If dblk->fname_inc is NULL a file name is generated from the
 * dblk's path, database name, and EXT_INCLIST. If that generated
 * file does not exist, or if there is no database name either,
 * fname_inc is set to "" and the dblk simply has no include list.
 * A missing file is only an error when the caller named it explicitly.
 * If another dblk in 'dblist' already loaded the same file, its tree
 * is shared and LF_DUP_INC is set in dblk->lang_flags.
 * 'dblist' may be NULL.
 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 */
static int load_include_list (DBLK *dblk, DBLK *dblist)
{
    int i;
    int filename_was_null = (dblk->fname_inc == NULL);
    DBLK *db;
    char sprintbuf [_POSIX_PATH_MAX + 512];

    dblk->inclist = NULL; /* just to be sure */
    if (debugging_loadlang)
        fprintf (aa_stderr,
            PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
            NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
            language_name(dblk->dbrec.or_language));

    /* If file name not provided, generate one based on
     * dblk's path, database name, and default extension.
     */
    if (filename_was_null) {
        if (dblk->name[0] == 0) {
            dblk->fname_inc = "";
            dblk->inclist = NULL;
            if (debugging_loadlang)
                fprintf (aa_stderr, PROGNAME"1339 "
                    "No inclist because neither fname nor dbname provided.\n");
            return TRUE;
        }
        if (dblk->path == NULL)
            dblk->path = strdup("");
        dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
            PROGNAME"1187", NULL);
        strcpy (dblk->fname_inc, dblk->path);
        ensure_end_slash (dblk->fname_inc);
        strcat (dblk->fname_inc, dblk->name);
        strcat (dblk->fname_inc, EXT_INCLIST);
    }
    if (debugging_loadlang)
        fprintf (aa_stderr,
            PROGNAME"1350 Include list file name = '%s'.\n",
            dblk->fname_inc);

    /* Don't reload the same file if it's already
     * been loaded into a previous dblk in a list.
     * Code works just fine if dblist == NULL.
     */
    for (db = dblist; db != NULL; db = db->link) {
        if (db == dblk || db->fname_inc == NULL)
            continue;
        if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
            dblk->inclist = db->inclist;
            dblk->lang_flags |= LF_DUP_INC;
            if (debugging_loadlang)
                fprintf (aa_stderr, PROGNAME"1363 "
                    "Using previously loaded inclist, db='%s'.\n",
                    dblk->name);
            return TRUE;
        }
    }

    /* Include list is optional so missing file is
     * not an error unless caller named a specific file.
     */
    i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
    switch (i) {
    case 0:
        return TRUE;
    case 1:
        if (filename_was_null) {
            /* The name buffer was allocated above by this call;
             * release it before replacing it with the literal
             * so it is not leaked.
             */
            free (dblk->fname_inc);
            dblk->fname_inc = "";
            dblk->inclist = NULL;
            return TRUE;
        }
        else {
            /* caller supplied name: bound the formatting */
            snprintf (sprintbuf, sizeof(sprintbuf),
                CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
                PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
            DtSearchAddMessage (sprintbuf);
            return FALSE;
        }
    default:
        return FALSE;
    }
} /* load_include_list() */
/************************************************/
/* */
/* load_stop_list */
/* */
/************************************************/
/* Builds stoplist by reading stoplist file into a
 * binary tree structure at dblk->stoplist. File name can be
 * (1) passed in dblk.fname_stp,
 * (2) generated from dblk path, name, and '.stp',
 * (3) default for dblk path, language, and '.stp'.
 * Case (3) is only tried when the file of case (2) does not exist
 * and the dblk's language number has an entry in lang_fnames[];
 * for any other language number the case (2) name is kept and
 * the missing file is reported like any other missing stoplist.
 * If another dblk in 'dblist' already loaded the same file, its tree
 * is shared and LF_DUP_STP is set in dblk->lang_flags.
 * 'dblist' may be NULL.
 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 */
static int load_stop_list (DBLK *dblk, DBLK *dblist)
{
    int i;
    int langno;
    /* number of real entries in lang_fnames[], excluding NULL terminator */
    const int nlangs =
        (int) (sizeof(lang_fnames) / sizeof(lang_fnames[0])) - 1;
    DBLK *db;
    char sprintbuf [_POSIX_PATH_MAX + 512];
    struct stat statbuf;

    dblk->stoplist = NULL; /* just to be sure */
    if (debugging_loadlang)
        fprintf (aa_stderr,
            PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
            NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
            language_name(dblk->dbrec.or_language));

    /* If file name not provided, generate one based on
     * dblk's path, database name, and default extension.
     * And if that doesn't work, generate one based on
     * dblk's path, language, and default extension.
     */
    if (dblk->fname_stp == NULL) {
        if (dblk->path == NULL)
            dblk->path = strdup("");
        dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
            PROGNAME"919", NULL);
        strcpy (dblk->fname_stp, dblk->path);
        ensure_end_slash (dblk->fname_stp);
        strcat (dblk->fname_stp, dblk->name);
        strcat (dblk->fname_stp, EXT_STOPLIST);
        if (stat (dblk->fname_stp, &statbuf) != 0 && errno == ENOENT) {
            /* Fall back to the language default, but never index
             * lang_fnames[] with a custom or invalid language number.
             */
            langno = dblk->dbrec.or_language;
            if (langno >= 0 && langno < nlangs) {
                strcpy (dblk->fname_stp, dblk->path);
                ensure_end_slash (dblk->fname_stp);
                strcat (dblk->fname_stp, lang_fnames [langno]);
                strcat (dblk->fname_stp, EXT_STOPLIST);
            }
        }
    }
    if (debugging_loadlang)
        fprintf (aa_stderr,
            PROGNAME"1448 Stoplist file name = '%s'.\n",
            dblk->fname_stp);

    /* Don't reload the same file if it's already
     * been loaded into a previous dblk in a list.
     * Code works just fine if dblist == NULL.
     */
    for (db = dblist; db != NULL; db = db->link) {
        if (db == dblk || db->fname_stp == NULL)
            continue;
        if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
            dblk->stoplist = db->stoplist;
            dblk->lang_flags |= LF_DUP_STP;
            if (debugging_loadlang)
                fprintf (aa_stderr, PROGNAME"1460 "
                    "Using previously loaded stoplist, db='%s'.\n",
                    dblk->name);
            return TRUE;
        }
    }

    /* Stop lists are mandatory--a missing stoplist is fatal. */
    i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
    if (i == 1) {
        /* Same catalog message (362) and default text as the
         * other callers in this file; formatting is bounded.
         */
        snprintf (sprintbuf, sizeof(sprintbuf),
            CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
            PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
        DtSearchAddMessage (sprintbuf);
    }
    return (i == 0);
} /* load_stop_list() */
/************************************************/
/* */
/* free_paice_rules */
/* */
/************************************************/
/* Frees all allocated storage for a set of paice rules, typically
 * loaded at dblk.stem_extra. Called by REINIT routines and
 * by load_paice_suffixes() when cleaning up after an error.
 *
 * 'rules_table_ptr' is the address of a pointer to a table of
 * 256 PRULE list heads. For every list, each node's suffix string,
 * optional append string, and the node itself are freed; then the
 * table is freed and the caller's pointer is set to NULL.
 * Does nothing if the caller's pointer is already NULL.
 */
static void free_paice_rules (PRULE ***rules_table_ptr)
{
    int i;
    PRULE *p, *next;
    PRULE **rules_table;

    if (*rules_table_ptr == NULL)
        return;
    rules_table = *rules_table_ptr;

    for (i = 0; i < 256; i++) {
        for (p = rules_table[i]; p != NULL; p = next) {
            /* Fetch the successor BEFORE the node is released;
             * reading p->link after free(p) is a use-after-free.
             */
            next = p->link;
            free (p->suffix);
            free (p->apndstr);	/* may be NULL; free(NULL) is a no-op */
            free (p);
        }
    }
    free (rules_table);
    *rules_table_ptr = NULL;
    return;
} /* free_paice_rules() */
/************************************************/
/*                                              */
/*              load_paice_suffixes             */
/*                                              */
/************************************************/
/* Loads European language paice stemmer suffix rules
 * into dblk.stem_extra as an array of 256 ptrs to linked lists,
 * one list per possible first byte of a (reversed) suffix.
 * Like stop lists, sfx files can be
 * (1) passed in dblk.fname_sfx,
 * (2) generated from dblk path, dbname, and '.sfx',
 * (3) generated from dblk path, language, and '.sfx'.
 * The table of a previous dblk on dblist is reused (and the dblk
 * flagged LF_DUP_SFX) if it was loaded from the same file name.
 * Tokenizes each line in place with _XStrtok().
 * dblk->charmap must already be loaded.
 * Will continue to parse entire file even if errors are found,
 * but no table is kept if any rule was invalid.
 * RETURNS TRUE if no problems, else FALSE with msgs
 * added through DtSearchAddMessage().
 * On open failure dblk->fname_sfx is reset to NULL.
 */
static int load_paice_suffixes (DBLK *dblk, DBLK *dblist)
{
    FILE           *fp;
    DBLK           *db;
    PRULE          *prule, **prule_link;
    PRULE         **rules_table;
    struct stat     statbuf;
    UCHAR          *cptr, *token;
    char            readbuf [_POSIX_PATH_MAX + 1024];
    char            msgbuf [_POSIX_PATH_MAX + 1024];
    UCHAR          *suffix, *apndstr;
    int             must_be_intact, is_last_rule;
    UCHAR           remove_count;
    int             lineno, errcount;
    int             len;
    int             fname_generated = FALSE;
    wchar_t         wc;
    _Xstrtokparams  strtok_buf;

    dblk->stem_extra = NULL;	/* just to be sure */
    rules_table = NULL;

    if (debugging_loadlang)
        fprintf (aa_stderr,
            PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
            NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
            language_name(dblk->dbrec.or_language));

    /* If file name not provided, generate one based on
     * dblk's path, database name, and default extension.
     * And if that doesn't work, generate one based on
     * dblk's path, language, and default extension.
     */
    if (dblk->fname_sfx == NULL) {
        if (dblk->path == NULL)
            dblk->path = strdup("");
        dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
            PROGNAME"1113", NULL);
        fname_generated = TRUE;	/* we own this buffer */
        strcpy (dblk->fname_sfx, dblk->path);
        ensure_end_slash (dblk->fname_sfx);
        strcat (dblk->fname_sfx, dblk->name);
        strcat (dblk->fname_sfx, EXT_SUFFIX);
        /* errno is only meaningful when stat() actually failed */
        errno = 0;
        if (stat (dblk->fname_sfx, &statbuf) == -1 && errno == ENOENT) {
            strcpy (dblk->fname_sfx, dblk->path);
            ensure_end_slash (dblk->fname_sfx);
            strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
            strcat (dblk->fname_sfx, EXT_SUFFIX);
        }
    }
    if (debugging_loadlang)
        fprintf (aa_stderr,
            PROGNAME"1740 Paice suffix file name = '%s'.\n",
            dblk->fname_sfx);

    /* Don't reload the same file if it's already
     * been loaded into a previous dblk in a list,
     * but flag it so it won't be freed at unload_language/REINIT.
     * Code works just fine if dblist == NULL.
     */
    for (db = dblist; db != NULL; db = db->link) {
        if (db == dblk || db->fname_sfx == NULL)
            continue;
        if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
            dblk->stem_extra = db->stem_extra;
            dblk->lang_flags |= LF_DUP_SFX;
            if (debugging_loadlang)
                fprintf (aa_stderr, PROGNAME"1145 "
                    "Using previously loaded suffixes, db='%s'.\n",
                    dblk->name);
            return TRUE;
        }
    }

    fp = fopen (dblk->fname_sfx, "rt");
    if (fp == NULL) {
        snprintf (msgbuf, sizeof(msgbuf),
            CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
            PROGNAME"181", dblk->fname_sfx, strerror(errno));
        DtSearchAddMessage (msgbuf);
        /* Release the name only if it was allocated above;
         * a caller supplied name is not ours to free.
         */
        if (fname_generated)
            free (dblk->fname_sfx);
        dblk->fname_sfx = NULL;
        return FALSE;
    }

    /* Rules table will eventually be loaded at dblk.stem_extra.
     * It consists of 256 PRULE ptrs,
     * one for each possible single byte char.
     * Each ptr is the head of a rules list for that char.
     */
    rules_table = austext_malloc (256 * sizeof(PRULE*),
        PROGNAME"199", &ausapi_msglist);
    memset (rules_table, 0, 256 * sizeof(PRULE*));
    lineno = 0;
    errcount = 0;

    /*------- Main Read Loop -------*/
    while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
        lineno++;

        /* Ignore comment lines */
        if (strchr (COMMENT_CHARS, readbuf[0]))
            continue;

        /* TOKEN #1: suffix string, backwards, all uppercase.
         * If missing, ignore 'empty' line.
         * If the first token is all numeric, ignore line
         * (for compatibility with older versions of file).
         */
        if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
            continue;
        /* Scan stops at the first non-numeral or at the terminator;
         * the terminator itself is never classified.
         */
        for (cptr = suffix; *cptr != '\0'; cptr++) {
            euro_mbtowc (&wc, (char *)cptr, (char *)suffix);
            if ((dblk->charmap[wc] & NUMERAL) == 0)
                break;
        }
        if (*cptr == '\0')
            continue;

        /* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
        if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
BAD_RULE:
            /* Bounded write: the suffix token can be nearly as
             * long as readbuf, which is the same size as msgbuf.
             */
            snprintf (msgbuf, sizeof(msgbuf),
                CATGETS(dtsearch_catd, MS_lang, 51,
                "%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
                PROGNAME"898", dblk->fname_sfx, lineno, suffix);
            DtSearchAddMessage (msgbuf);
            errcount++;
            continue;
        }
        must_be_intact = FALSE;
        if (token[0] == '*') {
            must_be_intact = TRUE;
            /* Read next token... */
            if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
                goto BAD_RULE;
        }

        /* TOKEN #3: remove-count */
        remove_count = (UCHAR) atoi ((char *) token);

        /* OPTIONAL TOKEN #4: if next token is NOT a continue
         * symbol ('>' or '$'), then it's an append string.
         */
        apndstr = NULL;
        if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
            goto BAD_RULE;
        if (token[0] != '$' && token[0] != '>') {
            apndstr = token;
            /* Read next token... */
            if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
                goto BAD_RULE;
        }

        /* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
        is_last_rule = (token[0] == '$');

        if (debugging_loadword) {
            fprintf (aa_stderr,
                " SFX: intact?=%d stop?=%d remv=%d '%s'",
                (int) must_be_intact,
                (int) is_last_rule,
                (int) remove_count,
                suffix);
            if (apndstr)
                fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
            else
                fputc ('\n', aa_stderr);
        }

        /* Good suffix.  If we haven't had any errors yet,
         * add it to rules list for the first char of the suffix.
         */
        if (errcount)
            continue;
        prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
        memset (prule, 0, sizeof(PRULE));
        prule->suffix = (UCHAR *) strdup ((char*)suffix);
        prule->suflen = strlen ((char*)suffix);
        prule->must_be_intact = must_be_intact;
        prule->remove_count = remove_count;
        prule->is_last_rule = is_last_rule;
        if (apndstr) {
            /* aplen is the append string's length in characters;
             * an unconvertible string is silently dropped.
             */
            len = mbstowcs (NULL, (char *)apndstr, 0);
            if (len != -1) {
                prule->apndstr = (UCHAR *) strdup ((char*)apndstr);
                prule->aplen = len;
            }
        }
        /* Append at the tail so rules keep their file order */
        prule_link = &rules_table[suffix[0]];
        while (*prule_link)
            prule_link = &(*prule_link)->link;
        *prule_link = prule;
    } /* end Main Read Loop */

    fclose (fp);
    if (errcount) {
        free_paice_rules (&rules_table);
        return FALSE;
    }
    dblk->stem_extra = rules_table;

    if (debugging_loadlang) {
        fprintf (aa_stderr,
            PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
            dblk->fname_sfx);
        fflush (aa_stderr);
    }
    return TRUE;
} /* load_paice_suffixes() */
/************************************************/
/*                                              */
/*               is_matching_rule               */
/*                                              */
/************************************************/
/* Subroutine of paice_stemmer().
 * Returns TRUE if passed rule can be applied to stem in paicebuf.
 * Else returns FALSE.
 * Reads the file-scope stemmer state (paicebuf, paicelen,
 * paicewcsl, word_is_intact) set up by paice_stemmer();
 * does not modify paicebuf.
 * A rule is applicable when
 * (1) it does not demand an intact word after a previous change,
 * (2) its reversed suffix matches the tail of paicebuf, and
 * (3) the stem left after removing/appending would be long
 *     enough and contain a vowel.
 */
static int is_matching_rule (PRULE *rule)
{
    const char  *ptr;
    wchar_t      wc;
    int          i, j;

    if (debugging_paice)
        fprintf (aa_stderr, " test rule '%s':\t", rule->suffix);

    /* Skip rule if we've made at least one previous change
     * but the current rule requires an intact word.
     */
    if (rule->must_be_intact && !word_is_intact) {
        if (debugging_paice)
            fputs ("word not intact...\n", aa_stderr);
        return FALSE;
    }

    /* Do a backward strcmp on the suffix.
     * Skip rule if it doesn't match current paicebuf's ending chars.
     * A suffix longer than the word can never match, and walking
     * it would read in front of paicebuf.
     * Bytes are compared as unsigned char so that suffix bytes
     * >= 0x80 compare equal to the same bytes held in plain char.
     */
    j = rule->suflen;
    if (j < 0 || j > (int) paicelen) {
        if (debugging_paice)
            fputs ("no match...\n", aa_stderr);
        return FALSE;
    }
    ptr = paicebuf + paicelen - 1;
    for (i = 0; i < j; i++) {
        if ((unsigned char) rule->suffix[i] != (unsigned char) *ptr) {
            if (debugging_paice)
                fputs ("no match...\n", aa_stderr);
            return FALSE;
        }
        ptr--;
    }
    if (debugging_paice)
        fputs ("match", aa_stderr);

    /* Set i = paicebuf length after removing and appending suffixes.
     * Used to algorithmically test remaining stem length
     * after tentative application of rule.
     */
    i = paicewcsl - (rule->remove_count - rule->aplen);
    if (i <= 1) {
        if (debugging_paice)
            fputs (", but stem too short...\n", aa_stderr);
        return FALSE;
    }

    /* A two char stem is valid only if either char is a vowel */
    if (i == 2) {
        euro_mbtowc (&wc, paicebuf, paicebuf);
        if (!IS_VOWEL (wc))
            euro_mbtowc (&wc, paicebuf + 1, paicebuf);
        if (IS_VOWEL (wc)) {
            if (debugging_paice)
                fputs (", and short vowel stem valid.\n", aa_stderr);
            return TRUE;
        }
        if (debugging_paice)
            fputs (", but consonant stem too short...\n", aa_stderr);
        return FALSE;
    }

    /* Remaining stem is at least 3 chars.
     * If it contains a vowel anywhere, it's valid.
     * (A 'Y' after the first char counts as a vowel).
     * Otherwise it's not.
     */
    for (j = 0; j < i; j++) {
        euro_mbtowc (&wc, &paicebuf[j], paicebuf);
        if (IS_VOWEL (wc) || (j > 0 && wc == L'Y')) {
            if (debugging_paice)
                fputs (", and remaining stem valid.\n", aa_stderr);
            return TRUE;
        }
    }
    if (debugging_paice)
        fputs (", but remaining stem all consonants.\n", aa_stderr);
    return FALSE;
} /* is_matching_rule() */
/************************************************/
/*                                              */
/*                 paice_stemmer                */
/*                                              */
/************************************************/
/* Given a word token (ALREADY UPPERCASE) in a single byte
 * language such as the output of teskey_parser,
 * generates 'stem' by repeated suffix removal.
 * Returns stem token in a static buffer valid
 * until next call to paice_stemmer or null_stemmer.
 * Returned stem might be the original unmodified word.
 * Returned stem might also be empty string.
 * Returned stem is *never* NULL, even if wordin == NULL.
 * Input buffer will not be modified; does not use strtok.
 * Works on the shared stemmer state (paicebuf, paicelen,
 * paicewcsl, word_is_intact), so it is not reentrant.
 * Exits via DtSearchExit() if dblk->stem_extra was never loaded.
 */
static char *paice_stemmer (char *wordin, DBLK *dblk)
{
    wchar_t   finalwc;
    int       len, cut;
    PRULE    *rule, **rules_table;

    if (wordin == NULL || *wordin == 0)
        return "";
    if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
        fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 31,
            "%s Stemmer suffixes file never loaded.\n"),
            PROGNAME"310");
        DtSearchExit (2);
    }

    /* The max length of a stem is bufsz - 2:
     * one for the terminating \0 and one for the
     * prefix ^O that identifies a stem.  (But this
     * stemmer doesn't actually insert the ^O now.)
     * If truncating at that point would split a two
     * byte character, cut one byte earlier instead.
     */
    strncpy (paicebuf, wordin, DtSrMAXWIDTH_HWORD);
    if (mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 2], 1) == -1 &&
        mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 3], 2) != -1)
        paicebuf[DtSrMAXWIDTH_HWORD - 3] = 0;
    else
        paicebuf[DtSrMAXWIDTH_HWORD - 2] = 0;
    paice_charmap = dblk->charmap;
    word_is_intact = TRUE;

    for (;;) {	/*-------- Main Stemming Loop ---------*/
        paicelen = strlen (paicebuf);
        paicewcsl = mbstowcs (NULL, paicebuf, 0);

        /* Nothing left to examine; also keeps the
         * 'last char' address below inside paicebuf.
         */
        if (paicelen == 0)
            break;
        len = euro_mbtowc (&finalwc, paicebuf + paicelen - 1, paicebuf);

        /* All other output to aa_stderr is byte oriented, so the
         * wide char is printed through fprintf's %lc conversion
         * rather than by a wide char output function.
         */
        if (debugging_paice) {
            fprintf (aa_stderr,
                "paice: '%s', rules list '%lc' for database '%s'\n",
                paicebuf, (wint_t) finalwc, dblk->name);
            fflush (aa_stderr);
        }

        /* Look for a matching rule.  The rules table has exactly
         * 256 heads; any other value has no rules list.
         */
        rule = NULL;
        if ((unsigned long) finalwc < 256UL)
            rule = rules_table [finalwc];
        if (rule == NULL) {
            if (debugging_paice)
                fputs (" list is null, stop.\n", aa_stderr);
            break;
        }
        while (rule) {
            if (is_matching_rule (rule))
                break;
            rule = rule->link;
        }
        if (rule == NULL) {
            if (debugging_paice)
                fprintf (aa_stderr,
                    " rules list '%lc' is exhausted, stop.\n",
                    (wint_t) finalwc);
            break;
        }

        /* Apply rule that matched */
        if (debugging_paice)
            fputs (" apply rule: ", aa_stderr);
        if (rule->remove_count == 0) {
            if (debugging_paice)
                fputs ("remove_count = 0, stop.\n", aa_stderr);
            break;
        }

        /* Number of bytes to drop.  Never truncate in front of
         * the buffer, whatever the rule file asked for.
         */
        cut = len * (int) rule->remove_count;
        if (cut < 0 || cut > (int) paicelen) {
            if (debugging_paice)
                fputs ("remove length exceeds word, stop.\n", aa_stderr);
            break;
        }
        paicebuf [paicelen - cut] = 0;
        if (rule->aplen)
            strcat (paicebuf, (char*)rule->apndstr);
        paicelen = strlen (paicebuf);
        paicewcsl = mbstowcs (NULL, paicebuf, 0);
        word_is_intact = FALSE;	/* we've removed at least 1 suffix */
        if (debugging_paice)
            fprintf (aa_stderr, "--> '%s'", paicebuf);

        /* Terminate algorithm if rule says so.
         * Otherwise continue removing suffixes
         * from this partially stemmed word.
         */
        if (rule->is_last_rule) {
            if (debugging_paice)
                fputs (", stop flag is set, stop.\n", aa_stderr);
            break;
        }
        if (debugging_paice)
            fputc ('\n', aa_stderr);
    } /* end Main Stemming Loop */

    if (debugging_paice) {
        fprintf (aa_stderr, " final stem: '%s'\n", paicebuf);
        fflush (aa_stderr);
    }
    return paicebuf;
} /* paice_stemmer() */
/************************************************/
/*                                              */
/*                 null_stemmer                 */
/*                                              */
/************************************************/
/* Identity stemmer: the passed word IS its own stem.
 * The word is copied (truncated to fit if necessary) into the
 * same buffer paice_stemmer() returns, so the result is valid
 * only until the next call to null_stemmer or paice_stemmer.
 * A NULL or empty word yields an empty string, never NULL.
 * The dblk argument is not used.
 */
char *null_stemmer (char *word, DBLK *dblk)
{
    if (word == NULL || *word == '\0') {
        return "";
    }

    /* strncpy may leave the buffer unterminated; cap it explicitly */
    strncpy (paicebuf, word, DtSrMAXWIDTH_HWORD);
    paicebuf [DtSrMAXWIDTH_HWORD - 1] = 0;

    return paicebuf;
} /* null_stemmer() */
/************************************************/
/*                                              */
/*                 euro_lstrupr                 */
/*                                              */
/************************************************/
/* Converts passed string to uppercase in place.
 * Classic strupr() function using teskey charmaps:
 * each character is decoded with euro_mbtowc() and replaced
 * by the low byte of its dblk->charmap entry.
 * When euro_mbtowc() reports a length greater than 1 the
 * replacement is re-encoded with wctomb() starting at the
 * previous byte.
 * Returns the passed string ptr.
 */
static char *euro_lstrupr (char *string, DBLK *dblk)
{
    int      *charmap = dblk->charmap;
    char     *s;
    wchar_t   wc;
    int       len;
    int       upper;

    for (s = string; *s; s++) {
        len = euro_mbtowc (&wc, s, string);

        /* Keep the replacement in an int (0..255).  Reading it
         * back from *s would sign extend values >= 0x80 where
         * plain char is signed and hand wctomb() a negative
         * wide character.
         */
        upper = charmap[wc] & 0xFF;
        *s = (char) upper;
        if (len > 1)
            wctomb (s - 1, (wchar_t) upper);
    }
    return string;
}
/************************************************/
/*                                              */
/*                 null_lstrupr                 */
/*                                              */
/************************************************/
/* No-op uppercase converter for languages that do not
 * require case conversion: hands back the passed string
 * untouched.  The dblk argument is ignored.
 */
char *null_lstrupr (char *s, DBLK *d)
{
    (void) d;
    return s;
}
/************************************************/
/*                                              */
/*                 load_language                */
/*                                              */
/************************************************/
/* Fills in a dblk with the structures and function
 * pointers belonging to its language (dblk->dbrec.or_language).
 * The European languages are handled here; Japanese and any
 * other language code are passed on to load_jpn_language()
 * and load_custom_language() respectively.
 * dblist is handed to the list loaders so that structures
 * derived from identical files in other dblks can be shared.
 * Presumes dblk already partially initialized with mandatory fields:
 *	name, path, language.
 * May also be preinitialized with optional fields:
 *	minwordsz, maxwordsz (zero means 'use the default').
 * Returns TRUE if all successful, otherwise FALSE.
 */
int load_language (DBLK *dblk, DBLK *dblist)
{
    int lang_id = dblk->dbrec.or_language;
    int failed;

    if (debugging_loadlang) {
        fprintf (aa_stderr,
            "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
            (int)dblk->dbrec.or_language,
            language_name (dblk->dbrec.or_language),
            NULLORSTR(dblk->name));
    }

    /* Dispatch everything that is not a European language */
    switch (lang_id) {
        case DtSrLaJPN:
        case DtSrLaJPN2:
            return load_jpn_language (dblk, dblist);

        case DtSrLaENG:
        case DtSrLaENG2:
        case DtSrLaESP:
        case DtSrLaFRA:
        case DtSrLaITA:
        case DtSrLaDEU:
            break;

        default:
            /* Try loading a custom 'user' language.
             * If the user failed to provide a loader function,
             * the dummy custom loader will say so.
             * If one was provided but it can't load this language,
             * it should return its own error msgs.
             */
            return load_custom_language (dblk, dblist);
    }

    /* European language: plain English gets the ascii map,
     * every other one the latin map.
     */
    if (lang_id == DtSrLaENG)
        dblk->charmap = ascii_charmap;
    else
        dblk->charmap = latin_charmap;
    dblk->parser = teskey_parser;
    dblk->stemmer = paice_stemmer;
    dblk->lstrupr = euro_lstrupr;

    /* Word size defaults apply only where none was preset */
    if (dblk->dbrec.or_maxwordsz == 0) {
        if (lang_id == DtSrLaDEU)
            dblk->dbrec.or_maxwordsz = MAXWIDTH_LWORD - 1;
        else
            dblk->dbrec.or_maxwordsz = MAXWIDTH_SWORD - 1;
    }
    if (dblk->dbrec.or_minwordsz == 0)
        dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;

    /* The list loaders must run AFTER charmap and lstrupr are set.
     * All three are always attempted, even after a failure.
     */
    failed = FALSE;
    if (!load_stop_list (dblk, dblist))
        failed = TRUE;
    if (!load_include_list (dblk, dblist))
        failed = TRUE;
    if (!load_paice_suffixes (dblk, dblist))
        failed = TRUE;

    return failed ? FALSE : TRUE;
} /* load_language() */
/************************************************/
/*                                              */
/*                unload_language               */
/*                                              */
/************************************************/
/* Releases the structures that load_language() attached to a dblk.
 * Called when engine REINITs due to change in site config file
 * or databases.
 * A structure flagged as a duplicate (LF_DUP_STP, LF_DUP_INC,
 * LF_DUP_SFX) is not freed here; its ptr is just cleared and
 * the flag reset.
 * Japanese and custom languages are delegated to their own unloaders.
 */
void unload_language (DBLK *dblk)
{
    switch (dblk->dbrec.or_language) {

        case DtSrLaJPN:
        case DtSrLaJPN2:
            unload_jpn_language (dblk);
            return;

        case DtSrLaENG:
        case DtSrLaENG2:
        case DtSrLaESP:
        case DtSrLaFRA:
        case DtSrLaITA:
        case DtSrLaDEU:
            break;

        default:
            unload_custom_language (dblk);
            return;
    }

    /* European languages */
    dblk->charmap = NULL;

    /* Stop list */
    if (dblk->lang_flags & LF_DUP_STP) {
        dblk->stoplist = NULL;
        dblk->lang_flags &= ~LF_DUP_STP;
    }
    else {
        free_wordtree (&dblk->stoplist);
    }

    /* Include list */
    if (dblk->lang_flags & LF_DUP_INC) {
        dblk->inclist = NULL;
        dblk->lang_flags &= ~LF_DUP_INC;
    }
    else {
        free_wordtree (&dblk->inclist);
    }

    /* Paice suffix rules */
    if (dblk->lang_flags & LF_DUP_SFX) {
        dblk->stem_extra = NULL;
        dblk->lang_flags &= ~LF_DUP_SFX;
    }
    else {
        free_paice_rules ((PRULE***)&dblk->stem_extra);
    }
    return;
} /* unload_language() */
/******************** LANG.C ********************/