cde/cde/lib/DtSearch/lang.c

/*
 * CDE - Common Desktop Environment
 *
 * Copyright (c) 1993-2012, The Open Group. All rights reserved.
 *
 * These libraries and programs are free software; you can
 * redistribute them and/or modify them under the terms of the GNU
 * Lesser General Public License as published by the Free Software
 * Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * These libraries and programs are distributed in the hope that
 * they will be useful, but WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU Lesser General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with these libraries and programs; if not, write
 * to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
 * Floor, Boston, MA 02110-1301 USA
 */
/*
 *   COMPONENT_NAME: austext
 *
 *   FUNCTIONS: euro_lstrupr
 *		free_wordtree
 *		is_concordable
 *		language_name
 *		load_include_list
 *		load_language
 *		load_paice_suffixes
 *		load_stop_list
 *		load_wordtree
 *		null_lstrupr
 *		null_stemmer
 *		paice_stemmer
 *		search_wordtree
 *		teskey_parser
 *		unload_language
 *
 *   ORIGINS: 27
 *
 *
 *   (C) COPYRIGHT International Business Machines Corp. 1995,1996
 *   All Rights Reserved
 *   Licensed Materials - Property of IBM
 *   US Government Users Restricted Rights - Use, duplication or
 *   disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
 */
/******************** LANG.C ********************
 * $XConsortium: lang.c /main/11 1996/11/25 18:47:29 drk $
 * July 1995.
 * Includes load_language(), unload_language(), and functions and data for
 * parsing and stemming European languages in DtSearch/AusText.
 * Incorporates p/o socrates.c, p/o proctext.c, parser.c
 * delsfx.c, loadchr.c, stop.c, inclist.c, convneg.c, isendwrd.c
 * Related to similar semantic modules repackaged into semantic.c.
 * Paice suffix removal algorithm from C. Paice, 1990,
 * "Another Stemmer", ACM SIGIR Forum, 24(3), 56-61.
 *
 * $Log$
 * Revision 2.13  1996/03/25  18:55:26  miker
 * Changed FILENAME_MAX to _POSIX_PATH_MAX.
 *
 * Revision 2.12  1996/03/25  17:00:19  miker
 * Cleanup compiler warning.
 *
 * Revision 2.11  1996/03/13  22:58:13  miker
 * Changed char to UCHAR several places.
 *
 * Revision 2.10  1996/03/05  16:49:58  miker
 * Move COMMENT_CHARS to SearchP.h.
 *
 * Revision 2.9  1996/03/05  16:31:20  miker
 * Added test of PA_MSGS for yacc-based boolean queries.
 * Made comment chars in linguistic files independent of locale.
 * Changed several char ptrs to unsigned char so parser will
 * work when compiled under default signed char compilers.
 * Simplified several statements with LHS *var++ for same reason.
 *
 * Revision 2.8  1996/02/05  16:16:05  miker
 * Restore prolog.
 *
 * Revision 2.7  1996/02/05  16:10:54  miker
 * load_paice_suffixes: discard .sfx lines beginning with all numeric
 * first token for compatibility with older file formats.
 *
 * Revision 2.6  1996/02/01  19:11:43  miker
 * AusText 2.1.11, DtSearch 0.3:  Major rewrite for new parsers.
 * Moved charmaps to new module langmap.c.  Removed hard coded
 * paice stemmer values--now dynamic from .sfx file.
 *
 * Revision 2.5  1995/10/26  14:55:28  miker
 * Added prolog.
 *
 * Revision 2.4  1995/10/19  20:54:36  miker
 * Increased msg buf sizes to accommodate larger database file names.
 *
 * Revision 2.3  1995/10/06  14:39:45  miker
 * Bug fix: coredump loading multiple databases
 * on Solaris.
 *
 * Revision 2.2  1995/10/03  21:39:10  miker
 * Changed teskey_parser, paice_stemmer, and null_stemmer
 * to return number of words parsed/stemmed, not just boolean.
 *
 * Revision 2.1  1995/09/22  21:00:19  miker
 * Freeze DtSearch 0.1, AusText 2.1.8
 *
 * Revision 1.3  1995/09/19  22:08:28  miker
 * Added support for loading and parsing Japanese language DtSrLaJPN.
 *
 * Revision 1.2  1995/09/05  21:34:52  miker
 * Fixed bug: search engine wouldn't parse words of exactly
 * 3 or 15 chars.
 *
 * Revision 1.1  1995/08/31  21:03:44  miker
 * Initial revision
 */
#include "SearchP.h"

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <wchar.h>
#include <sys/stat.h>

#define X_INCLUDE_STRING_H
#define XOS_USE_NO_LOCKING
#include <X11/Xos_r.h>

#define PROGNAME	"LANG"
#define EXT_SUFFIX	".sfx"	/* standard paice suffix file format */
#define OUTBUFSZ	6140
#define SFX_DELIMS	" \t\n"
#define MS_misc		1
#define MS_lang		15
#define IS_VOWEL(c)	((paice_charmap [(UCHAR)c] & VOWEL) != 0)

/************************************************/
/*						*/
/*		      PRULE			*/
/*						*/
/************************************************/
/* List of Paice suffix removal rules from .sfx files */
typedef struct prule_t {
    struct prule_t   *link;	/* Ptr to next list node */
    UCHAR   *suffix;		/* Applicable suffix string, backwards */
    UCHAR   suflen;		/* Length of suffix */
    char    must_be_intact;	/* Optional '*'.  Rule only applies
				 * to intact words */
    UCHAR   remove_count;	/* Number of suffix chars to remove */
    UCHAR   aplen;		/* Length of apndstr */
    UCHAR   *apndstr;		/* Optional append string */
    char    is_last_rule;	/* '$' terminate or '>' continue algorithm */
    }	PRULE;

char *ensure_end_slash (char *pathstr);
void  unload_jpn_language (DBLK *dblk);

/************************************************/
/*						*/
/*		      GLOBALS			*/
/*						*/
/************************************************/
int		debugging_loadlang =		FALSE;
int		debugging_loadword =		FALSE;
int		debugging_search_wordtree =	FALSE;
int             debugging_teskey =		FALSE;
int             debugging_paice =		FALSE;
static int	*paice_charmap;
static char	paicebuf [DtSrMAXWIDTH_HWORD + 2];
static int	paicelen;
static int	paicewcsl;
static int	word_is_intact;

/* Language strings correspond to DtSrLa.. constants.  */
static char	*lang_fnames[] = {
			"eng",		/*  0 */
			"eng",		/*  1  ('eng2' same files as 'eng') */
			"esp",		/*  2 */
			"fra",		/*  3 */
			"ita",		/*  4 */
			"deu",		/*  5 */
			"jpn",		/*  6 */
			"jpn",		/*  7  ('jpn2' same files as 'jpn' */
			NULL
		};


/************************************************/
/*						*/
/*                language_name			*/
/*						*/
/************************************************/
/* Returns language name string given language number */
static char	*language_name (DtSrINT16 langno)
{
    static char	*language_names[] = {
		"English-ASCII",	/*  0 = DtSrLaENG */
		"English-Latin1",	/*  1 = DtSrLaENG2 */
		"Spanish",		/*  2 = DtSrLaESP */
		"French",		/*  3 = DtSrLaFRA */
		"Italian",		/*  4 = DtSrLaITA */
		"German",		/*  5 = DtSrLaDEU */
		"Japanese-comp",	/*  6 = DtSrLaJPN */
		"Japanese-.knj"		/*  7 = DtSrLaJPN2 */
		};

    if (langno < 0)
	return "INVALID!";
    else if (langno > DtSrLaLAST)
	return "(Custom Language)";
    else
	return language_names [langno];
} /* language_name() */


/************************************************/
/*						*/
/*		 search_wordtree		*/
/*						*/
/************************************************/
/* Sept 1991.
 * Formerly search_inclist() in inclist.c and search_stoplist() in stop.c.
 * Searches a word list in a binary WORDTREE.
 * Passed wordstring is presumed to be a clean,
 * uppercase word token string terminated by \0.
 * Variables are static for speeeeed.
 * Returns TRUE if successful search, else FALSE.
 * See also search_wordtree_jpn() in jpn.c
 */
static int	search_wordtree (WORDTREE *wordtree, char *wordstring)
{
    static int     	direction;
    static WORDTREE	*node;

    if (debugging_search_wordtree)
	fprintf (aa_stderr, PROGNAME"196 search wordtree for '%s':\n",
	    wordstring);
    /* MAIN SEARCH LOOP: binary tree search */
    for (node = wordtree;  node != NULL;  ) {
	if ((direction = strcmp (wordstring, node->word)) == 0) {
	    if (debugging_search_wordtree)
		fprintf (aa_stderr, "  HIT!\n");
	    return TRUE;
	}
	/* Descend left or right depending on word */
	if (debugging_search_wordtree)
	    fprintf (aa_stderr, "  %c '%s'\n",
		(direction < 0) ? 'L' : 'R', (char *) node->word);
	if (direction < 0)
	    node = node->llink;
	else
	    node = node->rlink;
    }
    if (debugging_search_wordtree)
	fprintf (aa_stderr, "  MISS.\n");
    return FALSE;
}  /* search_wordtree() */


static int euro_mbtowc (wchar_t *pwc, const char *p, const char *s)
{
    int len = -1;

    if (p < s) goto done;

    if (*p >= 0 && *p <= 0x7F) {
	len = 1;
	*pwc = *p;
	goto done;
    }

    if (p == s) goto done;

    mbtowc (NULL, NULL, 0); len = mbtowc (pwc, p - 1, 2);

done:
    if (len < 0 || *pwc > 0xFF) *pwc = 0x100;
    return len;
}


static char *euro_wctomb (int c, char *outp, int len)
{
    wchar_t wc = c & 0xFF;
    if (len > 1) wctomb (outp, wc);
    else *outp = wc;
    return outp + len;
}


static int euro_readchar (READCFP cofunction, void *cofunction_arg, char *outp,
    wchar_t *pwc)
{
    int len = 1;

    *pwc = *outp = cofunction (cofunction_arg);

    if (*pwc >= 0 && *pwc <= 0x7F) goto done;

    *(outp + len) = cofunction (NULL);

    mbtowc (NULL, NULL, 0); if (mbtowc (pwc, outp, ++len) >= 0) goto done;

    *pwc = 0x100;

    for (;;) {
	if (len >= MB_CUR_MAX) break;

	*(outp + len) = cofunction (NULL);

	mblen (NULL, 0); if (mblen (outp, ++len) >= 0) break;
    }

done:
    if (*pwc > 0xFF) *pwc = 0x100;
    return len;
}


/************************************************/
/*						*/
/*                 teskey_parser		*/
/*						*/
/************************************************/
/* 1989.
 * Teskey_parser() is derived from the former Socrates() in socrates.c.
 * Returns next teskey-parsed word token from a character stream.
 * Called from (1) dtsrindex, where readchar_ftext() cofunction
 * reads the .fzk file document 'stream', or (2) search engine
 * query parsers, where readchar_string() cofunction 'reads'
 * from the query string.
 * (The word hiliting parser does not directly call teskey_parser; it has
 * its own simplified equivalent to the parsing algorithms herein.)
 *
 * First call passes args in PARG structure.  This resets end of
 * text block (ETX) flag, resets 'offset' counter to zero, etc.
 * Subsequent calls should pass NULL, and parser returns
 * next token in block, until reader cofunction reads ETX,
 * ie special ETX char ('\0').  Subsequent calls to parser
 * return NULL meaning "no tokens left in current stream".
 * Reader cofunctions tolerate repeated calls after
 * the first ETX, still returning '\0'.
 *
 * This parser presumes all incoming text is unformatted.
 * Since parser accesses streams a char at a time it does
 * not require periodic line feeds or anything else.
 *
 * Parser also returns offset information: number of bytes
 * since beginning of text block.
 *
 * Variables are static for speeeeeeed.
 *
 * OUTPUT FORMAT:  NULL or a static C string containing a single
 * parsed word token.  Word buffer reused at next call.
 * Each word is translated as follows:
 * 	All alphas TO UPPERCASE.
 *	Teskey algorithm used to find word boundaries.
 *	Always keeps include-list words.
 * 	Throws away stoplist words, very short words, and very long words.
 *	All intervening nonconcordables discarded.
 *
 * There is a slight mod to the published Teskey algorithm.
 * Words can begin with optionally concordable chars
 * but not end with them.  For example if '-' is optionally
 * concordable, '-foo-' will be parsed into '-foo'.
 */
char	*teskey_parser (PARG *parg)
{
    static READCFP	cofunction;
    static void		*cofunction_arg;
    static DBLK		*dblk =		NULL;
    static char		*outbuf =	NULL;
    static size_t	outbufsz =	0;
    static char		*endmaxword;	/* end largest possible output word */
    static char		*outp;		/* next loc in outbuf */
    static int		*charmap;
    static int		minwordsz, maxwordsz;
    static int		wordlen;
    static enum	{BETW_WORDS, IN_WORD, TOO_LONG}
			tpstate;
    static long		*offsetp, readcount, candidate_offset;
    static int		is_hiliting;
    static int		add_msgs;
    static int		len, opt_len;
    static wchar_t	wc;

    /* If first call for current text block... */
    if (parg) {
	dblk = parg->dblk;
	minwordsz = dblk->dbrec.or_minwordsz;
	maxwordsz = dblk->dbrec.or_maxwordsz;
	charmap = dblk->charmap;
	offsetp = parg->offsetp;
	is_hiliting = (parg->flags & PA_HILITING);
	add_msgs = (parg->flags & PA_MSGS);
	if (charmap == NULL) {
	    fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 4,
		"%s dblk not initialized.\n"),
		PROGNAME"801");
	    DtSearchExit (55);
	}

	if (parg->string) {
	    cofunction_arg = parg->string;
	    cofunction = (READCFP) readchar_string;
	}
	else if (parg->ftext) {
	    cofunction_arg = parg;
	    cofunction = (READCFP) readchar_ftext;
	}
	else {
	    fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 5,
		"%s Program Error: parg contains neither file nor string.\n"),
		PROGNAME"327");
	    DtSearchExit (27);
	}

	if (outbufsz <= maxwordsz) {
	    if (outbuf)
		free (outbuf);
	    outbufsz = maxwordsz + 8;
	    outbuf = austext_malloc (outbufsz + 8, PROGNAME"807", NULL);
	}
	endmaxword = outbuf + maxwordsz;
	if (debugging_teskey)
	    fprintf (aa_stderr,
		"teskey: start of text block, maxwsz=%d outbufsz=%lu\n",
		maxwordsz, (unsigned long) outbufsz);
	readcount = 0L;
    }

    /* CANDIDATE WORD LOOP:  Read text chars into outbuf.
     * Exit loop when outbuf contains one candidate token or at ETX.
     */
READ_ANOTHER_WORD:
    outp = outbuf;
    tpstate = BETW_WORDS;
    for (;;) {
	len = euro_readchar (cofunction, cofunction_arg, outp, &wc);

	if (!wc) break;

	readcount += len;
	cofunction_arg = NULL;

	/*------------- BETW_WORDS State ------------
	 * Reader is between word tokens.
	 */
	if (tpstate == BETW_WORDS) {
	    /*
	     * Discard nonconcordable chars between words.
	     */
	    if ((charmap[wc] & NON_CONCORD) != 0)
		continue;
	    /*
	     * Fully concordable char is definite start of new word.
	     * Convert to uppercase and go get next char.
	     */
	    if ((charmap[wc] & CONCORDABLE) != 0) {
		outp = euro_wctomb (charmap[wc], outp, len);
		candidate_offset = readcount;
		tpstate = IN_WORD;
		continue;
	    }
	    /*
	     * Must be optionally concordable.  It can only
	     * start a new word if next char is concordable.
	     * If so, convert a fully concordable char
	     * to uppercase and go get next char.
	     * Otherwise discard just like non_concord.
	     */
	    outp += len;
	    opt_len = euro_readchar (cofunction, NULL, outp, &wc);

	    if (wc) readcount += opt_len;

	    if ((charmap[wc] & CONCORDABLE) != 0) {
		outp = euro_wctomb (charmap[wc], outp, opt_len);
		candidate_offset = readcount - opt_len;
		tpstate = IN_WORD;
		continue;
	    }
	    else {
		outp -= len;
		continue;
	    }
	} /* endif BETW_WORDS */


	/*------------- IN_WORD State ------------
	 * Reader is in middle of a word.
	 * Convert all concordables to uppercase and append.
	 * Terminate word at first non_concord.
	 * Non_concords treatment depends on next char.
	 */
	else if (tpstate == IN_WORD) {
	    if ((charmap[wc] & CONCORDABLE) != 0) {
		if (outp < endmaxword) {
		    outp = euro_wctomb (charmap[wc], outp, len);
		}
		else {
		    tpstate = TOO_LONG;
		    if (debugging_teskey)
			fprintf (aa_stderr,
				"teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
				candidate_offset-1, outbuf);
		    if (add_msgs) {
			char	msgbuf [DtSrMAXWIDTH_HWORD + 100];
			sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 8,
			    "%s '%.*s...' is larger\n"
			    "than the maximum word size of database '%s'.") ,
			    PROGNAME"449", maxwordsz,
			    parg->string, dblk->label);
			DtSearchAddMessage (msgbuf);
			return NULL;
		    }
		    outbuf[0] = 0;
		    outp = outbuf;
		}
		continue;
	    }
	    if ((charmap[wc] & NON_CONCORD) != 0) {
		*outp = '\0';
		break;
	    }
	    /* Must be opt_concord... */
	    outp += len;
	    opt_len = euro_readchar (cofunction, NULL, outp, &wc);

	    if (wc) readcount += opt_len;

	    if ((charmap[wc] & CONCORDABLE) != 0) {
		if (outp < endmaxword) {
		    outp = euro_wctomb (charmap[wc], outp, opt_len);
		}
		else {
		    tpstate = TOO_LONG;
		    if (debugging_teskey)
			fprintf (aa_stderr,
				"teskey: ofs=%3ld \"%.15s...\", (TOO LONG)\n",
				candidate_offset-1, outbuf);
		    outbuf[0] = 0;
		    outp = outbuf;
		}
		continue;
	    }
	    else {	/* next char NOT concordable...*/
		outp -= len;
		*outp = '\0';
		break;
	    }
	} /* endif IN_WORD */


	/*------------- TOO_LONG State ------------
	 * Reader is in middle of a word that exceeds max word size.
	 * Discard all concordables and opt_concords until we
	 * can get between words again with a clear non_concord.
	 */
	else if (tpstate == TOO_LONG) {
	    if ((charmap[wc] & NON_CONCORD) != 0) {
		outp = outbuf;
		tpstate = BETW_WORDS;
	    }
	    continue;
	}

	/*------------- UNKNOWN State ------------*/
	else {
	    fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 10,
		"%s Program Error: Unknown parser state.\n"),
		PROGNAME"306");
	    DtSearchExit (26);
	}
    } /* end read loop for next CANDIDATE WORD */

    /*---------- TEST FOR ETX -------------*/
    if (outbuf[0] == 0) {
	if (debugging_teskey)
	    fprintf (aa_stderr, "teskey: etx\n");
	if (add_msgs) {
	    char	msgbuf [200];
	    sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 12,
		"%s '%.120s' is not a valid word in database '%s'.") ,
		PROGNAME"506", parg->string, dblk->label);
	    DtSearchAddMessage (msgbuf);
	}
	return NULL;
    }

    wordlen = strlen (outbuf);
    candidate_offset--;	/* token offset is one less than number of reads */
    if (debugging_teskey)
	fprintf (aa_stderr, "teskey: ofs=%3ld \"%s\"",
	    candidate_offset, outbuf);

    if (is_hiliting) {
	if (debugging_teskey)
	    fprintf (aa_stderr, ", (hiliting, skip tree searches)");
	goto GOOD_WORD;
    }

    /*--------- INCLUDE LIST ----------
     * Search before testing for stoplist or minimum word length.
     */
    if (dblk->inclist != NULL) {
	if (search_wordtree (dblk->inclist, outbuf)) {
	    if (debugging_teskey)
		fprintf (aa_stderr, ", (INCLUDE LIST)");
	    goto GOOD_WORD;
        }
    }

    /*--------- TOO SHORT -----------*/
    if (wordlen < minwordsz) {
	if (debugging_teskey)
	    fprintf (aa_stderr, ", (TOO SHORT, min %d)\n", minwordsz);
	if (add_msgs) {
	    char	msgbuf [200];
	    sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 17,
		"%s '%s' is less than the\n"
		"minimum word size of database '%s'.") ,
		PROGNAME"543", parg->string, dblk->label);
	    DtSearchAddMessage (msgbuf);
	    return NULL;
	}
	goto READ_ANOTHER_WORD;
    }

    /*----------- STOP LIST -------------*/
    if (dblk->stoplist != NULL) {
	if (search_wordtree (dblk->stoplist, outbuf)) {
	    if (debugging_teskey)
		fprintf (aa_stderr, ", (STOP LIST)\n");
	    if (add_msgs) {
		char	msgbuf [200];
		sprintf (msgbuf, CATGETS(dtsearch_catd, MS_lang, 19,
		    "%s The word '%s' is not indexed in database '%s'.") ,
		    PROGNAME"558", parg->string, dblk->label);
		DtSearchAddMessage (msgbuf);
		return NULL;
	    }
	    goto READ_ANOTHER_WORD;
	}
    }

GOOD_WORD:
    /* Word is correctly parsed and passes all dblk filters. */
    if (debugging_teskey)
	fprintf (aa_stderr, ", ...good word\n");
    if (offsetp)
	*offsetp = candidate_offset;
    return outbuf;
} /* teskey_parser() */


/************************************************/
/*						*/
/*                 is_concordable		*/
/*						*/
/************************************************/
/* Verifies passed word token is teskey-concordable
 * in code page of passed charmap.  Used in validating
 * word files.  Returns TRUE if all chars concordable
 * or optionally concordable, else returns FALSE.
 */
int	is_concordable (char *word, int *charmap)
{
    char	*cptr;
    wchar_t	wc;
    for (cptr = word;  *cptr != 0;  cptr++) {
	euro_mbtowc (&wc, cptr, word);
	if ((charmap[wc] & NON_CONCORD) != 0)
	    break;
    }
    return (*cptr == 0);
} /* is_concordable() */


/************************************************/
/*						*/
/*                 load_wordtree		*/
/*						*/
/************************************************/
/* Called by load_stop_list(), load_include_list(), etc,
 * to read an appropriate word list file into binary tree structures.
 *
 * INPUT FILE FORMAT:  One word per line, all chars teskey concordable.
 * Preferred order is frequency of occurrence in the corpus
 * to make searches efficient.  Otherwise the words should at least
 * be in random order or an order that will approximate a binary search.
 * If first char is any of COMMENT_CHARS, line is ignored as comments.
 * Ascii spaces, tabs, or newline delimits the first word token--
 * anything else on the line is ignored as comments.
 * Optionally characters in word token will be checked for teskey
 * concordability.
 *
 * RETURNS 0 if file successfully loaded, returns 1 if file missing,
 * returns 2 and messages in global msglist if file has fatal errors.
 */
int	load_wordtree (
		    WORDTREE	**treetop,
		    DBLK	*dblk,
		    char	*fname,
		    int		do_teskey_test)
{
    int		i;
    int		errcount;
    int		is_duplicate;
    long	linecount = 0;
    char	*token;
    char	readbuf [256];
    char	sprintbuf [_POSIX_PATH_MAX + 1024];
    FILE	*fileid;
    WORDTREE	*new;
    WORDTREE	**this_link;
    _Xstrtokparams	strtok_buf;

    if (debugging_loadlang)
	fprintf (aa_stderr, PROGNAME"1071 "
	    "load_wordtree: db=%s fname='%s'\n",
	    NULLORSTR(dblk->name), NULLORSTR(fname));

    if ((fileid = fopen (fname, "rt")) == NULL) {
	/* Not being able to find the file is not an error.
	 * We indicate that with the return code.
	 * But any other error (like permissions) is fatal.
 	 */
	if (errno == ENOENT) {
	    if (debugging_loadlang)
		fputs ("  ...file not found.\n", aa_stderr);
	    return 1;
	}
	else {
	    sprintf (sprintbuf,
		CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
		PROGNAME"362", fname, strerror(errno));
	    DtSearchAddMessage (sprintbuf);
	    return 2;
	}
    }

    /*--------- Main Read Loop ----------*/
    errcount = 0;
    while (fgets (readbuf, sizeof(readbuf), fileid) != NULL) {
	linecount++;
	/*
	 * Ignore comment lines beginning with punctuation char.
	 * Ignore empty lines (strtok returns NULL, no tokens).
	 * Otherwise first or only word on line is the desired word.
	 */
	if (strchr (COMMENT_CHARS, readbuf[0]))
	    continue;
	if ((token = _XStrtok(readbuf, " \t\n", strtok_buf)) == NULL)
	    continue;
	dblk->lstrupr (token, dblk);

	if (debugging_loadword)
	    fprintf (aa_stderr, "  WORD: '%s'  ", token);

	/* If requested confirm all chars are teskey-concordable. */
	if (do_teskey_test)
	    if (!is_concordable (token, dblk->charmap)) {
		sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_misc, 400,
		    "%s: %s, line %ld: Invalid chars in word '%s'."),
		    PROGNAME"400", fname, linecount, token);
		DtSearchAddMessage (sprintbuf);
		errcount++;
		continue;
	    }

	/* Unless we've already detected some errors,
	 * allocate a new node and load its data fields.
	 */
	if (errcount)
	    continue;
	i = strlen (token);
	new = austext_malloc (sizeof(WORDTREE) + i + 4,
	    PROGNAME"104", NULL);
	new->llink = NULL;
	new->rlink = NULL;
	new->len = i;
	new->word = (void *) (new + 1);
	strcpy (new->word, token);

	/* Descend binary tree and insert in correct alphabetical place */
	is_duplicate = FALSE;
	for (this_link = treetop;  *this_link != NULL;  ) {
	    i = strcmp (new->word, (*this_link)->word);

	    /* test for duplicate word */
	    if (i == 0) {
		sprintf (sprintbuf, CATGETS(dtsearch_catd, MS_misc, 423,
		    "%s Word '%s' in '%s' is a duplicate."),
		    PROGNAME"423", token, fname);
		DtSearchAddMessage (sprintbuf);
		/* duplicates aren't fatal, just ignore the word */
		is_duplicate = TRUE;
		break;	/* no point in continuing descent */
	    }

	    /* Descend tree to find correct insertion point */
	    if (debugging_loadword)
		fputc(((i < 0)? 'L' : 'R'), aa_stderr);
	    this_link = (WORDTREE **) ((i < 0) ?
		&(*this_link)->llink : &(*this_link)->rlink);
	} /* end forloop to find tree insertion point */

	/* Don't link anything if error found while descending tree */
	if (is_duplicate) {
	    if (debugging_loadword)
		fputs (" duplicate!\n", aa_stderr);
	    free (new);
	    continue;
	}

	/* Insert new node at current location in tree */
	*this_link = new;
	if (debugging_loadword)
	    fputs(" .\n", aa_stderr);
    }	/* end of read loop */

    fclose (fileid);

    if (errcount) {
	if (debugging_loadlang)
	    fprintf (aa_stderr,
		PROGNAME"1186 load word file '%s' failed.\n", fname);
	return 2;
    }
    else {
	if (debugging_loadlang)
	    fprintf (aa_stderr,
		PROGNAME"1193 load word file '%s' successful.\n", fname);
	return 0;
    }
}  /* load_wordtree() */


/************************************************/
/*						*/
/*		  free_wordtree			*/
/*						*/
/************************************************/
/* Formerly free_bintree() in msgutil.c.
 * Frees storage for all nodes in a WORDTREE and
 * sets its top-of-list pointer to NULL.
 * Works only for node structures where all memory
 * was allocated in a single call to malloc().
 * Uses link inversion traversal (eg, Data Structure Techniques,
 * Thomas A. Standish, Algorithm 3.6) where TAG is initialized
 * at preorder visit, and node is freed at postorder visit.
 */
static void	free_wordtree (WORDTREE ** wordtree_head)
{
    WORDTREE        *next;
    WORDTREE        *prev = NULL;
    WORDTREE        *pres = *wordtree_head;

    if (*wordtree_head == NULL)
	return;

DESCEND_LEFT:
    pres->word = (void *) 0;	/* preorder visit:  TAG = 0 */
    next = pres->llink;
    if (next != NULL) {
	pres->llink = prev;
	prev = pres;
	pres = next;
	goto DESCEND_LEFT;
    }
DESCEND_RIGHT:
     next = pres->rlink;
    if (next != NULL) {
	pres->word = (void *) 1;	/* TAG = 1 */
	pres->rlink = prev;
	prev = pres;
	pres = next;
	goto DESCEND_LEFT;
    }
POSTORDER_VISIT:
    free (pres);
    if (prev == NULL) {	/* end of algorithm? */
	*wordtree_head = NULL;
	return;
    }
    if (prev->word == (void *) 0) {	/* go up left leg */
	next = prev->llink;
	pres = prev;
	prev = next;
	goto DESCEND_RIGHT;
    }
    else {	/* go up right leg */
	next = prev->rlink;
	prev->word = (void *) 0;	/* restore TAG = 0 */
	pres = prev;
	prev = next;
	goto POSTORDER_VISIT;
    }
}  /* free_wordtree() */


/************************************************/
/*						*/
/*              load_include_list		*/
/*						*/
/************************************************/
/* Builds include list by reading include file
 * into a binary tree structure.
 * Unlike stoplists, include-lists are optional.
 * Also unlike stoplists, there are no language default include-lists.
 * 'dblist' may be NULL.
 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 */
static int	load_include_list (DBLK *dblk, DBLK *dblist)
{
    int		i;
    int		filename_was_null = (dblk->fname_inc == NULL);
    DBLK	*db;
    char	sprintbuf [512];

    dblk->inclist = NULL;	/* just to be sure */

    if (debugging_loadlang)
        fprintf (aa_stderr,
	    PROGNAME"1705 Load inclist: db='%s' lang=#%d,%s\n",
	    NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
	    language_name(dblk->dbrec.or_language));

    /* If file name not provided, generate one based on
     * dblk's path, database name, and default extension.
     */
    if (filename_was_null) {
	if (dblk->name[0] == 0) {
	    dblk->fname_inc = "";
	    dblk->inclist = NULL;
	    if (debugging_loadlang)
	        fprintf (aa_stderr, PROGNAME"1339 "
		    "No inclist because neither fname nor dbname provided.\n");
	    return TRUE;
	}
	if (dblk->path == NULL)
	    dblk->path = strdup("");
	dblk->fname_inc = austext_malloc (strlen(dblk->path) + 36,
	     PROGNAME"1187", NULL);
	strcpy (dblk->fname_inc, dblk->path);
	ensure_end_slash (dblk->fname_inc);
	strcat (dblk->fname_inc, dblk->name);
	strcat (dblk->fname_inc, EXT_INCLIST);
    }
    if (debugging_loadlang)
        fprintf (aa_stderr,
	    PROGNAME"1350 Include list file name = '%s'.\n",
	    dblk->fname_inc);

    /* Don't reload the same file if it's already
     * been loaded into a previous dblk in a list.
     * Code works just fine if dblist == NULL.
     */
    for (db = dblist;  db != NULL;  db = db->link) {
	if (db == dblk || db->fname_inc == NULL)
	    continue;
	if (strcmp (db->fname_inc, dblk->fname_inc) == 0) {
	    dblk->inclist = db->inclist;
	    dblk->lang_flags |= LF_DUP_INC;
	    if (debugging_loadlang)
	        fprintf (aa_stderr, PROGNAME"1363 "
		    "Using previously loaded inclist, db='%s'.\n",
		    dblk->name);
	    return TRUE;
	}
    }

    /* Include list is optional so missing file is
     * not an error unless caller named a specific file.
     */
    i = load_wordtree (&dblk->inclist, dblk, dblk->fname_inc, TRUE);
    switch (i) {
	case 0:
	    return TRUE;

	case 1:
	    if (filename_was_null) {
		dblk->fname_inc = "";
		dblk->inclist = NULL;
		return TRUE;
	    }
	    else {
		sprintf (sprintbuf,
		    CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
		    PROGNAME"1218", dblk->fname_inc, strerror(ENOENT));
		DtSearchAddMessage (sprintbuf);
		return FALSE;
	    }

	default:
	    return FALSE;
    }
} /* load_include_list() */


/************************************************/
/*						*/
/*               load_stop_list			*/
/*						*/
/************************************************/
/* Builds stoplist by reading stoplist file into a
 * binary tree structure.  File name can be
 *    (1) passed in dblk.fname_stp,
 *    (2) generated from dblk path, name, and '.stp',
 *    (3) default for dblk path, language, and '.stp'.
 * 'dblist' may be NULL.
 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 */
static int	load_stop_list (DBLK *dblk, DBLK *dblist)
{
    int		i;
    DBLK	*db;
    char	sprintbuf [_POSIX_PATH_MAX + 512];
    struct stat	statbuf;

    dblk->stoplist = NULL;	/* just to be sure */

    if (debugging_loadlang)
        fprintf (aa_stderr,
	    PROGNAME"1700 Load stoplist: db='%s' lang=#%d,%s\n",
	    NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
	    language_name(dblk->dbrec.or_language));

    /* If file name not provided, generate one based on
     * dblk's path, database name, and default extension.
     * And if that doesn't work, generate one based on
     * dblk's path, language, and default extension.
     */
    if (dblk->fname_stp == NULL) {
	if (dblk->path == NULL)
	    dblk->path = strdup("");
	dblk->fname_stp = austext_malloc (strlen(dblk->path) + 36,
	     PROGNAME"919", NULL);

	strcpy (dblk->fname_stp, dblk->path);
	ensure_end_slash (dblk->fname_stp);
	strcat (dblk->fname_stp, dblk->name);
	strcat (dblk->fname_stp, EXT_STOPLIST);
	errno = 0;
	stat (dblk->fname_stp, &statbuf);
	if (errno == ENOENT) {
	    strcpy (dblk->fname_stp, dblk->path);
	    ensure_end_slash (dblk->fname_stp);
	    strcat (dblk->fname_stp, lang_fnames [dblk->dbrec.or_language]);
	    strcat (dblk->fname_stp, EXT_STOPLIST);
	}
    }
    if (debugging_loadlang)
        fprintf (aa_stderr,
	    PROGNAME"1448 Stoplist file name = '%s'.\n",
	    dblk->fname_stp);

    /* Don't reload the same file if it's already
     * been loaded into a previous dblk in a list.
     * Code works just fine if dblist == NULL.
     */
    for (db = dblist;  db != NULL;  db = db->link) {
	if (db == dblk || db->fname_stp == NULL)
	    continue;
	if (strcmp (db->fname_stp, dblk->fname_stp) == 0) {
	    dblk->stoplist = db->stoplist;
	    dblk->lang_flags |= LF_DUP_STP;
	    if (debugging_loadlang)
	        fprintf (aa_stderr, PROGNAME"1460 "
		    "Using previously loaded stoplist, db='%s'.\n",
		    dblk->name);
	    return TRUE;
	}
    }

    /* Stop lists are mandatory--a missing stoplist is fatal. */
    i = load_wordtree (&dblk->stoplist, dblk, dblk->fname_stp, TRUE);
    if (i == 1) {
	sprintf (sprintbuf,
	    CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s"),
	    PROGNAME"1270", dblk->fname_stp, strerror(ENOENT));
	DtSearchAddMessage (sprintbuf);
    }
    return (i == 0);
} /* load_stop_list() */


/************************************************/
/*						*/
/*		free_paice_rules		*/
/*						*/
/************************************************/
/* Frees all allocated storage for a set of paice rules, typically
 * loaded at dblk.stem_extra.  Called by REINIT routines and
 * by load_paice_suffixes() when cleaning up after an error.
 */
static void	free_paice_rules (PRULE ***rules_table_ptr)
{
    int		i;
    PRULE	*p, **linkp;
    PRULE	**rules_table;

    if (*rules_table_ptr == NULL)
	return;
    rules_table = *rules_table_ptr;
    for (i=0; i<256; i++) {
	if (rules_table[i] == NULL)
	    continue;
	p = rules_table[i];
	while (p) {
	    linkp = &p->link;
	    free (p->suffix);
	    if (p->apndstr)
		free (p->apndstr);
	    free (p);
	    p = *linkp;
	}
    }
    free (rules_table);
    *rules_table_ptr = NULL;
    return;
} /* free_paice_rules() */


/************************************************/
/*						*/
/*		load_paice_suffixes		*/
/*						*/
/************************************************/
/* Loads European language paice stemmer suffix rules
 * into dblk.stem_extra as an array of ptrs to linked lists.
 * Like stop lists, sfx files can be
 *    (1) passed in dblk.fname_sfx,
 *    (2) generated from dblk path, dbname, and '.sfx',
 *    (3) generated from dblk path, language, and '.sfx'.
 * Internal tables will be reused if file previously loaded.
 * Only uses single byte character sets (ascii, iso-latin-1).
 * Uses strtok().  dblk->charmap must already be loaded.
 * Will continue to parse entire file even if errors are found.
 * RETURNS TRUE if no problems, else FALSE with msg in ausapi_msglist.
 */
static int      load_paice_suffixes (DBLK *dblk, DBLK *dblist)
{
    FILE	*fp;
    DBLK	*db;
    PRULE	*prule, **prule_link;
    PRULE	**rules_table;
    struct stat	statbuf;
    UCHAR	*cptr, *token;
    char	readbuf [_POSIX_PATH_MAX + 1024];
    char	msgbuf [_POSIX_PATH_MAX + 1024];
    UCHAR	*suffix, *apndstr;
    int		must_be_intact, is_last_rule;
    UCHAR	remove_count;
    int		lineno, errcount;
    int		len;
    wchar_t	wc;
    _Xstrtokparams	strtok_buf;

    dblk->stem_extra = NULL;	/* just to be sure */
    rules_table = NULL;

    if (debugging_loadlang)
        fprintf (aa_stderr,
	    PROGNAME"1715 Load paice suffixes: db='%s' lang=#%d,%s\n",
	    NULLORSTR(dblk->name), (int)dblk->dbrec.or_language,
	    language_name(dblk->dbrec.or_language));

    /* If file name not provided, generate one based on
     * dblk's path, database name, and default extension.
     * And if that doesn't work, generate one based on
     * dblk's path, language, and default extension.
     */
    if (dblk->fname_sfx == NULL) {
	if (dblk->path == NULL)
	    dblk->path = strdup("");
	dblk->fname_sfx = austext_malloc (strlen(dblk->path) + 36,
	     PROGNAME"1113", NULL);

	strcpy (dblk->fname_sfx, dblk->path);
	ensure_end_slash (dblk->fname_sfx);
	strcat (dblk->fname_sfx, dblk->name);
	strcat (dblk->fname_sfx, EXT_SUFFIX);
	errno = 0;
	stat (dblk->fname_sfx, &statbuf);
	if (errno == ENOENT) {
	    strcpy (dblk->fname_sfx, dblk->path);
	    ensure_end_slash (dblk->fname_sfx);
	    strcat (dblk->fname_sfx, lang_fnames [dblk->dbrec.or_language]);
	    strcat (dblk->fname_sfx, EXT_SUFFIX);
	}
    }
    if (debugging_loadlang)
        fprintf (aa_stderr,
	    PROGNAME"1740 Paice suffix file name = '%s'.\n",
	    dblk->fname_sfx);

    /* Don't reload the same file if it's already
     * been loaded into a previous dblk in a list,
     * but flag it so it won't be freed at unload_language/REINIT.
     * Code works just fine if dblist == NULL.
     */
    for (db = dblist;  db != NULL;  db = db->link) {
	if (db == dblk || db->fname_sfx == NULL)
	    continue;
	if (strcmp (db->fname_sfx, dblk->fname_sfx) == 0) {
	    dblk->stem_extra = db->stem_extra;
	    dblk->lang_flags |= LF_DUP_SFX;
	    if (debugging_loadlang)
	        fprintf (aa_stderr, PROGNAME"1145 "
		    "Using previously loaded suffixes, db='%s'.\n",
		    dblk->name);
	    return TRUE;
	}
    }

    fp = fopen (dblk->fname_sfx, "rt");
    if (fp == NULL) {
	sprintf (msgbuf,
	    CATGETS(dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
	    PROGNAME"181", dblk->fname_sfx, strerror(errno));
	DtSearchAddMessage (msgbuf);
	dblk->fname_sfx = NULL;
	return FALSE;
    }

    /* Rules table will eventually be loaded at dblk.stem_extra.
     * It consists of 256 PRULE ptrs,
     * one for each possible single byte char.
     * Each ptr is the head of a rules list for that char.
     */
    rules_table = austext_malloc (256 * sizeof(PRULE*),
	PROGNAME"199", &ausapi_msglist);
    memset (rules_table, 0, 256 * sizeof(PRULE*));
    lineno =	0;
    errcount =	0;

    /*------- Main Read Loop -------*/
    while (fgets (readbuf, sizeof(readbuf), fp) != NULL) {
	lineno++;

	/* Ignore comment lines */
	if (strchr (COMMENT_CHARS, readbuf[0]))
	    continue;

	/* TOKEN #1: suffix string, backwards, all uppercase.
	 * If missing, ignore 'empty' line.
	 * If the first token is all numeric, ignore line
	 * (for compatibility with older versions of file).
	 */
	if ((suffix = (UCHAR *)_XStrtok(readbuf, SFX_DELIMS, strtok_buf)) == NULL)
	    continue;

	for (cptr = suffix;  cptr;  cptr++) {
	    euro_mbtowc (&wc, (char *)cptr, (char *)suffix);
	    if ((dblk->charmap[wc] & NUMERAL) == 0)
		break;
	}
	if (*cptr == '\0')
	    continue;

	/* OPTIONAL TOKEN #2: if next token '*', set 'intact' flag */
	if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL) {
BAD_RULE:
	    sprintf (msgbuf,  CATGETS(dtsearch_catd, MS_lang, 51,
		"%s %s, Line %d: Invalid Paice Rule for suffix '%s'.") ,
		PROGNAME"898", dblk->fname_sfx, lineno, suffix);
	    DtSearchAddMessage (msgbuf);
	    errcount++;
	    continue;
	}
	must_be_intact = FALSE;
	if (token[0] == '*') {
	    must_be_intact = TRUE;
	    /* Read next token... */
	    if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
		goto BAD_RULE;
	}

	/* TOKEN #3: remove-count */
	remove_count = (UCHAR) atoi ((char *) token);

	/* OPTIONAL TOKEN #4: if next token is NOT a continue
         * symbol ('>' or '$'), then it's an append string.
	 */
	apndstr = NULL;
	if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
	    goto BAD_RULE;
	if (token[0] != '$'  &&  token[0] != '>') {
	    apndstr = token;
	    /* Read next token... */
	    if ((token = (UCHAR *)_XStrtok(NULL, SFX_DELIMS, strtok_buf)) == NULL)
		goto BAD_RULE;
	}

	/* TOKEN #5: continue symbol '$' (stop) or '>' (continue) */
	is_last_rule = (token[0] == '$');

	if (debugging_loadword) {
	    fprintf (aa_stderr,
		"  SFX: intact?=%d stop?=%d remv=%d '%s'",
		(int) must_be_intact,
		(int) is_last_rule,
		(int) remove_count,
		suffix);
	    if (apndstr)
		fprintf (aa_stderr, "\tapnd='%s'\n", apndstr);
	    else
		fputc ('\n', aa_stderr);
	}

	/* Good suffix.  If we haven't had any errors yet,
         * add it to rules list for the first char of the suffix.
	 */
	if (errcount)
	    continue;
	prule = austext_malloc (sizeof(PRULE), PROGNAME"1252", NULL);
	memset (prule, 0, sizeof(PRULE));
	prule->suffix =		(UCHAR *) strdup ((char*)suffix);
	prule->suflen =		strlen ((char*)suffix);
	prule->must_be_intact =	must_be_intact;
	prule->remove_count =	remove_count;
	prule->is_last_rule =	is_last_rule;
	if (apndstr) {
	    len = mbstowcs (NULL, (char *)apndstr, 0);
	    if (len != -1) {
		prule->apndstr =	(UCHAR *) strdup ((char*)apndstr);
		prule->aplen =		len;
	    }
	}

	prule_link = &rules_table[suffix[0]];
	while (*prule_link)
	    prule_link = &(*prule_link)->link;
	*prule_link = prule;

    } /* end Main Read Loop */

    fclose (fp);
    if (errcount) {
	free_paice_rules (&rules_table);
	return FALSE;
    }
    dblk->stem_extra = rules_table;

    /* Update last table entry */
    if (debugging_loadlang) {
        fprintf (aa_stderr,
	    PROGNAME"1654 Paice suffix file '%s' loaded ok.\n",
	    dblk->fname_sfx);
	fflush (aa_stderr);
    }
    return TRUE;
}  /* load_paice_suffixes() */


/************************************************/
/*						*/
/*		 is_matching_rule		*/
/*						*/
/************************************************/
/* Subroutine of paice_stemmer().
 * Returns TRUE if passed rule can be applied to stem in paicebuf.
 * Else returns FALSE.
 */
static int	is_matching_rule (PRULE *rule)
{
    static char		*ptr;
    static wchar_t	wc;
    static int		i, j;

    if (debugging_paice)
	fprintf (aa_stderr, "  test rule '%s':\t", rule->suffix);

    /* Skip rule if we've made at least one previous change
     * but the current rule requires an intact word.
     */
    if (rule->must_be_intact  &&  !word_is_intact) {
	if (debugging_paice)
	    fputs ("word not intact...\n", aa_stderr);
	return FALSE;
    }

    /* Do a backward strcmp on the suffix.
     * Skip rule if it doesn't match current paicebuf's ending chars.
     */
    j = rule->suflen;
    ptr = paicebuf + paicelen - 1;
    for (i = 0; i < j; i++) {
	if (*((rule->suffix) + i) != *ptr) {
	    if (debugging_paice)
		fputs ("no match...\n", aa_stderr);
	    return FALSE;
	}
	ptr--;
    }

    if (debugging_paice)
	fputs ("match", aa_stderr);

    /* Set i = paicebuf length after removing and appending suffixes.
     * Used to algorithmically test remaining stem length
     * after tentative application of rule.
     */
    i = paicewcsl - (rule->remove_count - rule->aplen);

    if (i <= 1) {
	if (debugging_paice)
	    fputs (", but stem too short...\n", aa_stderr);
	return FALSE;
    }

    if (i == 2) {
	euro_mbtowc (&wc, paicebuf, paicebuf);

	if (!IS_VOWEL (wc)) euro_mbtowc (&wc, paicebuf + 1, paicebuf);

	if (IS_VOWEL (wc)) {
	    if (debugging_paice)
		fputs (", and short vowel stem valid.\n", aa_stderr);
	    return TRUE;
	}
	else {
	    if (debugging_paice)
		fputs (", but consonant stem too short...\n", aa_stderr);
	    return FALSE;
	}
    }

    /* Remaining stem is at least 3 chars.
     * If it contains a vowel anywhere, it's valid.
     * (A 'Y' after the first char counts as a vowel).
     * Otherwise it's not.
     */
    for (j=0;  j<i;  j++) {
	euro_mbtowc (&wc, &paicebuf[j], paicebuf);

	if (IS_VOWEL (wc)) {
GOOD_STEM:
	    if (debugging_paice)
		fputs (", and remaining stem valid.\n", aa_stderr);
	    return TRUE;
	}
	if (j > 0  && wc == L'Y')
	    goto GOOD_STEM;
    }

    if (debugging_paice)
	fputs (", but remaining stem all consonants.\n", aa_stderr);
    return FALSE;
}  /* is_matching_rule() */


/************************************************/
/*						*/
/*                 paice_stemmer		*/
/*						*/
/************************************************/
/* Given a word token (ALREADY UPPERCASE) in a single byte
 * language such as the output of teskey_parser,
 * generates 'stem' by repeated suffix removal.
 * Returns stem token in a static buffer valid
 * until next call to paice_stemmer or null_stemmer.
 * Returned stem might be the original unmodified word.
 * Returned stem might also be empty string.
 * Returned stem is *never* NULL, even if wordin == NULL.
 * Input buffer will not be modified; does not use strtok.
 * All variables are static for speeeeeeed.
 */
static char	*paice_stemmer (char *wordin, DBLK *dblk)
{
    wchar_t	finalwc;
    int		len;
    PRULE	*rule, **rules_table;

    if (wordin == NULL)
	return "";
    if (*wordin == 0)
	return "";

    if ((rules_table = (PRULE **)dblk->stem_extra) == NULL) {
	fprintf (aa_stderr, CATGETS(dtsearch_catd, MS_lang, 31,
	    "%s Stemmer suffixes file never loaded.\n"),
	    PROGNAME"310");
	DtSearchExit (2);
    }

    /* The max length of a stem is bufsz - 2:
     * one for the terminating \0 and one for the
     * prefix ^O that identifies a stem.  (But this
     * stemmer doesn't actually insert the ^O now.)
     */
    strncpy (paicebuf, wordin, DtSrMAXWIDTH_HWORD);

    if (mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 2], 1) == -1 &&
	mblen (&paicebuf[DtSrMAXWIDTH_HWORD - 3], 2) != -1)
	paicebuf[DtSrMAXWIDTH_HWORD - 3] = 0;
    else paicebuf[DtSrMAXWIDTH_HWORD - 2] = 0;

    paice_charmap =	dblk->charmap;
    word_is_intact = 	TRUE;

    for (;;) { /*-------- Main Stemming Loop ---------*/

	paicelen = strlen (paicebuf);
	paicewcsl = mbstowcs (NULL, paicebuf, 0);
	len = euro_mbtowc (&finalwc, paicebuf + paicelen - 1, paicebuf);

	if (debugging_paice) {
	    fwprintf (aa_stderr,
		L"paice: '%s', rules list '%lc' for database '%s'\n",
		paicebuf, finalwc, dblk->name);
	    fflush (aa_stderr);
	}

	/* Look for a matching rule */
	if ((rule = rules_table [finalwc]) == NULL) {
	    if (debugging_paice)
		fputs ("  list is null, stop.\n", aa_stderr);
	    break;
	}
	while (rule) {
	    if (is_matching_rule (rule))
		break;
	    rule = rule->link;
	}
	if (rule == NULL) {
	    if (debugging_paice)
		fwprintf (aa_stderr,
		    L"  rules list '%lc' is exhausted, stop.\n", finalwc);
	    break;
	}

	/* Apply rule that matched */
	if (debugging_paice)
	    fputs ("    apply rule: ", aa_stderr);
	if (rule->remove_count == 0) {
	    if (debugging_paice)
		fputs ("remove_count = 0, stop.\n", aa_stderr);
	    break;
	}

	paicebuf [paicelen - len * rule->remove_count] = 0;
	if (rule->aplen)
	    strcat (paicebuf, (char*)rule->apndstr);
	paicelen = strlen (paicebuf);
	paicewcsl = mbstowcs (NULL, paicebuf, 0);
	word_is_intact = FALSE;	 /* we've removed at least 1 suffix */
	if (debugging_paice)
	    fprintf (aa_stderr, "--> '%s'", paicebuf);

	/* Terminate algorithm if rule says so.
	 * Otherwise continue removing suffixes
	 * from this partially stemmed word.
	 */
	if (rule->is_last_rule) {
	    if (debugging_paice)
		fputs (", stop flag is set, stop.\n", aa_stderr);
	    break;
	}
	if (debugging_paice)
	    fputc ('\n', aa_stderr);

    } /* end Main Stemming Loop */

    if (debugging_paice) {
	fprintf (aa_stderr, "  final stem: '%s'\n", paicebuf);
	fflush (aa_stderr);
    }
    return paicebuf;
} /* paice_stemmer() */


/************************************************/
/*						*/
/*                 null_stemmer			*/
/*						*/
/************************************************/
/* Stemmer that just copies and returns passed word.
 * In effect, the passed word IS its own stem.
 * Output buffer valid until next call to null_stemmer
 * or paice_stemmer.
 */
char	*null_stemmer (char *word, DBLK *dblk)
{
    if (word == NULL)
	return "";
    if (*word == '\0')
	return "";
    strncpy (paicebuf, word, DtSrMAXWIDTH_HWORD);
    paicebuf [DtSrMAXWIDTH_HWORD-1] = 0;
    return paicebuf;
} /* null_stemmer() */


/************************************************/
/*						*/
/*                 euro_lstrupr			*/
/*						*/
/************************************************/
/* Converts passed string to uppercase in place.
 * Classic strupr() function using teskey charmaps.
 */
static char	*euro_lstrupr (char *string, DBLK *dblk)
{
    static int		*charmap, len;
    static char		*s;
    static wchar_t	wc;
    charmap = dblk->charmap;
    for (s = string;  *s;  s++) {
	len = euro_mbtowc (&wc, s, string);
	*s = charmap[wc] & 0xFF;
	if (len > 1) wctomb (s - 1, *s);
    }
    return string;
}


/************************************************/
/*						*/
/*                 null_lstrupr			*/
/*						*/
/************************************************/
/* Just returns passed string.  Used where uppercase
 * conversions are not required for a language.
 */
char	*null_lstrupr (char *s, DBLK *d)
{ return s; }


/************************************************/
/*						*/
/*                load_language			*/
/*						*/
/************************************************/
/* Loads a dblk with a specific language's
 * structures and function pointers.
 * Does not reload structures previously loaded in
 * other dblks on dblist if derived from identical files.
 * But always loads structures if passed dblist is NULL.
 * Presumes dblk already partially initialized with mandatory fields:
 *	name, path, language.
 * May also be preinitialized with optional fields:
 *	minwordsz, maxwordsz.
 * Returns TRUE if all successful.
 * Otherwise returns FALSE with err msgs on ausapi_msglist.
 */
int	load_language (DBLK *dblk, DBLK *dblist)
{
    int		oops =	FALSE;
    int		language = dblk->dbrec.or_language;

    if (debugging_loadlang)
        fprintf (aa_stderr,
	    "\n"PROGNAME"1920 Loading language #%d, %s, for dblk '%s'.\n",
	    (int)dblk->dbrec.or_language,
	    language_name (dblk->dbrec.or_language),
	    NULLORSTR(dblk->name));

    /*
     * Note: Load list functions must be called
     * AFTER charmap and lstrupr are loaded.
     */
    switch (language) {
	case DtSrLaENG:
	case DtSrLaENG2:
	case DtSrLaESP:
	case DtSrLaFRA:
	case DtSrLaITA:
	case DtSrLaDEU:
	    dblk->charmap =	(language == DtSrLaENG)?
				    ascii_charmap : latin_charmap;
	    dblk->parser =	teskey_parser;
	    dblk->stemmer =	paice_stemmer;
	    dblk->lstrupr =	euro_lstrupr;
	    if (dblk->dbrec.or_maxwordsz == 0)
		dblk->dbrec.or_maxwordsz = (language == DtSrLaDEU)?
		    MAXWIDTH_LWORD - 1 : MAXWIDTH_SWORD - 1;
	    if (dblk->dbrec.or_minwordsz == 0)
		dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
	    oops = FALSE;
	    if (!load_stop_list (dblk, dblist))
		oops = TRUE;
	    if (!load_include_list (dblk, dblist))
		oops = TRUE;
	    if (!load_paice_suffixes (dblk, dblist))
		oops = TRUE;
	    if (oops)
	        return FALSE;
	    break;

	case DtSrLaJPN:
	case DtSrLaJPN2:
	    return load_jpn_language (dblk, dblist);

	default:
	    /* Try loading a custom 'user' language.
	     * If he failed to provide a loader function,
	     * the dummy custom loader will tell him so.
	     * If he provided one but it can't load this language,
	     * it should return it's own error msgs.
	     */
	    return load_custom_language (dblk, dblist);

    } /* end switch (language) */

    return TRUE;
} /* load_language() */


/************************************************/
/*						*/
/*                unload_language		*/
/*						*/
/************************************************/
/* Frees storage for structures allocated by load_language().
 * Called when engine REINITs due to change in site config file
 * or databases.
 * Duplicate wordtrees are not unloaded because they
 * will have already been unloaded in a previous dblk.
 */
void	unload_language (DBLK *dblk)
{
    switch (dblk->dbrec.or_language) {
	case DtSrLaENG:
	case DtSrLaENG2:
	case DtSrLaESP:
	case DtSrLaFRA:
	case DtSrLaITA:
	case DtSrLaDEU:
	    dblk->charmap = NULL;
	    if ((dblk->lang_flags & LF_DUP_STP) == 0)
		free_wordtree (&dblk->stoplist);
	    else {
		dblk->stoplist = NULL;
		dblk->lang_flags &= ~LF_DUP_STP;
	    }
	    if ((dblk->lang_flags & LF_DUP_INC) == 0)
	        free_wordtree (&dblk->inclist);
	    else {
		dblk->inclist = NULL;
		dblk->lang_flags &= ~LF_DUP_INC;
	    }
	    if ((dblk->lang_flags & LF_DUP_SFX) == 0)
	        free_paice_rules ((PRULE***)&dblk->stem_extra);
	    else {
		dblk->stem_extra = NULL;
		dblk->lang_flags &= ~LF_DUP_SFX;
	    }
	    break;

	case DtSrLaJPN:
	case DtSrLaJPN2:
	    unload_jpn_language (dblk);
	    break;

	default:
	    unload_custom_language (dblk);
	    break;
    }
    return;
} /* unload_language() */
/******************** LANG.C ********************/