mirror of
git://git.code.sf.net/p/cdesktopenv/code
synced 2025-03-09 15:50:02 +00:00
1322 lines
40 KiB
C
1322 lines
40 KiB
C
/*
|
|
* CDE - Common Desktop Environment
|
|
*
|
|
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
|
|
*
|
|
* These libraries and programs are free software; you can
|
|
* redistribute them and/or modify them under the terms of the GNU
|
|
* Lesser General Public License as published by the Free Software
|
|
* Foundation; either version 2 of the License, or (at your option)
|
|
* any later version.
|
|
*
|
|
* These libraries and programs are distributed in the hope that
|
|
* they will be useful, but WITHOUT ANY WARRANTY; without even the
|
|
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
* PURPOSE. See the GNU Lesser General Public License for more
|
|
* details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with these librararies and programs; if not, write
|
|
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
|
|
* Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
/*
|
|
* COMPONENT_NAME: austext
|
|
*
|
|
* FUNCTIONS: display_jstate
|
|
* jpn_parser
|
|
* kanji_compounder
|
|
* load_jpn_language
|
|
* load_jpntree
|
|
* parse_substring
|
|
* read_jchar
|
|
* search_kanjitree
|
|
*
|
|
* ORIGINS: 27
|
|
*
|
|
*
|
|
* (C) COPYRIGHT International Business Machines Corp. 1995,1996
|
|
* All Rights Reserved
|
|
* Licensed Materials - Property of IBM
|
|
* US Government Users Restricted Rights - Use, duplication or
|
|
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
|
|
*/
|
|
/******************** JPN.C ********************
|
|
* $TOG: jpn.c /main/7 1999/10/14 14:11:33 mgreess $
|
|
* September 1995.
|
|
* Includes functions and data for parsing Japanese,
|
|
* supported languages DtSrLaJPN and DtSrLaJPN2.
|
|
* Currently only supports EUC packed format,
|
|
* but should be easily extendable to Shift-JIS.
|
|
* JIS can be supported if half-width katakana are excluded
|
|
* (no SI or SO chars to conflict with the ^O stemming char,
|
|
* and engine must decide never to balk at ESCape sequences).
|
|
* Will not support Unicode or other fixed width, n-wide
|
|
* encodings that would conflict with ascii in either byte.
|
|
* Does not require wide char or multibyte char functions.
|
|
* There is no Japanese stemmer(), ie standard null_stemmer() is used.
|
|
*
|
|
* Code Set 0 can be either 7-bit ASCII or 7-bit JIS-Roman.
|
|
* The parser() for ASCII is the full teskey_parser()
|
|
* used for European languages with an ascii char set.
|
|
* Min/max word size, stoplists, and include lists may be
|
|
* used if provided, as in European languages.
|
|
*
|
|
* Code Set 1 is JIS X 0208-1990.
|
|
* Symbols and line drawing elements are not indexed.
|
|
* Hiragana strings are discarded as equivalent to stoplist words.
|
|
* Contiguous strings of katakana, Roman, Greek, or cyrillic
|
|
* are parsed as single words.
|
|
*
|
|
* Individual kanji chars are parsed as single words.
|
|
* In addition, for language DtSrLaJPN, all kanji compounds
|
|
* (pairs, triplets, etc) found in any contiguous string of
|
|
* kanjis will be parsed up to a maximum word size
|
|
* defined in MAX_KANJI_CLEN (see caveat below).
|
|
* For language DtSrLaJPN2, only kanji substrings listed
|
|
* in a .knj file are parsed as additional compound words.
|
|
* Characters from unassigned kuten rows are presumed to be
|
|
* user-defined kanji and are parsed as such.
|
|
*
|
|
* Code Set 2 is 1/2 width katakana.
|
|
* Contiguous strings are parsed as single words.
|
|
*
|
|
* Code Set 3 is JIS X 0212-1990.
|
|
* Parsing is similar to Code Set 1: discard symbols, etc,
|
|
* contiguous strings of related foreign characters equal words,
|
|
* and individual kanji and unassigned chars equal single words,
|
|
* with additional kanji compounding depending on language.
|
|
* Row 5 has 4 new katakana (not yet officially approved)
|
|
* so it is treated here as katakana.
|
|
*
|
|
* $Log$
|
|
* Revision 2.8 1996/04/10 20:24:33 miker
|
|
* Fixed bug in kanji tree loader.
|
|
*
|
|
* Revision 2.7 1996/03/25 18:55:15 miker
|
|
* Changed FILENAME_MAX to _POSIX_PATH_MAX.
|
|
*
|
|
* Revision 2.6 1996/03/13 22:57:40 miker
|
|
* Added prolog. Changed char to UCHAR several places.
|
|
*
|
|
* Revision 2.5 1996/03/05 16:09:58 miker
|
|
* Made jchar array of unsigned chars for compat with Sun compilers.
|
|
* Added test of PA_MSGS for yacc-based boolean queries.
|
|
*
|
|
* Revision 2.4 1996/02/01 19:08:10 miker
|
|
* AusText 2.1.11, DtSearch 0.3: Major rewrite for new parsers.
|
|
* Made optional power series kanji compounding (KANJI_COMPOUNDS)
|
|
* into a new DtSrLaJPN language. Old version now DtSrLaJPN2.
|
|
*
|
|
* Revision 2.3 1995/12/01 16:20:17 miker
|
|
* Changed read_jchar arg to unsigned to fix Solaris bug.
|
|
*
|
|
* Revision 2.2 1995/10/26 15:08:31 miker
|
|
* Added prolog.
|
|
*
|
|
* Revision 2.1 1995/09/22 20:57:13 miker
|
|
* Freeze DtSearch 0.1, AusText 2.1.8
|
|
*
|
|
* Revision 1.1 1995/09/19 21:24:57 miker
|
|
* Initial revision
|
|
*/
|
|
#include "SearchP.h"
|
|
#include <limits.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <errno.h>
|
|
#include <sys/stat.h>
|
|
|
|
#define PROGNAME "JPN"
|
|
#define SS2_CHAR 0x8E /* Single Shift char for Code Set 2 */
|
|
#define SS3_CHAR 0x8F /* Single Shift char for Code Set 3 */
|
|
#define EXT_KATAKANA ".ktk"
|
|
#define EXT_KANJI ".knj"
|
|
#define SUBSTRBUFSZ 100
|
|
#define MS_misc 1
|
|
#define MS_lang 15
|
|
|
|
/* In addition to single kanji chars parsed as individual words,
|
|
* Language DtSrLaJPN will also blindly consider all contiguous kanji
|
|
* substrings up to MAX_KANJI_CLEN as separate compound words.
|
|
* For example if MAX_KANJI_CLEN were 3, the 4 kanjis "ABCD"
|
|
* would parse as "A B C D AB BC CD ABC BCD".
|
|
* The number of parsed words = the number of
|
|
* ordered permutations of n things taken r! times!
|
|
* This can be very wasteful of indexing time and file space.
|
|
* The alternative is language DtSrLaJPN2 which only considers
|
|
* strings listed in jpn.knj as valid kanji compounds.
|
|
* The kanji compounds in jpn.knj are the statistically significant
|
|
* kanji substrings found in a large corpus of natural language Japanese.
|
|
*/
|
|
#define MAX_KANJI_CLEN 6
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* JSTATE */
|
|
/* */
|
|
/************************************************/
|
|
/* EUC text substring types.
 * Used to switch states in parser's automaton.
 * Each state occupies exactly one bit so that membership in a
 * set of states can be tested with a single mask
 * (see JS_ALPHA_COMPATIBLE below).
 * JS_ETX was formerly 0x0300, which is not a bit position at all
 * but the union of JS_HALFKATA and JS_DISCARD; it now has
 * a bit of its own.
 */
#define JS_STX		0x0001	/* Start of text blk, initial state */
#define JS_KANJI	0x0002	/* Set 1, Set 3 */
#define JS_KATAKANA	0x0004	/* Set 1 */
#define JS_ASCII	0x0008	/* Set 0 */
#define JS_ROMAN	0x0010	/* Set 1 */
#define JS_GREEK	0x0020	/* Set 1, Set 3 */
#define JS_CYRILLIC	0x0040	/* Set 1 */
#define JS_ALPHA	0x0080	/* Set 3 */
#define JS_HALFKATA	0x0100	/* Set 2 */
#define JS_DISCARD	0x0200	/* Set 1, Set 3, any char not in EUC */
#define JS_ETX		0x0400	/* End of text block */

/* States that may be merged with a run of JS_ALPHA chars */
#define JS_ALPHA_COMPATIBLE	(JS_ROMAN | JS_GREEK | JS_CYRILLIC)
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* JPNTREE */
|
|
/* */
|
|
/************************************************/
|
|
/* Node of a compound word tree; similar to the standard
 * binary WORDTREE.
 * The binary tree (rlink/llink) is keyed on the first 4 bytes
 * of a word (usually 2 jchars), which is also the minimum
 * compound word size.  Every compound that begins with
 * the same 4 bytes hangs off the tree node in a singly
 * linked list chained through 'next'.
 */
typedef struct _jpntree_tag {
    struct _jpntree_tag	*rlink;	/* right child in binary tree */
    struct _jpntree_tag	*llink;	/* left child in binary tree */
    struct _jpntree_tag	*next;	/* next compound sharing this key */
    int			len;	/* length of word in bytes */
    void		*word;	/* the word; search_kanjitree()
				 * reads it as a C string */
} JPNTREE;
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* JPNBLK */
|
|
/* */
|
|
/************************************************/
|
|
typedef struct {
|
|
JPNTREE *katatree;
|
|
JPNTREE *kanjitree;
|
|
} JPNBLK;
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* GLOBALS */
|
|
/* */
|
|
/************************************************/
|
|
int debugging_jpn = FALSE;
|
|
extern int debugging_loadlang;
|
|
extern int debugging_loadword;
|
|
|
|
/* Used in jpn_parser() and parse_substr(). Made global for speed. */
|
|
static int do_compounding = FALSE;
|
|
static int is_new_substring = TRUE;
|
|
static int jstate, last_jstate;
|
|
static UCHAR jchar [8];
|
|
static int jcharlen = 0;
|
|
static DBLK *jpn_dblk;
|
|
static JPNTREE *jpn_kanjitree = NULL;
|
|
static JPNTREE *jpn_katatree = NULL;
|
|
static JPNTREE *kanjitree = NULL;
|
|
static int language;
|
|
static long *offsetp;
|
|
static long readcount = 0;
|
|
static READCFP readchar;
|
|
static void *readchar_arg;
|
|
static UCHAR *outbuf = NULL;
|
|
static UCHAR *save_parg_string = NULL;
|
|
static UCHAR *substrbuf = NULL;
|
|
static long substr_offset;
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* display_jstate */
|
|
/* */
|
|
/************************************************/
|
|
/* for debugging and error msgs */
|
|
static char *display_jstate (int js)
|
|
{
|
|
switch (js) {
|
|
case JS_KANJI: return "KANJI";
|
|
case JS_KATAKANA: return "KATAKANA";
|
|
case JS_DISCARD: return "DISCARD";
|
|
case JS_ROMAN: return "ROMAN";
|
|
case JS_ASCII: return "ASCII";
|
|
case JS_ALPHA: return "ALPHA";
|
|
case JS_ETX: return "ETX";
|
|
case JS_STX: return "STX";
|
|
case JS_GREEK: return "GREEK";
|
|
case JS_CYRILLIC: return "CYRILLIC";
|
|
case JS_HALFKATA: return "HALFKATA";
|
|
default: return "(UNKNOWN)";
|
|
}
|
|
} /* display_jstate() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* read_jchar */
|
|
/* */
|
|
/************************************************/
|
|
/* Subroutine of jpn_parser().
|
|
* Using global character reading 'readchar' cofunction,
|
|
* returns (1) next multibyte Japanese character in global jchar,
|
|
* (2) length of jchar in global jcharlen, and
|
|
* (3) next state of state machine in global jstate.
|
|
* Function itself returns jstate.
|
|
* Rows in the KUTEN tables which are officially 'unassigned'
|
|
* are treated as user-defined kanji, so all jstates
|
|
* are presumed JS_KANJI except those specifically marked
|
|
* otherwise at the beginning of each array below.
|
|
*/
|
|
static int read_jchar (void)
|
|
{
|
|
/* Jstates table for EUC Set 1 (JIS 0208) */
|
|
static int jstates_set1 [] = {
|
|
JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
|
|
JS_ROMAN, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
|
|
JS_GREEK, JS_CYRILLIC, JS_DISCARD /* 6 - 8 */
|
|
};
|
|
|
|
/* Jstates table for EUC Set 3 (JIS 0212).
|
|
* Row 5 is presumed to be katakana because
|
|
* of four new unapproved katakana characters.
|
|
*/
|
|
static int jstates_set3 [] = {
|
|
JS_DISCARD, JS_DISCARD, JS_DISCARD, /* 0 - 2 */
|
|
JS_DISCARD, JS_DISCARD, JS_KATAKANA, /* 3 - 5 */
|
|
JS_GREEK, JS_CYRILLIC, JS_DISCARD, /* 6 - 8 */
|
|
JS_ALPHA, JS_ALPHA, JS_ALPHA /* 9 - 11 */
|
|
};
|
|
|
|
if (readchar_arg) {
|
|
jchar[0] = readchar (readchar_arg);
|
|
readchar_arg = NULL;
|
|
}
|
|
else
|
|
jchar[0] = readchar (NULL);
|
|
if (jchar[0] == 0)
|
|
return (jstate = JS_ETX);
|
|
readcount++;
|
|
|
|
/* Set 1 (JIS 0208) */
|
|
if (jchar[0] >= 0xA1 && jchar[0] <= 0xFE) {
|
|
jcharlen = 2;
|
|
if (jchar[0] > 0xA8)
|
|
jstate = JS_KANJI;
|
|
else
|
|
jstate = jstates_set1 [(jchar[0] & 0x7F) - 32];
|
|
if (jchar[1] = readchar (NULL))
|
|
readcount++;
|
|
else
|
|
jstate = JS_ETX;
|
|
return jstate;
|
|
}
|
|
|
|
/* Set 0 (ASCII) */
|
|
if (jchar[0] < 0x80) {
|
|
jcharlen = 1;
|
|
return (jstate = JS_ASCII);
|
|
}
|
|
|
|
/* Set 3 (JIS 0212) */
|
|
if (jchar[0] == SS3_CHAR) {
|
|
jcharlen = 3;
|
|
/*
|
|
* Hop over the single shift char to get the first JIS byte.
|
|
* Make sure first JIS byte is in proper
|
|
* range to avoid indexing outside of table.
|
|
*/
|
|
if ((jchar[1] = readchar (NULL)) == 0)
|
|
return (jstate = JS_ETX);
|
|
readcount++;
|
|
if (jchar[1] < 0xA1)
|
|
return (jstate = JS_DISCARD);
|
|
if (jchar[1] > 0xAA)
|
|
jstate = JS_KANJI;
|
|
else
|
|
jstate = jstates_set3 [(*jchar & 0x7F) - 32];
|
|
|
|
if ((jchar[2] = readchar (NULL)) == 0)
|
|
return (jstate = JS_ETX);
|
|
readcount++;
|
|
/* JS_ALPHA chars ('miscellaneous alphabetic chars' of
|
|
* rows 9 - 11) are compatible with several other jstates,
|
|
* so adjust as necessary.
|
|
*/
|
|
if (jstate == JS_ALPHA &&
|
|
((last_jstate & JS_ALPHA_COMPATIBLE) != 0))
|
|
jstate = last_jstate;
|
|
else if (last_jstate == JS_ALPHA &&
|
|
((jstate & JS_ALPHA_COMPATIBLE) != 0))
|
|
last_jstate = jstate;
|
|
return jstate;
|
|
}
|
|
|
|
/* Set 2 (half-width katakana) */
|
|
if (jchar[0] == SS2_CHAR) {
|
|
jcharlen = 2;
|
|
jstate = JS_HALFKATA;
|
|
if (jchar[1] = readchar (NULL))
|
|
readcount++;
|
|
else
|
|
jstate = JS_ETX;
|
|
return jstate;
|
|
}
|
|
|
|
/* If first jchar doesn't match expected EUC coding,
|
|
* discard it until we get back into sync.
|
|
*/
|
|
jcharlen = 1;
|
|
return (jstate = JS_DISCARD);
|
|
} /* read_jchar() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* kanji_compounder */
|
|
/* */
|
|
/************************************************/
|
|
/* Subroutine of parse_substring() of jpn_parser().
|
|
* Used only for language DtSrLaJPN (power series compounding).
|
|
* Called repeatedly when the substring is a sequence of kanji chars.
|
|
* For each call writes to outbuf and returns a single kanji
|
|
* compound word, using every possible compound in the substring
|
|
* from length 1 to length MAX_KANJI_CLEN.
|
|
* Updates offsetp for each word returned.
|
|
* Returns NULL when substring exhausted. First call for
|
|
* a new substring indicated by global is_new_substring.
|
|
*/
|
|
|
|
static UCHAR *kanji_compounder (void)
|
|
{
|
|
static int all_done = TRUE;
|
|
static int clen = MAX_KANJI_CLEN + 1;
|
|
static UCHAR *mysubstrp = NULL;
|
|
static UCHAR *mysubstrend = NULL;
|
|
static UCHAR *op, *ss;
|
|
static int i;
|
|
|
|
if (is_new_substring) {
|
|
is_new_substring = FALSE;
|
|
all_done = FALSE;
|
|
clen = 1;
|
|
mysubstrp = substrbuf;
|
|
mysubstrend = substrbuf + strlen ((char*)substrbuf);
|
|
}
|
|
|
|
/* Advance compound length by 1.
|
|
* If max compound length exceeded, reset it
|
|
* to 1 and increment substring pointer by 1 jchar.
|
|
*/
|
|
else {
|
|
if (all_done)
|
|
return NULL;
|
|
if (++clen > MAX_KANJI_CLEN) {
|
|
clen = 1;
|
|
mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
|
|
}
|
|
}
|
|
|
|
/* Assemble one word into outbuf, of length clen,
|
|
* beginning at current substring ptr.
|
|
* If there aren't enough jchars left in string,
|
|
* reset clen to 1 and advance substrp by 1 jchar.
|
|
* We're all done when substring exhausted.
|
|
*/
|
|
while (mysubstrp < mysubstrend) {
|
|
op = outbuf;
|
|
ss = mysubstrp;
|
|
for (i = 0; i < clen; i++) {
|
|
/* Are there enough jchars left in substring? */
|
|
if (ss >= mysubstrend) {
|
|
clen = 1;
|
|
mysubstrp += (*mysubstrp == SS3_CHAR)? 3 : 2;
|
|
i = 0; /* indicates assembly failure */
|
|
break; /* breaks the for loop */
|
|
}
|
|
/* Assemble one jchar into outbuf */
|
|
if (*ss == SS3_CHAR)
|
|
*op++ = *ss++;
|
|
*op++ = *ss++;
|
|
*op++ = *ss++;
|
|
}
|
|
/* Did word assembly succeed? */
|
|
if (i >= clen) {
|
|
*op = 0;
|
|
if (offsetp)
|
|
*offsetp = substr_offset + (mysubstrp - substrbuf);
|
|
if (debugging_jpn)
|
|
fprintf (aa_stderr,
|
|
"knjcompdr: subofs=%2ld totofs=%3ld \"%s\"\n",
|
|
mysubstrp - substrbuf, *offsetp, outbuf);
|
|
return outbuf;
|
|
}
|
|
}
|
|
|
|
all_done = TRUE;
|
|
return NULL;
|
|
} /* kanji_compounder() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* search_kanjitree */
|
|
/* */
|
|
/************************************************/
|
|
/* Subroutine of parse_substring() of jpn_parser().
|
|
* Used only for language DtSrLaJPN2; DtSrLaJPN calls
|
|
* kanji_compounder() to generate compounds algorithmically.
|
|
* First call for a new substring of kanjis is indicated
|
|
* when is_new_substring is TRUE. Each call, then and thereafter,
|
|
* returns a token (1) for each individual kanji char in string,
|
|
* and (2) for each sequence of kanjis found in the kanji
|
|
* compounds JPNTREE which begins with each char in string.
|
|
* Also returns offset of returned token in offsetp.
|
|
* Returns NULL when string is exhausted.
|
|
* Variables are static for speeeeed.
|
|
*/
|
|
static UCHAR *search_kanjitree (void)
|
|
{
|
|
static int all_done = TRUE;
|
|
static JPNTREE *node, *last_node;
|
|
static UCHAR *substrp, *substrend;
|
|
static int direction;
|
|
static int nodelen;
|
|
static int jcharlen;
|
|
|
|
if (is_new_substring) {
|
|
is_new_substring = FALSE;
|
|
all_done = FALSE;
|
|
substrend = substrbuf + strlen ((char*)substrbuf);
|
|
substrp = substrbuf;
|
|
|
|
/* Return first substr jchar as next token */
|
|
last_node = NULL; /* NULL = tree not searched yet */
|
|
jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
|
|
strncpy ((char*)outbuf, (char*)substrp, jcharlen);
|
|
outbuf [jcharlen] = 0;
|
|
if (offsetp)
|
|
*offsetp = substr_offset;
|
|
return outbuf;
|
|
}
|
|
else if (all_done)
|
|
return NULL;
|
|
|
|
/* If not enough chars left in substring to search tree,
|
|
* treat it as an exhausted tree search. In other words,
|
|
* reset tree search, increment to next jchar, and return it.
|
|
*/
|
|
if (strlen ((char*)substrp) < 4) {
|
|
if (debugging_jpn)
|
|
fputs ("knjtree: ...remaining substring too short", aa_stderr);
|
|
EXHAUSTED_TREE:
|
|
if (debugging_jpn)
|
|
fputs (".\n", aa_stderr);
|
|
last_node = NULL;
|
|
substrp += jcharlen;
|
|
if (substrp >= substrend) {
|
|
all_done = TRUE;
|
|
return NULL;
|
|
}
|
|
jcharlen = (*substrp == SS3_CHAR)? 3 : 2;
|
|
strncpy ((char*)outbuf, (char*)substrp, jcharlen);
|
|
outbuf [jcharlen] = 0;
|
|
if (offsetp)
|
|
*offsetp = substr_offset + (substrp - substrbuf);
|
|
return outbuf;
|
|
}
|
|
|
|
/* If last call resulted in a tree hit, the node was saved.
|
|
* Continue the linked list search directly from the last hit.
|
|
*/
|
|
if (last_node) {
|
|
last_node = last_node->next;
|
|
if (debugging_jpn)
|
|
fputs ("knjtree: ...continue tree search: ", aa_stderr);
|
|
LINKED_LIST_SEARCH:
|
|
for (node = last_node; node; node = node->next) {
|
|
if ((strncmp ((char*)substrp, node->word, node->len)) == 0) {
|
|
/* HIT on linked list search */
|
|
last_node = node;
|
|
strcpy ((char*)outbuf, node->word);
|
|
if (debugging_jpn)
|
|
fprintf (aa_stderr, "* '%s'\n", outbuf);
|
|
if (offsetp)
|
|
*offsetp = substr_offset + (substrp - substrbuf);
|
|
return outbuf;
|
|
}
|
|
else if (debugging_jpn)
|
|
fputc ('-', aa_stderr);
|
|
}
|
|
goto EXHAUSTED_TREE;
|
|
}
|
|
|
|
/* Start new binary tree search at curr jchar.
|
|
* If hit, commence linked list search.
|
|
*/
|
|
if (debugging_jpn)
|
|
fprintf (aa_stderr,
|
|
"knjtree: \"%.4s...\" ", substrp);
|
|
for (node = kanjitree; node != NULL; ) {
|
|
if ((direction = strncmp ((char*)substrp, node->word, 4)) == 0) {
|
|
/* HIT on binary search */
|
|
last_node = node;
|
|
goto LINKED_LIST_SEARCH;
|
|
}
|
|
/* Descend left or right depending on word */
|
|
if (debugging_jpn)
|
|
fputc ((direction < 0) ? 'L' : 'R', aa_stderr);
|
|
if (direction < 0)
|
|
node = node->llink;
|
|
else
|
|
node = node->rlink;
|
|
}
|
|
|
|
/* No match on first 4 bytes of substrp in binary tree.
|
|
* Tree exhausted without a hit, so increment to next
|
|
* jchar in substring and return it as a word.
|
|
*/
|
|
goto EXHAUSTED_TREE;
|
|
} /* search_kanjitree() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* parse_substring */
|
|
/* */
|
|
/************************************************/
|
|
/* Subroutine of jpn_parser().
|
|
* Returns next Japanese multibyte word token from current
|
|
* substring of jchars, or NULL when out of tokens.
|
|
* Returned token is valid until next call.
|
|
* Static args initialized at first call for a new substring.
|
|
* Provides optional kanji compounding depending on PA_ flags.
|
|
* We usually compound at index time (dtsrindex) or when query
|
|
* is Query-By-Example (statistical searches), and usually don't
|
|
* compound boolean queries.
|
|
*/
|
|
static UCHAR *parse_substring (void)
|
|
{
|
|
static int is_substr_end = TRUE;
|
|
static int substrlen = 0;
|
|
static PARG myparg;
|
|
static UCHAR *token;
|
|
static long myoffset;
|
|
|
|
if (is_new_substring) {
|
|
substrlen = strlen ((char*)substrbuf);
|
|
|
|
/* A very common ascii substring is the final line-feed
|
|
* at the end of a line of text--discard it now.
|
|
*/
|
|
if (last_jstate == JS_ASCII
|
|
&& substrlen == 1
|
|
&& substrbuf[0] == '\n') {
|
|
is_substr_end = TRUE;
|
|
is_new_substring = FALSE;
|
|
return NULL;
|
|
}
|
|
|
|
is_substr_end = FALSE;
|
|
|
|
if (!outbuf)
|
|
outbuf = austext_malloc (DtSrMAXWIDTH_HWORD + 8,
|
|
PROGNAME"807", NULL);
|
|
|
|
if (debugging_jpn) {
|
|
int i;
|
|
fprintf (aa_stderr, "jpnsubstr: js=%s len=%ld str='",
|
|
display_jstate(last_jstate), substrlen);
|
|
for (i = 0; i < substrlen; i++)
|
|
fputc ((substrbuf[i] < 32)? '~' : substrbuf[i],
|
|
aa_stderr);
|
|
fprintf (aa_stderr, "'\n");
|
|
if (last_jstate == JS_ROMAN) {
|
|
fprintf (aa_stderr, " (ascii equiv: '");
|
|
for (i = 1; i < substrlen; i+=2)
|
|
fputc ((substrbuf[i] & 0x7f) + 32, aa_stderr);
|
|
fprintf (aa_stderr, "')\n");
|
|
}
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
} /* endif is_new_substring */
|
|
|
|
if (is_substr_end)
|
|
return NULL;
|
|
|
|
switch (last_jstate) {
|
|
|
|
case JS_DISCARD:
|
|
/* Ignore discardable substrings */
|
|
is_new_substring = FALSE;
|
|
is_substr_end = TRUE;
|
|
return NULL;
|
|
|
|
case JS_KATAKANA:
|
|
case JS_ROMAN:
|
|
case JS_CYRILLIC:
|
|
case JS_GREEK:
|
|
case JS_ALPHA:
|
|
case JS_HALFKATA:
|
|
/* Treat entire substring as single parsed word */
|
|
ENTIRE_SUBSTR_IS_WORD:
|
|
if (debugging_jpn)
|
|
fputs (" token is entire substring.\n", aa_stderr);
|
|
strncpy ((char*)outbuf, (char*)substrbuf, DtSrMAXWIDTH_HWORD);
|
|
outbuf [DtSrMAXWIDTH_HWORD - 1] = 0;
|
|
is_new_substring = FALSE;
|
|
is_substr_end = TRUE;
|
|
if (offsetp)
|
|
*offsetp = substr_offset;
|
|
return outbuf;
|
|
|
|
case JS_ASCII:
|
|
/* Call the full teskey_parser() for European languages.
|
|
* Includes stoplist and include list processing.
|
|
*/
|
|
if (is_new_substring) {
|
|
is_new_substring = FALSE;
|
|
if (debugging_jpn)
|
|
fputs (" calling teskey parser.\n", aa_stderr);
|
|
myparg.dblk = jpn_dblk;
|
|
myparg.string = substrbuf;
|
|
myparg.ftext = NULL;
|
|
myparg.offsetp = &myoffset;
|
|
token = (UCHAR *) teskey_parser (&myparg);
|
|
}
|
|
else
|
|
token = (UCHAR *) teskey_parser (NULL);
|
|
if (token) {
|
|
if (offsetp)
|
|
*offsetp = substr_offset + myoffset;
|
|
}
|
|
else
|
|
is_substr_end = TRUE;
|
|
return token;
|
|
|
|
case JS_KANJI:
|
|
/* If not compounding, treat entire substring
|
|
* as one query word, ie a single compound kanji word.
|
|
* If compounding, each individual kanji in the
|
|
* substring is returned as a word by itself.
|
|
* Each kanji can be 2 or 3 bytes depending on
|
|
* which code set it came from. In addition,
|
|
* sequences of 2 or more kanjis ('compound kanji
|
|
* words') are returned as individual words.
|
|
* Method of kanji compounding depends on language:
|
|
* DtSrLaJPN does "power series" kanji compounding,
|
|
* DtSrLaJPN2 looks up kanji compounds in a word tree.
|
|
* Both functions test and reset is_new_substring,
|
|
* update offsetp as necessary, and return either NULL
|
|
* or a pointer to outbuf containing a valid token.
|
|
*/
|
|
if (!do_compounding)
|
|
goto ENTIRE_SUBSTR_IS_WORD;
|
|
token = (language == DtSrLaJPN)?
|
|
kanji_compounder() : search_kanjitree();
|
|
if (!token)
|
|
is_substr_end = TRUE;
|
|
return token;
|
|
|
|
default:
|
|
break;
|
|
|
|
} /* end state switch */
|
|
|
|
/* Should never get here... */
|
|
fprintf (aa_stderr, catgets(dtsearch_catd, MS_lang, 20,
|
|
"%s Program Error: Unknown jstate %d.\n") ,
|
|
PROGNAME"246", last_jstate);
|
|
DtSearchExit (46);
|
|
} /* parse_substring() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* jpn_parser */
|
|
/* */
|
|
/************************************************/
|
|
/* Returns next word token from text stream of packed EUC
|
|
* Japanese text, languages DtSrLaJPN and DtSrLaJPN2.
|
|
* Called from (1) dtsrindex, where readchar_ftext() cofunction
|
|
* reads the .fzk file document 'stream', or (2) search engine
|
|
* query parsers, where readchar_string() cofunction 'reads'
|
|
* from the query string.
|
|
*
|
|
* First call passes args in PARG block. This resets end of
|
|
* text block (ETX) flag, resets 'offset' counter to zero, etc.
|
|
* Subsequent calls should pass NULL, and parser returns
|
|
* next token in block, until reader cofunction reads ETX
|
|
* end returns special ETX char ('\0'). Subsequent call to parser
|
|
* returns NULL meaning "no tokens left in current stream".
|
|
* Reader cofunction tolerates repeated calls after
|
|
* the first ETX, still returning '\0'.
|
|
*
|
|
* This parser presumes all incoming text is packed EUC multibyte
|
|
* Japanese chars as described above, but is otherwise unformatted.
|
|
* Since parser accesses streams a multibyte char at a time,
|
|
* it does not require periodic line feeds, etc.
|
|
*
|
|
* To control kanji compounding, caller should set a PA_ switch
|
|
* in parg.flags as desired before call. Compounding is done
|
|
* when indexing (dtsrindex) or for hiliting (comparing previous
|
|
* search results against all possible words in document text).
|
|
* But in a Query by Example (stat searches), parser might also
|
|
* be asked to generate compound words. In boolean queries
|
|
* (stems and exact words), parser should not generate compounds
|
|
* because if user enters a compound string, he probably only wants
|
|
* documents containing that exact token.
|
|
*
|
|
* Parser also returns offset information: number of bytes
|
|
* since beginning of text block. The returned offsets are
|
|
* NOT NECESSARILY IN ASCENDING ORDER due to kanji compounding.
|
|
*
|
|
* Variables are static or global for speeeeeeed.
|
|
*
|
|
* OUTPUT FORMAT: NULL or a static C string containing a
|
|
* single parsed word token.
|
|
* The text in the buffer is valid until the next call.
|
|
* Each word is translated as described above.
|
|
*/
|
|
char *jpn_parser (PARG *parg)
|
|
{
|
|
static int filling_substring = TRUE;
|
|
static int was_discarding = FALSE;
|
|
static int add_msgs = FALSE;
|
|
static UCHAR *endsubstrbuf = NULL;
|
|
static size_t substrbufsz = 0;
|
|
static UCHAR *token;
|
|
static UCHAR *substrp;
|
|
|
|
/* If first call for new text block... */
|
|
if (parg) {
|
|
jpn_dblk = parg->dblk;
|
|
language = jpn_dblk->dbrec.or_language;
|
|
kanjitree = ((JPNBLK *)(jpn_dblk->parse_extra))->kanjitree;
|
|
offsetp = parg->offsetp;
|
|
do_compounding = (parg->flags & (PA_HILITING | PA_INDEXING));
|
|
add_msgs = (parg->flags & PA_MSGS);
|
|
if (parg->string) { /* text is query str from search engine */
|
|
save_parg_string = parg->string;
|
|
readchar_arg = parg->string;
|
|
readchar = (READCFP) readchar_string;
|
|
}
|
|
else { /* text is from .fzk file in dtsrindex */
|
|
save_parg_string = NULL;
|
|
readchar_arg = parg;
|
|
readchar = (READCFP) readchar_ftext;
|
|
}
|
|
|
|
if (substrbufsz == 0) {
|
|
substrbufsz = SUBSTRBUFSZ;
|
|
substrbuf = austext_malloc (SUBSTRBUFSZ + 8, PROGNAME"680", NULL);
|
|
}
|
|
endsubstrbuf = substrbuf + substrbufsz;
|
|
|
|
if (debugging_jpn) {
|
|
fprintf (aa_stderr,
|
|
"jpnparser: start text block, substrbufsz=%ld.\n",
|
|
substrbufsz);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
/* Seed the first substring */
|
|
filling_substring = TRUE;
|
|
readcount = 0L;
|
|
last_jstate = JS_STX;
|
|
read_jchar();
|
|
|
|
} /* endif (parg != NULL) */
|
|
|
|
FILL_ANOTHER_SUBSTRING:
|
|
/* Input text is presumed to contain substrings
|
|
* of chars related by their EUC encoding.
|
|
* Fill the substring buffer by reading in nonDISCARDable
|
|
* multibyte jchars until jstate changes signaling
|
|
* end of a substring.
|
|
* Note last jchar read, the one that changes the jstate,
|
|
* hangs around till we come back to this loop.
|
|
*/
|
|
if (filling_substring) {
|
|
if (debugging_jpn) {
|
|
if (jstate == JS_DISCARD) {
|
|
fputs ("jpnparser: js=DISCARD:", aa_stderr);
|
|
was_discarding = TRUE;
|
|
}
|
|
else
|
|
was_discarding = FALSE;
|
|
}
|
|
while (jstate == JS_DISCARD) {
|
|
if (debugging_jpn)
|
|
fprintf (aa_stderr, " %s", jchar);
|
|
read_jchar();
|
|
}
|
|
if (debugging_jpn && was_discarding)
|
|
fputc ('\n', aa_stderr);
|
|
if (jstate == JS_ETX) {
|
|
if (debugging_jpn)
|
|
fputs ("jpnparser: js=ETX\n", aa_stderr);
|
|
if (add_msgs) {
|
|
char msgbuf [DtSrMAXWIDTH_HWORD + 100];
|
|
sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 21,
|
|
"%s '%.*s' is not a valid Japanese word.") ,
|
|
PROGNAME"812", DtSrMAXWIDTH_HWORD, save_parg_string);
|
|
DtSearchAddMessage (msgbuf);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
last_jstate = jstate;
|
|
substrp = substrbuf;
|
|
substr_offset = readcount - jcharlen;
|
|
|
|
/* Fill the substring buffer.
|
|
* Ensure substring buffer is big enough.
|
|
*/
|
|
while (last_jstate == jstate) {
|
|
if (endsubstrbuf - substrp < 8) {
|
|
size_t curlen = substrp - substrbuf;
|
|
if (debugging_jpn) {
|
|
fprintf (aa_stderr,
|
|
"jpnparser: curr substr len %ld, "
|
|
"new substrbufsz %ld.\n",
|
|
curlen, substrbufsz<<1);
|
|
fflush (aa_stderr);
|
|
}
|
|
substrbufsz <<= 1; /* double its size */
|
|
substrbuf = realloc (substrbuf, substrbufsz);
|
|
endsubstrbuf = substrbuf + substrbufsz;
|
|
substrp = substrbuf + curlen;
|
|
}
|
|
strncpy ((char*)substrp, (char*)jchar, jcharlen);
|
|
substrp += jcharlen;
|
|
read_jchar();
|
|
}
|
|
*substrp = 0;
|
|
filling_substring = FALSE;
|
|
is_new_substring = TRUE;
|
|
}
|
|
|
|
/* Empty the substring buffer returning each token
|
|
* one by one, ie parse and return word tokens from string,
|
|
* including possible kanji compounds if switched on.
|
|
*/
|
|
if (token = parse_substring())
|
|
return (char *) token;
|
|
|
|
/* When current substring is empty, go back and fill another one.
|
|
* If we're parsing a string (eg hiliting text of a doc),
|
|
* parse_substring() will have used readchar_string().
|
|
* Since we now want to resume using it to parse the original
|
|
* string, we have to reset it's string ptr.
|
|
*/
|
|
filling_substring = TRUE;
|
|
if (save_parg_string)
|
|
readchar_arg = save_parg_string + readcount;
|
|
goto FILL_ANOTHER_SUBSTRING;
|
|
|
|
} /* jpn_parser() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* load_jpntree */
|
|
/* */
|
|
/************************************************/
|
|
/* Subroutine of load_jpn_language. Builds a JPNTREE
|
|
* from a file of packed EUC compound words.
|
|
* Basically a copy of load_wordtree() in lang.c.
|
|
*
|
|
* INPUT FILE FORMAT: One word per line, min 4 bytes (2 jchars),
|
|
* all words packed EUC. Preferred order is frequency of
|
|
* occurrence in the corpus to make searches efficient.
|
|
* Otherwise the words should at least be in random order or
|
|
* an order that will approximate a binary search.
|
|
* If first char is ASCII (ie not packed EUC), line is
|
|
* ignored as comments. Any ascii chars after packed EUC,
|
|
* such as whitespace and/or subsequent ascii comments,
|
|
* delimits word token (ie anything else on the line is ignored).
|
|
* "Line" ends in ascii linefeed (\n).
|
|
*
|
|
* RETURNS 0 if file successfully loaded, returns 1 if file missing,
|
|
* returns 2 and messages in global msglist if file has fatal errors.
|
|
*/
|
|
static int load_jpntree (
|
|
JPNTREE **treetop,
|
|
char *fname)
|
|
{
|
|
int i;
|
|
int comment_count = 0;
|
|
int node_count = 0;
|
|
int is_duplicate;
|
|
long linecount = 0;
|
|
UCHAR *cptr;
|
|
UCHAR readbuf [256];
|
|
char sprintbuf [_POSIX_PATH_MAX + 1024];
|
|
FILE *fileid;
|
|
JPNTREE *new;
|
|
JPNTREE **this_link;
|
|
|
|
if (debugging_loadlang | debugging_loadword)
|
|
fprintf (aa_stderr, PROGNAME"1071 "
|
|
"load_jpntree: fname='%s'\n", NULLORSTR(fname));
|
|
|
|
if ((fileid = fopen (fname, "rt")) == NULL) {
|
|
/* Not being able to find the file is not an error.
|
|
* We indicate that with the return code.
|
|
* But any other error (like permissions) is fatal.
|
|
*/
|
|
if (errno == ENOENT) {
|
|
if (debugging_loadlang | debugging_loadword)
|
|
fputs (" ...file not found.\n", aa_stderr);
|
|
return 1;
|
|
}
|
|
else {
|
|
sprintf (sprintbuf,
|
|
catgets (dtsearch_catd, MS_misc, 362, "%s: %s: %s."),
|
|
PROGNAME"362", fname, strerror(errno));
|
|
DtSearchAddMessage (sprintbuf);
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
/*--------- Main Read Loop ----------*/
|
|
while (fgets ((char*)readbuf, sizeof(readbuf), fileid) != NULL) {
|
|
linecount++;
|
|
/*
|
|
* Ignore lines beginning with any ascii char (comments).
|
|
* Otherwise first or only packed EUC token on line
|
|
* is the desired word.
|
|
*/
|
|
if (readbuf[0] < 0x80) {
|
|
comment_count++;
|
|
continue;
|
|
}
|
|
for (cptr = readbuf; *cptr >= 0x80; cptr++)
|
|
;
|
|
*cptr = 0;
|
|
if (debugging_loadword) {
|
|
fprintf (aa_stderr, " JPNWORD: '%s' %n", readbuf, &i);
|
|
while (i++ < 28)
|
|
fputc (' ', aa_stderr);
|
|
}
|
|
|
|
/* Test for word too short */
|
|
if (strlen((char*)readbuf) < 4) {
|
|
sprintf (sprintbuf, catgets(dtsearch_catd, MS_lang, 23,
|
|
"%s Word '%s' on line %ld is too short.") ,
|
|
PROGNAME"1074", readbuf, linecount);
|
|
DtSearchAddMessage (sprintbuf);
|
|
continue;
|
|
}
|
|
|
|
/* Allocate and populate a new node */
|
|
i = strlen ((char*) readbuf);
|
|
new = austext_malloc (sizeof(JPNTREE) + i + 4,
|
|
PROGNAME"104", NULL);
|
|
new->llink = NULL;
|
|
new->rlink = NULL;
|
|
new->next = NULL;
|
|
new->len = i;
|
|
new->word = (void *) (new + 1);
|
|
strcpy (new->word, (char *) readbuf);
|
|
|
|
/* Search binary tree, comparing only first 4 bytes */
|
|
is_duplicate = FALSE;
|
|
for (this_link = treetop; *this_link != NULL; ) {
|
|
i = strncmp (new->word, (*this_link)->word, 4);
|
|
|
|
if (i == 0) {
|
|
/* If first 4 bytes are similar, search
|
|
* linked list, comparing entire string.
|
|
*/
|
|
while (*this_link != NULL) {
|
|
i = strcmp (new->word, (*this_link)->word);
|
|
|
|
/* Test for duplicate word */
|
|
if (i == 0) {
|
|
sprintf (sprintbuf,
|
|
catgets (dtsearch_catd, MS_misc, 423,
|
|
"%s Word '%s' in '%s' is a duplicate."),
|
|
PROGNAME"423", readbuf, fname);
|
|
DtSearchAddMessage (sprintbuf);
|
|
/* duplicates aren't fatal, just ignore the word */
|
|
is_duplicate = TRUE;
|
|
break; /* discontinue list search */
|
|
}
|
|
if (debugging_loadword)
|
|
fputc('-', aa_stderr);
|
|
this_link = &(*this_link)->next;
|
|
} /* end linked list search */
|
|
|
|
break; /* discontinue tree search */
|
|
} /* endif where first 4 bytes matched at a tree node */
|
|
|
|
/* First 4 bytes dissimilar. Descend tree
|
|
* to find next possible insertion point.
|
|
*/
|
|
if (debugging_loadword)
|
|
fputc(((i < 0)? 'L' : 'R'), aa_stderr);
|
|
this_link = (JPNTREE **) ((i < 0) ?
|
|
&(*this_link)->llink : &(*this_link)->rlink);
|
|
} /* end binary tree search */
|
|
|
|
/* Don't link anything if error found while descending tree */
|
|
if (is_duplicate) {
|
|
if (debugging_loadword)
|
|
fputs (" duplicate!\n", aa_stderr);
|
|
free (new);
|
|
continue;
|
|
}
|
|
|
|
/* Insert new node at current location in tree */
|
|
*this_link = new;
|
|
if (debugging_loadword)
|
|
fputs(".\n", aa_stderr);
|
|
node_count++;
|
|
} /* end of read loop */
|
|
|
|
fclose (fileid);
|
|
|
|
if (node_count <= 0) {
|
|
if (debugging_loadlang | debugging_loadword)
|
|
fprintf (aa_stderr,
|
|
PROGNAME"1185 load '%s' unsuccessful, %d comments discarded.\n",
|
|
fname, comment_count);
|
|
sprintf (sprintbuf, catgets(dtsearch_catd, MS_lang, 24,
|
|
"%s No Japanese words in word file '%s'.") ,
|
|
PROGNAME"1186", fname);
|
|
DtSearchAddMessage (sprintbuf);
|
|
return 2;
|
|
}
|
|
else {
|
|
if (debugging_loadlang | debugging_loadword)
|
|
fprintf (aa_stderr,
|
|
PROGNAME"1193 load word file '%s' successful, %d words.\n",
|
|
fname, node_count);
|
|
return 0;
|
|
}
|
|
} /* load_jpntree() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* load_jpn_language */
|
|
/* */
|
|
/************************************************/
|
|
/* Loads a dblk with japanese (DtSrLaJPN, DtSrLaJPN2)
|
|
* structures and function pointers.
|
|
* Called from load_language(), with identical input and output.
|
|
* Does not reload structures previously loaded in
|
|
* other jpn dblks on dblist if derived from identical files.
|
|
* But always loads structures if passed dblist is NULL.
|
|
* Presumes dblk already partially initialized:
|
|
* name, path, language, flags.
|
|
* Returns TRUE if all successful. Otherwise
|
|
* returns FALSE with err msgs on ausapi_msglist.
|
|
*/
|
|
int load_jpn_language (DBLK *dblk, DBLK *dblist)
|
|
{
|
|
extern int ascii_charmap[]; /* in lang.c */
|
|
int i;
|
|
int errcount = 0;
|
|
JPNBLK *jpnblk;
|
|
char fname [_POSIX_PATH_MAX + 4];
|
|
char path [_POSIX_PATH_MAX + 4];
|
|
char msgbuf [_POSIX_PATH_MAX + 128];
|
|
|
|
dblk->charmap = ascii_charmap; /* for teskey */
|
|
dblk->parser = jpn_parser;
|
|
dblk->lstrupr = null_lstrupr;
|
|
dblk->stemmer = null_stemmer;
|
|
if (dblk->dbrec.or_maxwordsz == 0) /* for teskey */
|
|
dblk->dbrec.or_maxwordsz = MAXWIDTH_SWORD - 1;
|
|
if (dblk->dbrec.or_minwordsz == 0) /* for teskey */
|
|
dblk->dbrec.or_minwordsz = MINWIDTH_TOKEN + 1;
|
|
jpnblk = austext_malloc (sizeof(JPNBLK) + 4, PROGNAME"2107", NULL);
|
|
memset (jpnblk, 0, sizeof(JPNBLK));
|
|
dblk->parse_extra = (void *) jpnblk;
|
|
|
|
/* Load optional katakana and kanji word lists.
|
|
* If specific dblk version not found,
|
|
* try the default language version.
|
|
* If either has load errors, return a failure.
|
|
* If both are missing, just forget it.
|
|
*/
|
|
if (dblk->path == NULL)
|
|
path[0] = 0;
|
|
else {
|
|
if (strlen (dblk->path) > _POSIX_PATH_MAX - 14) {
|
|
sprintf (msgbuf, catgets(dtsearch_catd, MS_lang, 25,
|
|
"%s Database '%s' path too long: '%s'.") ,
|
|
PROGNAME"759", dblk->name, dblk->path);
|
|
DtSearchAddMessage (msgbuf);
|
|
return FALSE;
|
|
}
|
|
strcpy (path, dblk->path);
|
|
ensure_end_slash (path);
|
|
}
|
|
|
|
#ifdef NO_KATAKANA_TREES_YET
|
|
/* Load katakana wordtree */
|
|
strcpy (fname, path);
|
|
strcat (fname, dblk->name);
|
|
strcat (fname, EXT_KATAKANA);
|
|
i = load_jpntree (&jpnblk->katatree, fname);
|
|
if (i == 1) { /* ...db specific file not found */
|
|
if (jpn_katatree == NULL) { /* load default... */
|
|
strcpy (fname, path);
|
|
strcat (fname, "jpn");
|
|
strcat (fname, EXT_KATAKANA);
|
|
i = load_jpntree (&jpn_katatree, fname);
|
|
}
|
|
else /* default already loaded */
|
|
i == 0;
|
|
jpnblk->katatree = jpn_katatree;
|
|
}
|
|
if (i > 1)
|
|
errcount++;
|
|
#endif /* NO_KATAKANA_TREES_YET */
|
|
|
|
/* Load kanji wordtree only if kanji compounds are derived
|
|
* from list in file, ie for language DtSrLaJPN2 only.
|
|
* If database specific list not found,
|
|
* use language generic list. If language generic
|
|
* list also not found, ignore compounding.
|
|
* Only one language generic list will
|
|
* be loaded, at jpn_kanjitree.
|
|
*/
|
|
if (dblk->dbrec.or_language == DtSrLaJPN2) {
|
|
strcpy (fname, path);
|
|
strcat (fname, dblk->name);
|
|
strcat (fname, EXT_KANJI);
|
|
i = load_jpntree (&jpnblk->kanjitree, fname);
|
|
if (i == 1) { /* ...db specific file not found */
|
|
/* If the generic knj file (jpn.knj) was
|
|
* never loaded, try loading it now.
|
|
*/
|
|
if (jpn_kanjitree == NULL) {
|
|
strcpy (fname, path);
|
|
strcat (fname, "jpn");
|
|
strcat (fname, EXT_KANJI);
|
|
load_jpntree (&jpn_kanjitree, fname);
|
|
/* (it either worked or it didn't) */
|
|
}
|
|
/* Whether generic load successful or not,
|
|
* try to use it (eg it might still be NULL).
|
|
*/
|
|
jpnblk->kanjitree = jpn_kanjitree;
|
|
}
|
|
if (i > 1) /* error trying to open db specific file */
|
|
errcount++;
|
|
}
|
|
|
|
return (errcount > 0)? FALSE : TRUE;
|
|
|
|
} /* load_jpn_language() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* free_jpntree */
|
|
/* */
|
|
/************************************************/
|
|
/* Identical to free_wordtree() in lang.c
|
|
* (link inversion traversal, from Data Structure Techniques,
|
|
* Thomas A. Standish, Algorithm 3.6),
|
|
* except post order visit includes freeing
|
|
* linked list at each tree node.
|
|
*/
|
|
static void free_jpntree (JPNTREE ** jpntree_head)
|
|
{
|
|
JPNTREE *next, *prev, *pres;
|
|
JPNTREE *listp, *next_listp;
|
|
|
|
if (*jpntree_head == NULL)
|
|
return;
|
|
pres = *jpntree_head;
|
|
prev = NULL;
|
|
|
|
DESCEND_LEFT:
|
|
pres->word = (void *) 0; /* preorder visit: TAG = 0 */
|
|
next = pres->llink;
|
|
if (next != NULL) {
|
|
pres->llink = prev;
|
|
prev = pres;
|
|
pres = next;
|
|
goto DESCEND_LEFT;
|
|
}
|
|
DESCEND_RIGHT:
|
|
next = pres->rlink;
|
|
if (next != NULL) {
|
|
pres->word = (void *) 1; /* TAG = 1 */
|
|
pres->rlink = prev;
|
|
prev = pres;
|
|
pres = next;
|
|
goto DESCEND_LEFT;
|
|
}
|
|
POSTORDER_VISIT:
|
|
listp = pres;
|
|
while (listp->next) {
|
|
next_listp = listp->next;
|
|
free (listp);
|
|
listp = next_listp;
|
|
}
|
|
free (listp);
|
|
|
|
if (prev == NULL) { /* end of algorithm? */
|
|
*jpntree_head = NULL;
|
|
return;
|
|
}
|
|
if (prev->word == (void *) 0) { /* go up left leg */
|
|
next = prev->llink;
|
|
pres = prev;
|
|
prev = next;
|
|
goto DESCEND_RIGHT;
|
|
}
|
|
else { /* go up right leg */
|
|
next = prev->rlink;
|
|
prev->word = (void *) 0; /* restore TAG = 0 */
|
|
pres = prev;
|
|
prev = next;
|
|
goto POSTORDER_VISIT;
|
|
}
|
|
} /* free_jpntree() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* unload_jpn_language */
|
|
/* */
|
|
/************************************************/
|
|
/* Frees storage for structures allocated by load_jpn_language().
|
|
* Called when engine REINITs due to change in site config file
|
|
* or databases.
|
|
* The global jpntrees are not currently unloaded because they
|
|
* are presumed valid for the duration of the engine session.
|
|
* Currently there are no teskey trees (inclist, stoplist) to free.
|
|
*/
|
|
void unload_jpn_language (DBLK *dblk)
|
|
{
|
|
/* free jpnblk and any database-associated jpntrees */
|
|
if (dblk->parse_extra) {
|
|
JPNBLK *jpnblk = (JPNBLK *) dblk->parse_extra;
|
|
if (jpnblk->katatree && jpnblk->katatree != jpn_katatree)
|
|
free_jpntree (&jpnblk->katatree);
|
|
if (jpnblk->kanjitree && jpnblk->kanjitree != jpn_kanjitree)
|
|
free_jpntree (&jpnblk->kanjitree);
|
|
free (jpnblk);
|
|
dblk->parse_extra = NULL;
|
|
}
|
|
return;
|
|
} /* unload_jpn_language() */
|
|
|
|
/******************** JPN.C ********************/
|
|
|