mirror of
git://git.code.sf.net/p/cdesktopenv/code
synced 2025-03-09 15:50:02 +00:00
Most of these are related to missing includes and prototypes as well as parens/braces. A few are also potential 64bit issues.
1596 lines
47 KiB
C
1596 lines
47 KiB
C
/*
|
|
* CDE - Common Desktop Environment
|
|
*
|
|
* Copyright (c) 1993-2012, The Open Group. All rights reserved.
|
|
*
|
|
* These libraries and programs are free software; you can
|
|
* redistribute them and/or modify them under the terms of the GNU
|
|
* Lesser General Public License as published by the Free Software
|
|
* Foundation; either version 2 of the License, or (at your option)
|
|
* any later version.
|
|
*
|
|
* These libraries and programs are distributed in the hope that
|
|
* they will be useful, but WITHOUT ANY WARRANTY; without even the
|
|
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
* PURPOSE. See the GNU Lesser General Public License for more
|
|
* details.
|
|
*
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
* License along with these librararies and programs; if not, write
|
|
* to the Free Software Foundation, Inc., 51 Franklin Street, Fifth
|
|
* Floor, Boston, MA 02110-1301 USA
|
|
*/
|
|
/* $XConsortium: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
|
|
*
|
|
* (c) Copyright 1996 Digital Equipment Corporation.
|
|
* (c) Copyright 1996 Hewlett-Packard Company.
|
|
* (c) Copyright 1996 International Business Machines Corp.
|
|
* (c) Copyright 1996 Sun Microsystems, Inc.
|
|
* (c) Copyright 1996 Novell, Inc.
|
|
* (c) Copyright 1996 FUJITSU LIMITED.
|
|
* (c) Copyright 1996 Hitachi.
|
|
*/
|
|
/*
|
|
* COMPONENT_NAME: austext
|
|
*
|
|
* FUNCTIONS: boolean_search
|
|
* calc_result_bitvec_WK
|
|
* calculate_idfs
|
|
* dbread_filter_WK
|
|
* get_proximity
|
|
* got_USR_STOPSRCH
|
|
* load_DtSrResults_WK
|
|
* load_or_wordrecs
|
|
* read_d99
|
|
* read_recno
|
|
* read_stem_bitvec_WK
|
|
* stuff_DtSrResult
|
|
* weights_filter_WK
|
|
*
|
|
* ORIGINS: 27
|
|
*
|
|
*
|
|
* (C) COPYRIGHT International Business Machines Corp. 1996
|
|
* All Rights Reserved
|
|
* Licensed Materials - Property of IBM
|
|
* US Government Users Restricted Rights - Use, duplication or
|
|
* disclosure restricted by GSA ADP Schedule Contract with IBM Corp.
|
|
*/
|
|
/********************* BOOLSRCH.C **********************
|
|
* $Id: boolsrch.c /main/4 1996/09/23 21:00:18 cde-ibm $
|
|
* February 1996.
|
|
* The vista code from the original vewords.c.
|
|
* Given a final truth table and stems array from the user's boolean
|
|
* query (output of boolean_search()), find all database records
|
|
* containing the truth table's set operations and return
|
|
* their database addresses in a resultlist.
|
|
* See boolpars.h for format and limitations of TRUTHTAB.
|
|
*
|
|
*-------------- D99DBA TO DBA CONVERSION ----------------
|
|
* 'd99dbas' are not real vista dbas! They were modified
|
|
* as follows to permit shorter bit vectors,
|
|
* and to minimize bit shifts at search time.
|
|
* vista_dba <- (OR_D00 << 24) | vista_slot
|
|
* vista_slot <- ((d99recno - 1) * or_recslots) + 2
|
|
* d99dba <- (d99recno << 8) | weight_byte
|
|
* d99recno <- ((vista_slot - 2) / or_recslots) + 1
|
|
* The d99 and bitvec recno of the first rec is 1.
|
|
* The slotno (vista dba) of the first rec is 2
|
|
* (dbrec occupies first slot and vista slots begin at 1).
|
|
*
|
|
* $Log$
|
|
* Revision 1.5 1996/03/20 19:21:49 miker
|
|
* Completed collocations code. Restored get_colloc_bitvec() from colloc.c.
|
|
*
|
|
* Revision 1.4 1996/03/18 22:06:24 miker
|
|
* Bug fix. Zero permute NOT queries always returned no hits.
|
|
*
|
|
* Revision 1.3 1996/03/13 23:05:24 miker
|
|
* Change long double constant to regular float for better portability.
|
|
*
|
|
* Revision 1.2 1996/03/13 22:36:37 miker
|
|
* Changed char to UCHAR several places; similar typecasts.
|
|
* Moved collocations processing to colloc.c.
|
|
*
|
|
* Revision 1.1 1996/03/05 15:52:06 miker
|
|
* Initial revision
|
|
*/
|
|
/***#define _ALL_SOURCE****/ /* to pickup typedefs for shm vnodes */
|
|
#include "SearchE.h"
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include "vista.h"
|
|
#include "boolpars.h"
|
|
|
|
/* Module tag; string-pasted in front of message numbers. */
#define PROGNAME        "BOOLSRCH"
#define INIT_ITERATIONS 50
/* Message catalog set number passed to catgets() by this module. */
#define MS_boolsrch     16

/*
 * DBAS_PER_BLOCK is the most dbas fetched from the d99 file by a
 * single read.  With a 4-byte DB_ADDR that is 128 * 4 = 512 bytes,
 * the standard blksize of one hard disk block.
 * NOTE(review): the 512-byte figure assumes sizeof(DB_ADDR) == 4.
 */
#define DBAS_PER_BLOCK  128
|
|
|
|
/* Clears the bits of mask 'bm' in byte 'by' of bit vector 'bv'.
 * Every argument and the whole expansion are parenthesized so that
 * compound arguments (eg 'mask << 1') and use inside larger
 * expressions behave as written.
 */
#define RESET_BIT(bv, by, bm)	((bv)[(by)] &= (UCHAR) ~(bm))
|
|
|
|
#if (DtSrMAX_STEMCOUNT != 8)
|
|
#error DtSrMAX_STEMCOUNT does not equal 8.
|
|
#endif
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* PROXWT */
|
|
/* */
|
|
/****************************************/
|
|
/* One entry of the table built by weights_filter_WK() and searched
 * by get_proximity(): a record's statistical weight, the position
 * of its bit in result_bitvec, and the proximity derived from the
 * weight.
 */
typedef struct {
    float   wt;         /* statistical weight copied from wtvec[] */
    long    byteno;     /* index of the record's byte in result_bitvec */
    int     bitmask;    /* single-bit mask inside that byte */
    int     proximity;  /* scaled proximity computed from wt */
} PROXWT;
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* GLOBALS */
|
|
/* */
|
|
/****************************************/
|
|
int debugging_boolsrch = FALSE;
|
|
|
|
static int all_key_types = TRUE;
|
|
static UCHAR *bitvec_allocp = NULL;
|
|
static size_t bitvec_allocsz = 0;
|
|
static long bitveclen; /* 1/8 of tot_addr_count */
|
|
static UCHAR *bitvecs [DtSrMAX_STEMCOUNT];
|
|
static int check_dates = FALSE;
|
|
static int do_stat_sort = FALSE;
|
|
static double idf [DtSrMAX_STEMCOUNT];
|
|
static char *msgbuf = NULL;
|
|
static int need_zero_permute = FALSE;
|
|
static struct or_objrec objrec;
|
|
static DB_ADDR objrecdba;
|
|
static int or_abstrsz = 0;
|
|
static int or_fzkeysz = 0;
|
|
static short or_language = DtSrLaENG;
|
|
static long or_maxdba; /* largest dba in database */
|
|
static long or_reccount; /* tot num db obj (real_num_rec) */
|
|
static long or_recslots; /* D00 slots per obj (slot_d00) */
|
|
static struct or_hwordrec
|
|
*or_wordrecs = NULL;
|
|
static PROXWT *proxwts = NULL;
|
|
static int proxwtct;
|
|
static UCHAR *result_bitvec;
|
|
static long result_count = 0;
|
|
static DtSrResult *resultlist = NULL;
|
|
static int save_stemno = 0;
|
|
static long tot_addr_count; /* may be > reccount bcs deletes */
|
|
static int vistano;
|
|
static float *wtvec = NULL;
|
|
|
|
extern void find_keyword (char *cur_word, int vista_num);
|
|
extern void read_wordstr (struct or_hwordrec * glob_word, int vista_num);
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* got_USR_STOPSRCH */
|
|
/* */
|
|
/************************************************/
|
|
/* Called at beginning of every workproc.
|
|
* Returns TRUE if user pushed STOP SEARCH button,
|
|
* else FALSE.
|
|
*/
|
|
static int got_USR_STOPSRCH (void)
|
|
{
|
|
if ((usrblk.flags & USR_STOPSRCH) == 0)
|
|
return FALSE;
|
|
if (OE_flags & OE_AUDIT)
|
|
oe_write_audit_rec (-1L);
|
|
usrblk.retncode = OE_USER_STOP;
|
|
return TRUE;
|
|
}
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* read_recno */
|
|
/* */
|
|
/****************************************/
|
|
/* Utility function.
|
|
* Reads a database record given a d99 record number.
|
|
* Returns TRUE and loads globals objrec and objrecdba
|
|
* on success, else returns FALSE.
|
|
*/
|
|
static int read_recno (long recno)
|
|
{
|
|
/* Convert recno to a real dba */
|
|
objrecdba = (recno - 1) * or_recslots + 2;
|
|
if (objrecdba >= or_maxdba)
|
|
return FALSE;
|
|
objrecdba |= (OR_D00 << 24);
|
|
|
|
/* Read the object record.
|
|
* Skip records with database read errors.
|
|
* Use d_crset instead of CRSET and d_recread
|
|
* instead of RECREAD to trap vista errors
|
|
* without aborting.
|
|
*/
|
|
d_crset (&objrecdba, vistano);
|
|
if (db_status != S_OKAY) {
|
|
BAD_DBA:
|
|
if (debugging_boolsrch) {
|
|
fprintf (aa_stderr,
|
|
PROGNAME"434 Invalid dba %ld. "
|
|
"recno=%ld bitvec[%d]=%02x db_status=%d.\n",
|
|
objrecdba, recno, recno>>3, 1<<(recno%8), db_status);
|
|
fflush (aa_stderr);
|
|
}
|
|
return FALSE;
|
|
}
|
|
d_recread (&objrec, vistano);
|
|
if (db_status != S_OKAY)
|
|
goto BAD_DBA;
|
|
swab_objrec (&objrec, NTOH);
|
|
return TRUE;
|
|
} /* read_recno() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* calculate_idfs */
|
|
/* */
|
|
/************************************************/
|
|
/* Subroutine of boolean_search() initialization.
|
|
* Loads idf[] (inverse doc frequency) for each stem.
|
|
* IDF = 1.0 for a word that occurs in every record.
|
|
* For a word that occurs only once in entire database:
|
|
* NUM OF DB RECS IDF OF SINGULAR WORD
|
|
* 10 4.32
|
|
* 100 7.64
|
|
* 1,000 10.97
|
|
* 10,000 14.29
|
|
* 100,000 17.61
|
|
* 1,000,000 20.93
|
|
* 10,000,000 24.25
|
|
*/
|
|
static void calculate_idfs (void)
|
|
{
|
|
int i;
|
|
char *cptr;
|
|
double dbl;
|
|
|
|
for (i = 0; i < saveusr.stemcount; i++) {
|
|
if ( or_wordrecs[i].or_hwaddrs == 0 ||
|
|
or_wordrecs[i].or_hwordkey[0] == '@')
|
|
idf[i] = 0.0;
|
|
else {
|
|
/* ln(2) = 0.693147181 */
|
|
dbl = (double) or_reccount / (double) or_wordrecs[i].or_hwaddrs;
|
|
idf[i] = log(dbl) / 0.693147181 + 1.0;
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr,
|
|
PROGNAME"733 IDF[%d] numdocs=%5ld idf=%lf\n",
|
|
i, or_wordrecs[i].or_hwaddrs, idf[i]);
|
|
}
|
|
}
|
|
return;
|
|
} /* calculate_idfs() */
|
|
|
|
|
|
/************************************************/
|
|
/* */
|
|
/* load_or_wordrecs */
|
|
/* */
|
|
/************************************************/
|
|
/* Subroutine of boolean_search() initialization.
|
|
* Loads or_wordrecs[] array with vista key file
|
|
* records for each term in saveusr.stems.
|
|
* Returns TRUE on success. Else returns FALSE with
|
|
* appropriate usrblk.retncode and user msgs on msglist.
|
|
*/
|
|
static int load_or_wordrecs (void)
|
|
{
|
|
int i, j, k;
|
|
int stemno;
|
|
struct or_hwordrec
|
|
*wordrec;
|
|
int colloc_count = 0;
|
|
int not_found_count = 0;
|
|
|
|
if (or_wordrecs)
|
|
free (or_wordrecs);
|
|
or_wordrecs = austext_malloc (
|
|
saveusr.stemcount * sizeof (struct or_hwordrec) + 16,
|
|
PROGNAME "782", NULL);
|
|
|
|
for (stemno = 0; stemno < saveusr.stemcount; stemno++) {
|
|
|
|
wordrec = &or_wordrecs [stemno];
|
|
|
|
/* If this is a collocation term,
|
|
* save the two indexes and the collocation
|
|
* value in the wordrec buffer instead of usual
|
|
* offsets and dba counts.
|
|
*/
|
|
if (saveusr.stems[stemno][0] == '@') {
|
|
strcpy (wordrec->or_hwordkey, saveusr.stems[stemno]);
|
|
sscanf (saveusr.stems[stemno], COLLOC_STEM_FORMAT, &i, &j, &k);
|
|
wordrec->or_hwoffset = i;
|
|
wordrec->or_hwfree = j;
|
|
wordrec->or_hwaddrs = k;
|
|
colloc_count++;
|
|
continue;
|
|
}
|
|
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr, PROGNAME"823 KEYFIND[%d] ", stemno);
|
|
find_keyword (saveusr.stems[stemno], vistano);
|
|
/*
|
|
* If term is found, add it to the or_wordrecs[] array.
|
|
* But it is an error to include a word in more records
|
|
* than the max specified in site config file. This is
|
|
* meaningful for databases where certain common high
|
|
* frequency words slip by which should be on the stoplist.
|
|
* It's possible in huge databases to run out of memory
|
|
* assembling very long resultlists.
|
|
*/
|
|
if (db_status == S_OKAY) {
|
|
strncpy (wordrec->or_hwordkey, saveusr.stems[stemno],
|
|
DtSrMAXWIDTH_HWORD);
|
|
wordrec->or_hwordkey [DtSrMAXWIDTH_HWORD - 1] = 0;
|
|
read_wordstr (wordrec, vistano);
|
|
if (db_status != S_OKAY) {
|
|
/* Probable corrupted database. The btree
|
|
* read succeeded but the record read failed.
|
|
*/
|
|
sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 6,
|
|
"%s Database Error. Word '%s' is\n"
|
|
"listed in database '%s' but has no index record.") ,
|
|
PROGNAME"295", usrblk.stems[stemno], usrblk.dblk->label);
|
|
DtSearchAddMessage (msgbuf);
|
|
usrblk.retncode = OE_SYSTEM_STOP;
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr,
|
|
"db error, db_status = %d.\n", db_status);
|
|
return FALSE;
|
|
}
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr, "ofs=%ld addrs=%ld free=%ld\n",
|
|
wordrec->or_hwoffset,
|
|
wordrec->or_hwaddrs,
|
|
wordrec->or_hwfree);
|
|
if (wordrec->or_hwaddrs > OE_words_hitlimit) {
|
|
sprintf (msgbuf, catgets (dtsearch_catd, MS_boolsrch, 14,
|
|
"%s '%s' has more than %ld hits.\n"
|
|
"Please remove it from the query or raise the WHITLIM\n"
|
|
"value in the search engine configuration file."),
|
|
PROGNAME"1444", wordrec->or_hwordkey, OE_words_hitlimit);
|
|
DtSearchAddMessage (msgbuf);
|
|
/* Also log WHITLIM msg for administrator... */
|
|
fprintf (aa_stderr, "%s\n", msgbuf);
|
|
usrblk.retncode = OE_BAD_QUERY;
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
/* Only other possible nonfatal vista return is S_NOTFOUND.
|
|
* If qry_is_all_ANDs we can quit right now.
|
|
* Otherwise switch off all bits in the word's bit vector.
|
|
*/
|
|
else if (qry_is_all_ANDs) {
|
|
if (debugging_boolsrch)
|
|
fputs ("not found, qry_all_ANDs, quit.\n", aa_stderr);
|
|
usrblk.retncode = OE_NOTAVAIL;
|
|
return FALSE;
|
|
}
|
|
|
|
else {
|
|
memset (wordrec, 0, sizeof(struct or_hwordrec));
|
|
if (debugging_boolsrch)
|
|
fputs ("not found, addrs-->0.\n", aa_stderr);
|
|
not_found_count++;
|
|
}
|
|
|
|
} /* end loop for each term in saveusr.stems[] */
|
|
|
|
/* It's a failure if all the user's words
|
|
* don't exist in database.
|
|
*/
|
|
if (not_found_count + colloc_count >= saveusr.stemcount) {
|
|
usrblk.retncode = OE_NOTAVAIL;
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
} /* load_or_wordrecs() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* get_proximity */
|
|
/* */
|
|
/****************************************/
|
|
/* Subroutine of stuff_DtSrResult().
|
|
* Given d99recno, finds proxwt[] for record,
|
|
* calculates and returns integer proximity.
|
|
*/
|
|
static int get_proximity (long recno)
|
|
{
|
|
long byteno = recno >> 3;
|
|
int bitmask = 1 << (recno % 8);
|
|
int i;
|
|
for (i = 0; i < proxwtct; i++)
|
|
if (proxwts[i].byteno == byteno && proxwts[i].bitmask == bitmask)
|
|
break;
|
|
if (i >= proxwtct)
|
|
return -1;
|
|
return proxwts[i].proximity;
|
|
} /* get_proximity() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* stuff_DtSrResult */
|
|
/* */
|
|
/****************************************/
|
|
/* Subroutine of load_DtSrResults_WK().
|
|
* Loads passed DtSrResult structure with data from global objrec.
|
|
* Performs additional vista reads as necessary to get misc recs.
|
|
*/
|
|
static void stuff_DtSrResult (
|
|
DtSrResult *new,
|
|
long recno)
|
|
{
|
|
int m;
|
|
int fzkey_remaining;
|
|
char *src, *targ, *targend;
|
|
static struct or_miscrec
|
|
miscrecbuf;
|
|
|
|
new->objflags = objrec.or_objflags;
|
|
new->objuflags = objrec.or_objuflags;
|
|
new->objsize = objrec.or_objsize;
|
|
new->objdate = objrec.or_objdate;
|
|
new->objtype = objrec.or_objtype;
|
|
new->objcost = objrec.or_objcost;
|
|
new->dbn = OE_dbn;
|
|
new->dba = objrecdba;
|
|
new->language = or_language;
|
|
strncpy (new->reckey, objrec.or_objkey, DtSrMAX_DB_KEYSIZE);
|
|
if (do_stat_sort)
|
|
new->proximity = get_proximity (recno);
|
|
|
|
/* The abstract immediately follows the fuzzy key
|
|
* in the FZKABS misc recs. It may span several recs.
|
|
*/
|
|
new->abstractp = (char *) (new + 1);
|
|
if (or_abstrsz > 0) {
|
|
targ = new->abstractp;
|
|
targend = targ + or_abstrsz - 1;
|
|
fzkey_remaining = or_fzkeysz;
|
|
CRSET (PROGNAME"226", &objrecdba, vistano);
|
|
SETOR (PROGNAME"227", OR_OBJ_MISCS, saveusr.vistano);
|
|
FINDFM (PROGNAME"228", OR_OBJ_MISCS, saveusr.vistano);
|
|
|
|
while (db_status == S_OKAY) {
|
|
RECREAD (PROGNAME"2209", &miscrecbuf, saveusr.vistano);
|
|
NTOHS (miscrecbuf.or_misctype);
|
|
if (miscrecbuf.or_misctype == ORM_FZKABS) {
|
|
src = (char *) miscrecbuf.or_misc;
|
|
|
|
for (m = 0; m < sizeof(miscrecbuf.or_misc); m++) {
|
|
|
|
/* skip over the fzkey */
|
|
if (fzkey_remaining > 0) {
|
|
src++;
|
|
fzkey_remaining--;
|
|
continue;
|
|
}
|
|
|
|
/* copy the abstract */
|
|
*targ = *src;
|
|
if (*src++ == 0 || targ++ >= targend) {
|
|
*targ = 0;
|
|
targ = targend; /* force outer loop end */
|
|
break;
|
|
}
|
|
} /* end for-loop m */
|
|
} /* end (misctype == FZKABS) */
|
|
|
|
if (targ >= targend)
|
|
break;
|
|
FINDNM (PROGNAME"545", OR_OBJ_MISCS, saveusr.vistano);
|
|
} /* end while-loop */
|
|
|
|
} /* endif: (or_abstrsz > 0) */
|
|
|
|
return;
|
|
} /* stuff_DtSrResult() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* load_DtSrResults_WK */
|
|
/* */
|
|
/****************************************/
|
|
/* Builds DtSrResult list for every record
|
|
* in result_bitvec, but not more than aa_maxhits.
|
|
*/
|
|
static void load_DtSrResults_WK (void)
|
|
{
|
|
long recno;
|
|
int bitno;
|
|
long byteno;
|
|
int i;
|
|
long dittocount;
|
|
DtSrResult *resultp;
|
|
size_t resultsz = sizeof(DtSrResult) + or_abstrsz + 4;
|
|
|
|
if (got_USR_STOPSRCH())
|
|
return;
|
|
if (resultlist) {
|
|
DtSearchFreeResults (&resultlist);
|
|
resultlist = NULL;
|
|
}
|
|
|
|
/* Make a single pass through the final result_bitvec.
|
|
* For each nonzero bit, ie each database record
|
|
* that satisfies the query requirements,
|
|
* retrieve the record and push it onto the
|
|
* DtSrResult list. If not sorting records,
|
|
* stop when we reach the user's specified aa_maxhits count.
|
|
*/
|
|
dittocount = 0;
|
|
for (recno = 1; recno < tot_addr_count; recno++) {
|
|
byteno = recno >> 3; /* divide by 8 */
|
|
bitno = recno % 8;
|
|
|
|
/* Skip zero bits */
|
|
if ((result_bitvec[byteno] & (1 << bitno)) == 0)
|
|
continue;
|
|
|
|
if (!read_recno (recno))
|
|
continue;
|
|
|
|
/* Create new DtSrResult node, push it onto resultlist. */
|
|
resultp = austext_malloc (resultsz + 4, PROGNAME"466", NULL);
|
|
memset (resultp, 0, resultsz);
|
|
resultp->link = resultlist;
|
|
resultlist = resultp;
|
|
|
|
/* Load the new DtSrResult node from the object record */
|
|
stuff_DtSrResult (resultp, recno);
|
|
|
|
/* Check if any more reads are necessary.
|
|
* If not sorting, stop after aa_maxhits.
|
|
* If sorting, there won't be more than
|
|
* aa_maxhits recs in the bitvec anyway.
|
|
*/
|
|
dittocount++;
|
|
if (dittocount >= aa_maxhits)
|
|
break;
|
|
|
|
} /* end bitvec loop */
|
|
|
|
|
|
/*--------- All Done. Clean up and return to caller. ---------*/
|
|
/*@@@@@@ make separate workproc call if aa_maxhits > 100.
|
|
@@@@@ sort may take a long time */
|
|
if (wtvec) {
|
|
free (wtvec);
|
|
wtvec = NULL;
|
|
}
|
|
if (proxwts) {
|
|
free (proxwts);
|
|
proxwts = NULL;
|
|
}
|
|
|
|
if (dittocount <= 0) {
|
|
usrblk.workproc = dummy_workproc;
|
|
usrblk.retncode = OE_NOTAVAIL;
|
|
return;
|
|
}
|
|
|
|
usrblk.retncode = OE_OK;
|
|
usrblk.workproc = dummy_workproc;
|
|
|
|
usrblk.stemcount = saveusr.stemcount;
|
|
if (usrblk.search_type == 'W')
|
|
memcpy (usrblk.stems, saveusr.stems,
|
|
saveusr.stemcount * DtSrMAXWIDTH_HWORD);
|
|
else
|
|
/* Don't copy first char (ctrl-o) stem */
|
|
for (i = 0; i < saveusr.stemcount; i++)
|
|
strcpy (usrblk.stems[i], &saveusr.stems[i][1]);
|
|
|
|
if (do_stat_sort)
|
|
DtSearchSortResults (&resultlist, DtSrSORT_PROX);
|
|
usrblk.dittocount = dittocount;
|
|
if (usrblk.dittolist)
|
|
DtSearchFreeResults (&usrblk.dittolist);
|
|
usrblk.dittolist = resultlist;
|
|
resultlist = NULL;
|
|
return;
|
|
} /* load_DtSrResults_WK() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* weights_filter_WK */
|
|
/* */
|
|
/****************************************/
|
|
/* This workproc is called only if we're doing statistical sorting.
|
|
* (1) It reduces the result_bitvec to it's final size,
|
|
* containing only the highest aa_maxhits statistical weights
|
|
* in wtvec.
|
|
* (2) It replaces (possibly large) wtvec with (probably much smaller)
|
|
* array of PROXWT structures containing the selected records'
|
|
* weights and calculated proximities, for final ranking sort.
|
|
*
|
|
*/
|
|
static void weights_filter_WK (void)
|
|
{
|
|
int i;
|
|
double scalefac;
|
|
long recno;
|
|
int smallest, biggest;
|
|
float biggestwt;
|
|
long byteno, smallest_byteno;
|
|
int bitmask, smallest_bitmask;
|
|
|
|
if (got_USR_STOPSRCH())
|
|
return;
|
|
|
|
/* Init weight filtering */
|
|
if (proxwts)
|
|
free (proxwts);
|
|
proxwtct = (result_count < aa_maxhits)? result_count : aa_maxhits;
|
|
proxwts = austext_malloc (proxwtct * sizeof(PROXWT) + 4,
|
|
PROGNAME"429", NULL);
|
|
memset (proxwts, 0, proxwtct * sizeof(PROXWT));
|
|
smallest = 0;
|
|
scalefac = 0.0;
|
|
biggestwt = 0.0; /* biggest single wt of all docs */
|
|
|
|
/* One pass thru entire result_bitvec */
|
|
for (recno = 1; recno < tot_addr_count; recno++) {
|
|
byteno = recno >> 3;
|
|
bitmask = 1 << (recno % 8);
|
|
|
|
/* Skip zero bits */
|
|
if ((result_bitvec[byteno] & bitmask) == 0)
|
|
continue;
|
|
|
|
/* Make scalefac = sum of squares of all wts in bitvec.
|
|
* It's possible that all or some of the weights are
|
|
* zero (eg queries like "~aaa" or "~aaa | bbb").
|
|
* In this case give them a very small positive number
|
|
* so we don't divide by zero later on.
|
|
*/
|
|
if (wtvec[recno] == 0.0)
|
|
wtvec[recno] = 0.1;
|
|
scalefac += (double) wtvec[recno] * (double) wtvec[recno];
|
|
|
|
/*
|
|
* The following logic first fills up the proxwts table.
|
|
* After that if a bitvec's weight is larger than the smallest
|
|
* proxwt, replace the smallest proxwt with the new weight
|
|
* and switch off the previous smallest in the original bitvec.
|
|
*/
|
|
|
|
/*
|
|
* Just discard rec on bitvec if it's weight
|
|
* is smaller than the current smallest.
|
|
*/
|
|
if (wtvec [recno] <= proxwts[smallest].wt) {
|
|
RESET_BIT (result_bitvec, byteno, bitmask);
|
|
result_count--;
|
|
continue;
|
|
}
|
|
/*
|
|
* Else discard current smallest if
|
|
* table full, ie it really points to something.
|
|
*/
|
|
if (proxwts[smallest].wt > 0.0) {
|
|
smallest_byteno = proxwts[smallest].byteno;
|
|
smallest_bitmask = proxwts[smallest].bitmask;
|
|
RESET_BIT (result_bitvec, smallest_byteno, smallest_bitmask);
|
|
result_count--;
|
|
}
|
|
|
|
/* Add this weight to the proxwts table. */
|
|
proxwts [smallest] .wt = wtvec [recno];
|
|
proxwts [smallest] .byteno = byteno;
|
|
proxwts [smallest] .bitmask = bitmask;
|
|
|
|
/* Keep track of the table entry that has
|
|
* the highest weight. This will eventually
|
|
* be the first sorted hit on the hitlist.
|
|
* It's weight/proximity will be used
|
|
* to scale the proximities of the
|
|
* other hits.
|
|
*/
|
|
if (biggestwt < wtvec[recno]) {
|
|
biggestwt = wtvec[recno];
|
|
biggest = smallest;
|
|
}
|
|
|
|
/* Find the next smallest */
|
|
smallest = 0;
|
|
for (i = 1; i < proxwtct; i++) {
|
|
if (proxwts[i].wt < proxwts[smallest].wt)
|
|
smallest = i;
|
|
}
|
|
|
|
} /* end loop on every recno */
|
|
|
|
free (wtvec);
|
|
wtvec = NULL;
|
|
|
|
/* PROXIMITY CALCULATIONS.
|
|
* In order to translate statistical weight into an AusText
|
|
* proximity, basically you have to invert it, then scale it.
|
|
* The statistical weight is a similarity measure: the
|
|
* larger it is the more similar the document to the query.
|
|
* But AusText 'proximity' is like a 'distance' measure,
|
|
* the smaller the number the closer the document is to the query.
|
|
*
|
|
* First 'normalize' each document's statistical
|
|
* weight to be a fraction between 0 and 1. Done
|
|
* by calculating a normalization factor,
|
|
* the sqrt of the sum of squares of weights of all
|
|
* docs that would have qualified for the hitlist
|
|
* if we weren't truncating. Note cosine-based normalization
|
|
* factor (Pythagorean) always >= largest wt so we can
|
|
* guarantee all normalized weights are > 0.0 and <= 1.0.
|
|
*
|
|
* The proximity itself is calculated as the 'percent value'
|
|
* that the doc is 'distant' from perfection (1.0 or 100%).
|
|
* For example, if the normalized weight of the first record
|
|
* is .931 then it's proximity will be 7 (100% - 93% = 7).
|
|
*
|
|
* The proximity of every other hit is scaled away
|
|
* from the first because the normalization algorithm
|
|
* tends to clump proximities when there are a lot of hits.
|
|
* Specifically the proximity of every hit is a constant
|
|
* scale factor (derived from the first proximity),
|
|
* divided by it's weight.
|
|
*
|
|
* A "bulls eye" (normalized weight = 1.0, proximity == 0)
|
|
* for the first hit is not allowed so scale factor will
|
|
* not also be zero. Otherwise *all* hits in that particular
|
|
* results list would be bulls eyes.
|
|
*/
|
|
scalefac = (double) biggestwt / sqrt (scalefac);
|
|
/* normalized weight of first hit */
|
|
scalefac = (1.0 - scalefac) * 100.0;
|
|
/* proximity of first hit */
|
|
if (scalefac < 1.0)
|
|
scalefac = 1.0;
|
|
/* No bulls eyes */
|
|
scalefac *= (double) biggestwt * 1.2;
|
|
/* scale factor for other hits */
|
|
for (i = 0; i < proxwtct; i++) {
|
|
proxwts[i].proximity = (int) (scalefac / (double) proxwts[i].wt);
|
|
if (proxwts[i].proximity > 9999)
|
|
proxwts[i].proximity = 9999;
|
|
}
|
|
|
|
if (debugging_boolsrch) {
|
|
fprintf (aa_stderr,
|
|
PROGNAME"489 FINAL PROXWTS proxwtct=%d bigwt=%.2f scalefac=%.2lf\n",
|
|
proxwtct, biggestwt, scalefac);
|
|
for (i=0; i<10; i++) {
|
|
if (i >= proxwtct)
|
|
break;
|
|
fprintf (aa_stderr,
|
|
" byteno=%3ld bitmask=%02x wt=%.2f prox=%d\n",
|
|
proxwts[i].byteno, proxwts[i].bitmask,
|
|
proxwts[i].wt, proxwts[i].proximity);
|
|
}
|
|
fprintf (aa_stderr, PROGNAME"499 WEIGHT RESULTS resultct=%ld bv=\n",
|
|
result_count);
|
|
for (i=0; i<22; i++) {
|
|
if (i >= bitveclen)
|
|
break;
|
|
fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
|
|
}
|
|
fputc ('\n', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
usrblk.retncode = OE_SEARCHING;
|
|
usrblk.workproc = load_DtSrResults_WK;
|
|
return;
|
|
} /* weights_filter_WK() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* dbread_filter_WK */
|
|
/* */
|
|
/****************************************/
|
|
/* Called if we must remove documents from result_bitvec
|
|
* because of keytype or date,
|
|
*/
|
|
static void dbread_filter_WK (void)
|
|
{
|
|
long recno;
|
|
long byteno;
|
|
int bitmask;
|
|
long discards;
|
|
|
|
if (got_USR_STOPSRCH())
|
|
return;
|
|
if (debugging_boolsrch) {
|
|
discards = 0;
|
|
fputs (PROGNAME"865 DBREAD discards (k=keytype d=date):\n", aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
/* One pass thru entire result_bitvec */
|
|
for (recno = 1; recno < tot_addr_count; recno++) {
|
|
byteno = recno >> 3;
|
|
bitmask = 1 << (recno % 8);
|
|
|
|
if ((result_bitvec[byteno] & bitmask) == 0)
|
|
continue;
|
|
|
|
if (!read_recno (recno))
|
|
continue;
|
|
|
|
/* Skip undesired record types */
|
|
if (!all_key_types) {
|
|
if (strchr (saveusr.ktchars, objrec.or_objkey[0]) == NULL) {
|
|
RESET_BIT (result_bitvec, byteno, bitmask);
|
|
result_count--;
|
|
if (debugging_boolsrch) {
|
|
discards++;
|
|
fputc ('k', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* Skip record if out of date range */
|
|
if (check_dates) {
|
|
if (!objdate_in_range (objrec.or_objdate,
|
|
usrblk.objdate1, usrblk.objdate2)) {
|
|
RESET_BIT (result_bitvec, byteno, bitmask);
|
|
result_count--;
|
|
if (debugging_boolsrch) {
|
|
discards++;
|
|
fputc ('d', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
continue;
|
|
}
|
|
}
|
|
|
|
} /* end loop on every recno */
|
|
|
|
if (debugging_boolsrch) {
|
|
int i;
|
|
if (discards)
|
|
fputc ('\n', aa_stderr);
|
|
fprintf (aa_stderr,
|
|
PROGNAME"857 DBREAD RESULTS discards=%ld resultct=%ld bv=\n",
|
|
discards, result_count);
|
|
for (i=0; i<22; i++) {
|
|
if (i >= bitveclen)
|
|
break;
|
|
fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
|
|
}
|
|
fputc ('\n', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
/* Determine next workproc.
|
|
* (1) If no records survived the read db filter,
|
|
* we're done, return 'no hits'.
|
|
* (2) If we're sorting, the next workproc reduces the
|
|
* bitvec to the aa_maxhits recs with the highest
|
|
* statistical weights.
|
|
* (3) Otherwise the next workproc just loads the hitlist.
|
|
*/
|
|
if (result_count <= 0) {
|
|
usrblk.retncode = OE_NOTAVAIL;
|
|
usrblk.workproc = dummy_workproc;
|
|
}
|
|
else if (do_stat_sort) {
|
|
usrblk.retncode = OE_SEARCHING;
|
|
usrblk.workproc = weights_filter_WK;
|
|
}
|
|
else {
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr,
|
|
PROGNAME"931 No sorting by statistical weights.\n");
|
|
usrblk.retncode = OE_SEARCHING;
|
|
usrblk.workproc = load_DtSrResults_WK;
|
|
}
|
|
return;
|
|
} /* dbread_filter_WK() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* calc_result_bitvec_WK */
|
|
/* */
|
|
/****************************************/
|
|
/* Second workproc after read_stem_bitvec_WK().
|
|
* If possible, minimizes size of truth table permutes,
|
|
* then applies them to stem bitvecs to create result_bitvec.
|
|
*/
|
|
static void calc_result_bitvec_WK (void)
|
|
{
|
|
int mask;
|
|
int cpm;
|
|
long byteno;
|
|
int bitno, stemno;
|
|
UCHAR permute;
|
|
UCHAR my_permutes [256];
|
|
int my_pmsz;
|
|
int i;
|
|
|
|
if (got_USR_STOPSRCH())
|
|
return;
|
|
|
|
/* If there are fewer than a full complement of stems,
|
|
* minimize size of truth table by discarding
|
|
* permutes that refer to unused stems.
|
|
*/
|
|
if (saveusr.stemcount < DtSrMAX_STEMCOUNT) {
|
|
/* Set high order bits of mask to mark unused stem positions */
|
|
mask = 0;
|
|
for (i = 0; i < saveusr.stemcount; i++)
|
|
mask |= 1 << i;
|
|
mask = ~mask;
|
|
|
|
/* 'cpm' is a candidate permute */
|
|
my_pmsz = 0;
|
|
for (cpm = 0; cpm < 256; cpm++) {
|
|
/*
|
|
* Discard candidate if it refers to an unused stem.
|
|
*/
|
|
if (cpm & mask)
|
|
continue;
|
|
/*
|
|
* Otherwise if candidate is in final_truthtab, keep it.
|
|
*/
|
|
for (i = 0; i < final_truthtab.pmsz; i++) {
|
|
if (final_truthtab.permutes[i] == (UCHAR) cpm) {
|
|
my_permutes [my_pmsz] = (UCHAR) cpm;
|
|
my_pmsz++;
|
|
}
|
|
}
|
|
}
|
|
if (debugging_boolsrch) {
|
|
fprintf (aa_stderr,
|
|
PROGNAME"565 Minimize truth table, pmsz=%d-->%d\n permutes=",
|
|
final_truthtab.pmsz, my_pmsz);
|
|
for (i=0; i<16; i++) {
|
|
if (i >= my_pmsz)
|
|
break;
|
|
fprintf (aa_stderr, " %02x", (int) my_permutes [i]);
|
|
}
|
|
fputc ('\n', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
final_truthtab.permutes = my_permutes;
|
|
final_truthtab.pmsz = my_pmsz;
|
|
} /* end minimize of permutes */
|
|
|
|
/* Calculate result bit vector.
|
|
* Loop 1 is a single pass through the bit vectors
|
|
* (a bit loop inside a byte loop).
|
|
* For each nonzero bit, ie each database record
|
|
* that has at least one of the query terms in it,
|
|
* build a 'permute' equivalent to the boolean
|
|
* representation of the terms in that record (Loop 2).
|
|
* Then search the truth table permutes for a match (Loop 3).
|
|
* If found, set the record's bit in the result_bitvec.
|
|
*/
|
|
|
|
/* LOOP 1. For each database addr... */
|
|
result_count = 0;
|
|
for (byteno = 0; byteno < bitveclen; byteno++) {
|
|
for (bitno = 0; bitno < 8; bitno++) {
|
|
mask = 1 << bitno;
|
|
|
|
/* LOOP 2. Build permute for each query term. */
|
|
permute = 0;
|
|
for (stemno = 0; stemno < saveusr.stemcount; stemno++) {
|
|
if (bitvecs [stemno] [byteno] & (UCHAR) mask)
|
|
permute |= 1 << stemno;
|
|
}
|
|
|
|
/* LOOP 3. Search truth table for matching permute. */
|
|
for (i = 0; i < final_truthtab.pmsz; i++) {
|
|
if (final_truthtab.permutes[i] == permute) {
|
|
result_bitvec [byteno] |= (UCHAR) mask;
|
|
result_count++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (debugging_boolsrch) {
|
|
fprintf (aa_stderr, PROGNAME"621 PRELIM RESULTS resultct=%ld bv=\n",
|
|
result_count);
|
|
for (i=0; i<22; i++) {
|
|
if (i >= bitveclen)
|
|
break;
|
|
fprintf (aa_stderr, " %02x", (int) result_bitvec[i]);
|
|
}
|
|
fputc ('\n', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
/* The next workprocs are 'filters', reducing the size
|
|
* of result_bitvec by removing various unwanted records.
|
|
* They're done in the following order:
|
|
* (1) If no records survived the truth table manipulations,
|
|
* we're done, return 'no hits'.
|
|
* (2) If we must remove documents because of keytype or date,
|
|
* the next workproc is the filter that reads the database.
|
|
* (3) If we're sorting, the next workproc reduces the
|
|
* bitvec to the aa_maxhits recs with the highest
|
|
* statistical weights.
|
|
* (4) Otherwise the next workproc just loads the hitlist.
|
|
*/
|
|
if (result_count <= 0) {
|
|
usrblk.retncode = OE_NOTAVAIL;
|
|
usrblk.workproc = dummy_workproc;
|
|
}
|
|
else if (!all_key_types || check_dates) {
|
|
usrblk.retncode = OE_SEARCHING;
|
|
usrblk.workproc = dbread_filter_WK;
|
|
}
|
|
else if (do_stat_sort) {
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr,
|
|
PROGNAME"948 No db reads necessary for date or keytype.\n");
|
|
usrblk.retncode = OE_SEARCHING;
|
|
usrblk.workproc = weights_filter_WK;
|
|
}
|
|
else {
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr,
|
|
PROGNAME"625 No filtering: no sort and no db reads.\n");
|
|
usrblk.retncode = OE_SEARCHING;
|
|
usrblk.workproc = load_DtSrResults_WK;
|
|
}
|
|
return;
|
|
} /* calc_result_bitvec_WK() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* read_d99 */
|
|
/* */
|
|
/****************************************/
|
|
/* Subroutine of read_stem_bitvec_WK().
|
|
* Repeatedly called to get each d99dba in the inverted index
|
|
* file (d99) for a specific index term. The first call passes
|
|
* the term's wordrec with d99 offset and size information.
|
|
* Subsequent calls pass NULL.
|
|
* Returns valid d99dba, or 0 at end of term's index, or -1 on error.
|
|
* Actual reads are performed a disk block at a time,
|
|
* with dbas stored in a static buffer for the next call.
|
|
*/
|
|
static DB_ADDR read_d99 (struct or_hwordrec *wordrec)
|
|
{
|
|
static DB_ADDR readbuf [DBAS_PER_BLOCK];
|
|
static DB_ADDR *bufptr, *endbuf;
|
|
static FILE *fptr;
|
|
static long bal_read, request_read, actual_read;
|
|
|
|
/* First call for new term */
|
|
if (wordrec) {
|
|
fptr = usrblk.dblk->iifile;
|
|
fseek (fptr, wordrec->or_hwoffset, SEEK_SET);
|
|
bal_read = wordrec->or_hwaddrs;
|
|
bufptr = endbuf = 0; /* triggers block read */
|
|
}
|
|
|
|
/* Time to read another block */
|
|
if (bufptr >= endbuf) {
|
|
if (bal_read <= 0)
|
|
return 0;
|
|
if (bal_read > DBAS_PER_BLOCK) {
|
|
request_read = DBAS_PER_BLOCK;
|
|
bal_read -= DBAS_PER_BLOCK;
|
|
endbuf = readbuf + DBAS_PER_BLOCK;
|
|
}
|
|
else {
|
|
/* last block is usually short */
|
|
request_read = bal_read;
|
|
bal_read = 0;
|
|
endbuf = readbuf + request_read;
|
|
}
|
|
if (fread (readbuf, sizeof(DB_ADDR), request_read, fptr)
|
|
!= request_read) {
|
|
sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 28,
|
|
"%s Database Read Error in %s.d99.") ,
|
|
PROGNAME"428", usrblk.dblk->name);
|
|
DtSearchAddMessage (msgbuf);
|
|
return -1;
|
|
}
|
|
bufptr = readbuf;
|
|
}
|
|
|
|
/******return *bufptr++;*******/
|
|
return ntohl (*bufptr++);
|
|
} /* read_d99() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* get_colloc_bitvec */
|
|
/* */
|
|
/****************************************/
|
|
/* Subroutine of read_stem_bitvec_WK().
|
|
* Constructs a 'collocation bitvector' for current save_stemno.
|
|
* A collocation expression requests the return of all records
|
|
* containing both of two terms (a kind of boolean AND) such that
|
|
* the occurrences are within n characters of each other.
|
|
* For example "ICE @5 CREAM" requests the return of all records
|
|
* containing both "ICE" and "CREAM" but only if they are separated
|
|
* by no more than 5 characters.
|
|
*
|
|
* Since offset information is not stored in the inverted index
|
|
* this module initially returns the intersection of the two words'
|
|
* bit vectors (boolean AND). Then it retrieves each record,
|
|
* builds an offset (hilites) table for each of the two words,
|
|
* then compares the offset differences in the tables.
|
|
* If no occurrence pairs are within the specified separation
|
|
* range, the record is deleted from the bitvector.
|
|
* Returns 0 if successful, otherwise returns -1 and msgs.
|
|
@@@@ rewrite as its own workproc--reading/hiliting can take a long time...
|
|
*/
|
|
static int get_colloc_bitvec (void)
|
|
{
|
|
int stemno_A = or_wordrecs[save_stemno].or_hwoffset;
|
|
int stemno_B = or_wordrecs[save_stemno].or_hwfree;
|
|
long range = or_wordrecs[save_stemno].or_hwaddrs;
|
|
UCHAR *bitvec_A = bitvecs [stemno_A];
|
|
UCHAR *bitvec_B = bitvecs [stemno_B];
|
|
UCHAR *bitvec_C = bitvecs [save_stemno];
|
|
long byteno, recno;
|
|
UCHAR bitmask;
|
|
int parse_type;
|
|
int got_a_colloc;
|
|
char *stemp;
|
|
DtSrHitword *hitwords_A, *hitwords_B;
|
|
long hitwcount_A, hitwcount_B;
|
|
long threshold_range;
|
|
DB_ADDR dba;
|
|
LLIST *bloblist;
|
|
long a, b, offset_A, offset_B;
|
|
|
|
/* First construct the set intersection (AND) of
|
|
* each of the collocated terms in the colloc bitvec.
|
|
*/
|
|
for (byteno = 0; byteno < bitveclen; byteno++)
|
|
bitvec_C [byteno] = bitvec_A [byteno] & bitvec_B [byteno];
|
|
if (debugging_boolsrch) {
|
|
int i;
|
|
fprintf (aa_stderr,
|
|
PROGNAME"312 INTERSECT[%d] (colloc %d & %d):\n",
|
|
save_stemno, stemno_A, stemno_B);
|
|
for (i=0; i<bitveclen; i++) {
|
|
fprintf (aa_stderr, " %02x", bitvec_C[i]);
|
|
if (i > 22)
|
|
break;
|
|
}
|
|
fputc ('\n', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
/* Read cleartext for each rec in intersection/colloc bitvec.
|
|
* Get hitwords (hilite table) for each collocation term.
|
|
* Switch off recs in bitvec where no term pairs are in
|
|
* collocation range.
|
|
*/
|
|
for (recno = 1; recno < tot_addr_count; recno++) {
|
|
byteno = recno >> 3; /* divide by 8 */
|
|
bitmask = 1 << (recno % 8);
|
|
|
|
/* Skip zero bits */
|
|
if ((bitvec_C[byteno] & bitmask) == 0)
|
|
continue;
|
|
|
|
/* Convert recno to vista database address.
|
|
* Silently skip rec if dba doesn't exist.
|
|
*/
|
|
dba = (recno - 1) * or_recslots + 2;
|
|
if (dba >= or_maxdba) {
|
|
RESET_BIT (bitvec_C, byteno, bitmask);
|
|
continue;
|
|
}
|
|
dba |= (OR_D00 << 24);
|
|
|
|
/* Silently skip records that have no document text */
|
|
if ((bloblist = ve_getblobs (dba, vistano)) == NULL) {
|
|
if (debugging_boolsrch) {
|
|
fprintf (aa_stderr,
|
|
PROGNAME"126 No blobs for recno=%ld byteno=%ld mask%02x\n",
|
|
recno, byteno, bitmask);
|
|
fflush (aa_stderr);
|
|
}
|
|
RESET_BIT (bitvec_C, byteno, bitmask);
|
|
continue;
|
|
}
|
|
|
|
/* Uncompress record text into usrblk.cleartext */
|
|
if (oe_unblob (bloblist) != OE_OK)
|
|
return -1;
|
|
|
|
/* Build 'hilite' table for stem A. If stem
|
|
* can't be found in the record, silently skip it.
|
|
* Otherwise save the table.
|
|
*/
|
|
stemp = saveusr.stems [stemno_A];
|
|
if (stemp[0] == STEM_CH) {
|
|
parse_type = 'S';
|
|
stemp++;
|
|
}
|
|
else
|
|
parse_type = 'W';
|
|
if (!hilite_cleartext (parse_type, stemp, 1)) {
|
|
RESET_BIT (bitvec_C, byteno, bitmask);
|
|
continue;
|
|
}
|
|
hitwords_A = usrblk.hitwords;
|
|
hitwcount_A = usrblk.hitwcount;
|
|
usrblk.hitwords = NULL;
|
|
usrblk.hitwcount = 0;
|
|
|
|
/* In the same way build 'hilite' table for stem B */
|
|
stemp = saveusr.stems [stemno_B];
|
|
if (stemp[0] == STEM_CH) {
|
|
parse_type = 'S';
|
|
stemp++;
|
|
}
|
|
else
|
|
parse_type = 'W';
|
|
if (!hilite_cleartext (parse_type, stemp, 1)) {
|
|
RESET_BIT (bitvec_C, byteno, bitmask);
|
|
free (hitwords_A);
|
|
continue;
|
|
}
|
|
hitwords_B = usrblk.hitwords;
|
|
hitwcount_B = usrblk.hitwcount;
|
|
usrblk.hitwords = NULL;
|
|
usrblk.hitwcount = 0;
|
|
|
|
/* Compare the two hilite tables for range matches */
|
|
got_a_colloc = FALSE;
|
|
b = 0;
|
|
for (a = 0; a < hitwcount_A; a++) {
|
|
offset_A = hitwords_A[a].offset;
|
|
threshold_range = offset_A + hitwords_A[a].length + range;
|
|
for (; b < hitwcount_B; b++) {
|
|
offset_B = hitwords_B[b].offset;
|
|
|
|
/* Advance B to first entry past A's offset */
|
|
if (offset_B <= offset_A )
|
|
continue; /* ...the B loop */
|
|
if (offset_B <= threshold_range)
|
|
got_a_colloc = TRUE;
|
|
break; /* ...the B loop */
|
|
} /* end B loop */
|
|
if (got_a_colloc || b >= hitwcount_B)
|
|
break; /* ...the A loop */
|
|
} /* end A loop */
|
|
free (hitwords_A);
|
|
free (hitwords_B);
|
|
|
|
/* If no collocations found within range,
|
|
* switch off rec in colloc bitvec.
|
|
*/
|
|
if (!got_a_colloc)
|
|
RESET_BIT (bitvec_C, byteno, bitmask);
|
|
|
|
} /* end loop on each recno in intersection/colloc bitvec */
|
|
|
|
return 0;
|
|
} /* get_colloc_bitvec() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* read_stem_bitvec_WK */
|
|
/* */
|
|
/****************************************/
|
|
/* First workproc after boolean_search().
|
|
* Each iterative call loads one (save_stemno) real stem's bitvec.
|
|
* After last stem bitvec loaded, sets up
|
|
* call to next workproc in sequence.
|
|
*/
|
|
static void read_stem_bitvec_WK (void)
|
|
{
|
|
long byteno;
|
|
DB_ADDR d99recno;
|
|
float weight;
|
|
|
|
if (got_USR_STOPSRCH())
|
|
return;
|
|
|
|
/* Process collocation 'stems' */
|
|
if (saveusr.stems [save_stemno] [0] == '@') {
|
|
d99recno = get_colloc_bitvec();
|
|
goto DONE_READING;
|
|
}
|
|
|
|
for ( d99recno = read_d99 (&or_wordrecs [save_stemno]);
|
|
d99recno;
|
|
d99recno = read_d99 (NULL)) {
|
|
if (d99recno == -1) /* read error */
|
|
break;
|
|
|
|
/* Save low byte 'statistical weight' value.
|
|
* It can only be 0 - 255.
|
|
*/
|
|
if (do_stat_sort)
|
|
weight = (float) (d99recno & 0x000000ff) + 1.0;
|
|
|
|
d99recno = (d99recno >> 8) & 0x00ffffff;
|
|
|
|
/* Set correct bit in bitvec.
|
|
* The byte number is the recno divided by 8.
|
|
* The bit number is the remainder after division by 8.
|
|
*/
|
|
if ((byteno = d99recno >> 3) >= bitveclen) {
|
|
sprintf (msgbuf, catgets(dtsearch_catd, MS_boolsrch, 32,
|
|
"%s Database Error: %s '%s'\n"
|
|
"in database '%s' has invalid d99 record number %ld.") ,
|
|
PROGNAME"394",
|
|
(usrblk.search_type == 'W') ?
|
|
catgets(dtsearch_catd, MS_boolsrch, 33, "Word") :
|
|
catgets(dtsearch_catd, MS_boolsrch, 34, "Stem of"),
|
|
usrblk.stems [save_stemno],
|
|
usrblk.dblk->label,
|
|
d99recno);
|
|
DtSearchAddMessage (msgbuf);
|
|
d99recno = -1; /* force error return */
|
|
goto DONE_READING;
|
|
}
|
|
bitvecs [save_stemno] [byteno] |= 1 << (d99recno % 8);
|
|
|
|
/* Add to correct weight in weight vector.
|
|
* IDF ranges between 1.0 and 20.0, and weight
|
|
* is 1 - 256, so we're adding 1 - ~5000 to wtvec.
|
|
*/
|
|
if (do_stat_sort)
|
|
wtvec [d99recno] += weight * (float) idf [save_stemno];
|
|
|
|
} /* end loop that retrieves every d99recno for curr stem */
|
|
|
|
DONE_READING:
|
|
|
|
if (debugging_boolsrch) {
|
|
int i;
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr, PROGNAME"313 BITVEC[%ld]:\n", save_stemno);
|
|
for (i=0; i<bitveclen; i++) {
|
|
fprintf (aa_stderr, " %02x", bitvecs[save_stemno][i]);
|
|
if (i > 22)
|
|
break;
|
|
}
|
|
fputc ('\n', aa_stderr);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
if (d99recno == 0) {
|
|
/* Normal conclusion. Increment to next stem.
|
|
* If not all stems have been read,
|
|
* this is still the next workproc.
|
|
* Otherwise the next workproc is the one
|
|
* merging all bitvectors into the final
|
|
* result bitvec using the truth table.
|
|
*/
|
|
usrblk.retncode = OE_SEARCHING;
|
|
if (++save_stemno < saveusr.stemcount)
|
|
usrblk.workproc = read_stem_bitvec_WK;
|
|
else
|
|
usrblk.workproc = calc_result_bitvec_WK;
|
|
}
|
|
else
|
|
/* d99recno must be -1 */
|
|
usrblk.retncode = OE_SYSTEM_STOP;
|
|
return;
|
|
} /* read_stem_bitvec_WK() */
|
|
|
|
|
|
/****************************************/
|
|
/* */
|
|
/* boolean_search */
|
|
/* */
|
|
/****************************************/
|
|
/* Called from Opera_Engine after successful boolean_parse().
|
|
* Expects valid globals: saveusr.stems, saveusr.stemcount,
|
|
* usrblk.stems (contains original unstemmed query terms for msgs),
|
|
* usrblk.search_type, final_truthtab, qry_has_no_NOTs,
|
|
* and qry_is_all_ANDs.
|
|
* Based on parts of the function ve_word_search().
|
|
* Upon return, usrblk.retncode, msglist, etc is appropriately loaded.
|
|
* Upon successful return usrblk.stems, usrblk.stemcount,
|
|
* and dittolist are also loaded.
|
|
*/
|
|
void boolean_search (void)
|
|
{
|
|
int i, j;
|
|
size_t allocsz_needed;
|
|
|
|
/* Sanity checks */
|
|
if ( saveusr.stemcount <= 0 ||
|
|
final_truthtab.pmsz <= 0 ||
|
|
final_truthtab.pmsz >= 256 ) {
|
|
fprintf (aa_stderr, catgets(dtsearch_catd, MS_boolsrch, 35,
|
|
"%s Program Error: stemct=%d pmsz=%d\n") ,
|
|
PROGNAME"1404", saveusr.stemcount, final_truthtab.pmsz);
|
|
DtSearchExit (14);
|
|
}
|
|
|
|
/*---------- Init globals ----------*/
|
|
if (!msgbuf)
|
|
msgbuf = austext_malloc (500, PROGNAME"393", NULL);
|
|
debugging_boolsrch = (usrblk.debug & USRDBG_SRCHCMPL);
|
|
need_zero_permute = (final_truthtab.permutes[0] == 0);
|
|
do_stat_sort = ((usrblk.flags & USR_SORT_WHITL) != 0);
|
|
check_dates = (usrblk.objdate1 || usrblk.objdate2);
|
|
or_abstrsz = usrblk.dblk->dbrec.or_abstrsz;
|
|
or_fzkeysz = usrblk.dblk->dbrec.or_fzkeysz;
|
|
or_language = usrblk.dblk->dbrec.or_language;
|
|
or_maxdba = usrblk.dblk->dbrec.or_maxdba;
|
|
usrblk.flags &= ~USR_STOPSRCH; /* turn off stop button */
|
|
|
|
saveusr.vistano = vistano = usrblk.dblk->vistano;
|
|
saveusr.dittolist = NULL;
|
|
saveusr.dittocount = 0L;
|
|
saveusr.iterations = INIT_ITERATIONS;
|
|
/*
|
|
* saveusr.ktchars is a string holding
|
|
* first char of desired record ids.
|
|
*/
|
|
all_key_types = TRUE;
|
|
for (i = 0, j = 0; i < usrblk.dblk->ktcount; i++) {
|
|
if (usrblk.dblk->keytypes[i].is_selected)
|
|
saveusr.ktchars[j++] = usrblk.dblk->keytypes[i].ktchar;
|
|
else
|
|
all_key_types = FALSE;
|
|
}
|
|
saveusr.ktchars[j] = '\0';
|
|
|
|
or_recslots = (long) (usrblk.dblk->dbrec.or_recslots);
|
|
or_reccount = usrblk.dblk->dbrec.or_reccount;
|
|
|
|
/* RECFRST is just to get the slot# (dba) of the
|
|
* first real object record after the dbrec.
|
|
* Currently the dbrec occupies only one slot,
|
|
* the first (#1), so dba will usually be #2.
|
|
*/
|
|
/********
|
|
RECFRST(PROGNAME"2545", OR_OBJREC, saveusr.vistano);
|
|
CRGET(PROGNAME"2546", &dba, saveusr.vistano);
|
|
dba &= 0x00FFFFFF;
|
|
********/
|
|
tot_addr_count = ((usrblk.dblk->dbrec.or_maxdba + 1) / or_recslots) + 1;
|
|
bitveclen = (tot_addr_count >> 3) + 1;
|
|
|
|
if (debugging_boolsrch) {
|
|
fprintf (aa_stderr, PROGNAME"360 "
|
|
"boolean_search: typ='%c' needzpm?=%d sort?=%d maxhits=%d\n"
|
|
" maxdba=%ld recct=%ld recslts=%ld\n"
|
|
" totnmadr=%ld bvln=%ld allkts?=%d ktchars='%s'\n"
|
|
,usrblk.search_type
|
|
,need_zero_permute
|
|
,do_stat_sort
|
|
,aa_maxhits
|
|
,usrblk.dblk->dbrec.or_maxdba
|
|
,or_reccount
|
|
,or_recslots
|
|
,tot_addr_count
|
|
,bitveclen
|
|
,all_key_types
|
|
,saveusr.ktchars
|
|
);
|
|
fflush (aa_stderr);
|
|
}
|
|
|
|
|
|
/*---------- Read vista btree ----------
|
|
* Load or_wordrecs[] array for each term in saveusr.stems.
|
|
*/
|
|
if (!load_or_wordrecs())
|
|
return;
|
|
|
|
/* If statistically sorting final resultlist, calculate
|
|
* idf (inverse document frequency) for each term using
|
|
* the frequency data in or_wordrecs[].
|
|
*/
|
|
if (do_stat_sort)
|
|
calculate_idfs();
|
|
|
|
/* Bitvector allocation. Number needed is one for each stem,
|
|
* plus one extra to accumulate the result bitvector.
|
|
*/
|
|
allocsz_needed = bitveclen * (saveusr.stemcount + 1);
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr, PROGNAME"430 "
|
|
"bitvecs[] alloc needed=%ld (bvln=%ld stems=%d+1), have=%ld.\n",
|
|
allocsz_needed, bitveclen, saveusr.stemcount, bitvec_allocsz);
|
|
if (bitvec_allocsz < allocsz_needed) {
|
|
if (bitvec_allocp)
|
|
free (bitvec_allocp);
|
|
bitvec_allocp = austext_malloc (allocsz_needed + 16,
|
|
PROGNAME"508", NULL);
|
|
if (debugging_boolsrch)
|
|
fprintf (aa_stderr, PROGNAME"432 bitvecs[] realloc %ld-->%ld.\n",
|
|
bitvec_allocsz, allocsz_needed);
|
|
bitvec_allocsz = allocsz_needed;
|
|
}
|
|
|
|
/* Clear all bitvecs to zero and assign them */
|
|
memset (bitvec_allocp, 0, allocsz_needed);
|
|
for (i = 0; i < saveusr.stemcount; i++)
|
|
bitvecs[i] = bitvec_allocp + (i * bitveclen);
|
|
result_bitvec = bitvec_allocp + (i * bitveclen);
|
|
|
|
/* If sorting statistically, allocate weight vector.
|
|
* One float for each db record.
|
|
*/
|
|
if (wtvec) {
|
|
free (wtvec);
|
|
wtvec = NULL;
|
|
}
|
|
if (do_stat_sort) {
|
|
wtvec = austext_malloc ((tot_addr_count + 4) * sizeof(float) + 4,
|
|
PROGNAME"040", NULL);
|
|
memset (wtvec, 0, (tot_addr_count + 4) * sizeof(float));
|
|
}
|
|
|
|
/* The 'zero permute' is every record that has
|
|
* NONE of the query terms in it. It can only be
|
|
* generated if a NOT operator was included in the query.
|
|
*/
|
|
if (need_zero_permute) {
|
|
sprintf (msgbuf, catgets (dtsearch_catd, MS_boolsrch, 15,
|
|
"%s Your query requires retrieving every\n"
|
|
"document in the database that does not have any of\n"
|
|
"your query words. This type of search may take an\n"
|
|
"unusually long time."),
|
|
PROGNAME"1536");
|
|
DtSearchAddMessage (msgbuf);
|
|
}
|
|
|
|
if (debugging_boolsrch)
|
|
fflush (aa_stderr);
|
|
|
|
/* Searches may take a long time. To allow gui to put a
|
|
* a 'working' dialog box and a 'cancel' button,
|
|
* we pass execution to workprocs.
|
|
* If user cannot cancel search no matter how
|
|
* long it may take, we call each of the subsequent
|
|
* workproc functions directly from here.
|
|
* Otherwise they will themselves setup each
|
|
* subsequent call to usrblk.workproc(), as long as
|
|
* the previous call returns OE_SEARCHING and the user
|
|
* hasn't pushed USR_STOPSRCH.
|
|
*/
|
|
usrblk.workproc = read_stem_bitvec_WK;
|
|
save_stemno = 0; /* global arg for first workproc */
|
|
usrblk.workproc(); /* direct call to first workproc */
|
|
|
|
if ((usrblk.flags & USR_NO_ITERATE) != 0 &&
|
|
(usrblk.debug & USRDBG_ITERATE) == 0) {
|
|
while (usrblk.retncode == OE_SEARCHING)
|
|
usrblk.workproc();
|
|
}
|
|
return;
|
|
|
|
} /* boolean_search() */
|
|
|
|
/************************** BOOLSRCH.C **********************/
|