1522 lines
50 KiB
C
1522 lines
50 KiB
C
/*
|
|
** 2009 Oct 23
|
|
**
|
|
** The author disclaims copyright to this source code. In place of
|
|
** a legal notice, here is a blessing:
|
|
**
|
|
** May you do good and not evil.
|
|
** May you find forgiveness for yourself and forgive others.
|
|
** May you share freely, never taking more than you give.
|
|
**
|
|
******************************************************************************
|
|
*/
|
|
|
|
#include "fts3Int.h"
|
|
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
|
|
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
|
|
/*
|
|
** Characters that may appear in the second argument to matchinfo().
|
|
*/
|
|
#define FTS3_MATCHINFO_NPHRASE   'p'   /* Output size: 1 value */
#define FTS3_MATCHINFO_NCOL      'c'   /* Output size: 1 value */
#define FTS3_MATCHINFO_NDOC      'n'   /* Output size: 1 value */
#define FTS3_MATCHINFO_AVGLENGTH 'a'   /* Output size: nCol values */
#define FTS3_MATCHINFO_LENGTH    'l'   /* Output size: nCol values */
#define FTS3_MATCHINFO_LCS       's'   /* Output size: nCol values */
#define FTS3_MATCHINFO_HITS      'x'   /* Output size: 3*nCol*nPhrase values */

/*
** Format string used for matchinfo() when the caller supplies no second
** argument.
*/
#define FTS3_MATCHINFO_DEFAULT   "pcx"
|
|
|
|
|
|
/*
|
|
** Used as an fts3ExprIterate() context when loading phrase doclists to
|
|
** Fts3Expr.aDoclist[]/nDoclist.
|
|
*/
|
|
typedef struct LoadDoclistCtx LoadDoclistCtx;
|
|
struct LoadDoclistCtx {
|
|
Fts3Cursor *pCsr; /* FTS3 Cursor */
|
|
int nPhrase; /* Number of phrases seen so far */
|
|
int nToken; /* Number of tokens seen so far */
|
|
};
|
|
|
|
/*
|
|
** The following types are used as part of the implementation of the
|
|
** fts3BestSnippet() routine.
|
|
*/
|
|
typedef struct SnippetIter SnippetIter;
|
|
typedef struct SnippetPhrase SnippetPhrase;
|
|
typedef struct SnippetFragment SnippetFragment;
|
|
|
|
struct SnippetIter {
|
|
Fts3Cursor *pCsr; /* Cursor snippet is being generated from */
|
|
int iCol; /* Extract snippet from this column */
|
|
int nSnippet; /* Requested snippet length (in tokens) */
|
|
int nPhrase; /* Number of phrases in query */
|
|
SnippetPhrase *aPhrase; /* Array of size nPhrase */
|
|
int iCurrent; /* First token of current snippet */
|
|
};
|
|
|
|
struct SnippetPhrase {
|
|
int nToken; /* Number of tokens in phrase */
|
|
char *pList; /* Pointer to start of phrase position list */
|
|
int iHead; /* Next value in position list */
|
|
char *pHead; /* Position list data following iHead */
|
|
int iTail; /* Next value in trailing position list */
|
|
char *pTail; /* Position list data following iTail */
|
|
};
|
|
|
|
struct SnippetFragment {
|
|
int iCol; /* Column snippet is extracted from */
|
|
int iPos; /* Index of first token in snippet */
|
|
u64 covered; /* Mask of query phrases covered */
|
|
u64 hlmask; /* Mask of snippet terms to highlight */
|
|
};
|
|
|
|
/*
|
|
** This type is used as an fts3ExprIterate() context object while
|
|
** accumulating the data returned by the matchinfo() function.
|
|
*/
|
|
typedef struct MatchInfo MatchInfo;
|
|
struct MatchInfo {
|
|
Fts3Cursor *pCursor; /* FTS3 Cursor */
|
|
int nCol; /* Number of columns in table */
|
|
int nPhrase; /* Number of matchable phrases in query */
|
|
sqlite3_int64 nDoc; /* Number of docs in database */
|
|
u32 *aMatchinfo; /* Pre-allocated buffer */
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
** The snippet() and offsets() functions both return text values. An instance
|
|
** of the following structure is used to accumulate those values while the
|
|
** functions are running. See fts3StringAppend() for details.
|
|
*/
|
|
typedef struct StrBuffer {
  char *z;                        /* Buffer holding the accumulated string */
  int n;                          /* Bytes used in z, not counting the nul */
  int nAlloc;                     /* Bytes allocated at z */
} StrBuffer;
|
|
|
|
|
|
/*
|
|
** This function is used to help iterate through a position-list. A position
|
|
** list is a list of unique integers, sorted from smallest to largest. Each
|
|
** element of the list is represented by an FTS3 varint that takes the value
|
|
** of the difference between the current element and the previous one plus
|
|
** two. For example, to store the position-list:
|
|
**
|
|
** 4 9 113
|
|
**
|
|
** the three varints:
|
|
**
|
|
** 6 7 106
|
|
**
|
|
** are encoded.
|
|
**
|
|
** When this function is called, *pp points to the start of an element of
|
|
** the list. *piPos contains the value of the previous entry in the list.
|
|
** After it returns, *piPos contains the value of the next element of the
|
|
** list and *pp is advanced to the following varint.
|
|
*/
|
|
static void fts3GetDeltaPosition(char **pp, int *piPos){
  int iEncoded;                   /* Decoded varint: (delta to next entry)+2 */
  int nByte;                      /* Size of the varint just read, in bytes */

  nByte = sqlite3Fts3GetVarint32(*pp, &iEncoded);
  *pp = *pp + nByte;

  /* Undo the +2 bias applied when the list was encoded. */
  *piPos = *piPos + (iEncoded-2);
}
|
|
|
|
/*
|
|
** Helper function for fts3ExprIterate() (see below).
|
|
*/
|
|
static int fts3ExprIterate2(
|
|
Fts3Expr *pExpr, /* Expression to iterate phrases of */
|
|
int *piPhrase, /* Pointer to phrase counter */
|
|
int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
|
|
void *pCtx /* Second argument to pass to callback */
|
|
){
|
|
int rc; /* Return code */
|
|
int eType = pExpr->eType; /* Type of expression node pExpr */
|
|
|
|
if( eType!=FTSQUERY_PHRASE ){
|
|
assert( pExpr->pLeft && pExpr->pRight );
|
|
rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx);
|
|
if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){
|
|
rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx);
|
|
}
|
|
}else{
|
|
rc = x(pExpr, *piPhrase, pCtx);
|
|
(*piPhrase)++;
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Iterate through all phrase nodes in an FTS3 query, except those that
|
|
** are part of a sub-tree that is the right-hand-side of a NOT operator.
|
|
** For each phrase node found, the supplied callback function is invoked.
|
|
**
|
|
** If the callback function returns anything other than SQLITE_OK,
|
|
** the iteration is abandoned and the error code returned immediately.
|
|
** Otherwise, SQLITE_OK is returned after a callback has been made for
|
|
** all eligible phrase nodes.
|
|
*/
|
|
static int fts3ExprIterate(
|
|
Fts3Expr *pExpr, /* Expression to iterate phrases of */
|
|
int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
|
|
void *pCtx /* Second argument to pass to callback */
|
|
){
|
|
int iPhrase = 0; /* Variable used as the phrase counter */
|
|
return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx);
|
|
}
|
|
|
|
/*
|
|
** This is an fts3ExprIterate() callback used while loading the doclists
|
|
** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also
|
|
** fts3ExprLoadDoclists().
|
|
*/
|
|
static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
  LoadDoclistCtx *pLoad = (LoadDoclistCtx *)ctx;

  UNUSED_PARAMETER(iPhrase);

  /* Account for this phrase and for the tokens it contains. */
  pLoad->nPhrase += 1;
  pLoad->nToken += pExpr->pPhrase->nToken;

  return SQLITE_OK;
}
|
|
|
|
/*
|
|
** Load the doclists for each phrase in the query associated with FTS3 cursor
|
|
** pCsr.
|
|
**
|
|
** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable
|
|
** phrases in the expression (all phrases except those directly or
|
|
** indirectly descended from the right-hand-side of a NOT operator). If
|
|
** pnToken is not NULL, then it is set to the number of tokens in all
|
|
** matchable phrases of the expression.
|
|
*/
|
|
static int fts3ExprLoadDoclists(
|
|
Fts3Cursor *pCsr, /* Fts3 cursor for current query */
|
|
int *pnPhrase, /* OUT: Number of phrases in query */
|
|
int *pnToken /* OUT: Number of tokens in query */
|
|
){
|
|
int rc; /* Return Code */
|
|
LoadDoclistCtx sCtx = {0,0,0}; /* Context for fts3ExprIterate() */
|
|
sCtx.pCsr = pCsr;
|
|
rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx);
|
|
if( pnPhrase ) *pnPhrase = sCtx.nPhrase;
|
|
if( pnToken ) *pnToken = sCtx.nToken;
|
|
return rc;
|
|
}
|
|
|
|
/*
** fts3ExprIterate() callback used by fts3ExprPhraseCount(). Argument ctx
** points to an int, which is incremented once per invocation.
*/
static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
  int *pnPhrase = (int *)ctx;
  UNUSED_PARAMETER(pExpr);
  UNUSED_PARAMETER(iPhrase);
  *pnPhrase += 1;
  return SQLITE_OK;
}
|
|
/*
** Return the number of phrase nodes that fts3ExprIterate() visits in
** expression pExpr.
*/
static int fts3ExprPhraseCount(Fts3Expr *pExpr){
  int nCount = 0;                 /* Incremented by the callback */
  (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nCount);
  return nCount;
}
|
|
|
|
/*
|
|
** Advance the position list iterator specified by the first two
|
|
** arguments so that it points to the first element with a value greater
|
|
** than or equal to parameter iNext.
|
|
*/
|
|
static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){
  char *p = *ppIter;              /* Read pointer into the position list */
  int iPos;                       /* Current position value */

  /* A NULL pointer means the iterator is already exhausted. */
  if( p==0 ) return;

  iPos = *piIter;
  while( iPos<iNext ){
    if( (*p & 0xFE)==0 ){
      /* Next byte is 0x00 or 0x01: the list has ended. Mark the iterator
      ** as exhausted. */
      p = 0;
      iPos = -1;
      break;
    }
    fts3GetDeltaPosition(&p, &iPos);
  }

  *ppIter = p;
  *piIter = iPos;
}
|
|
|
|
/*
|
|
** Advance the snippet iterator to the next candidate snippet.
|
|
*/
|
|
static int fts3SnippetNextCandidate(SnippetIter *pIter){
  int iPhrase;                    /* Index into pIter->aPhrase[] */
  int iFirstHead;                 /* Smallest head position of any phrase */
  int iStart;                     /* First token of the new candidate */

  if( pIter->iCurrent<0 ){
    /* Freshly initialized iterator. The first candidate always begins at
    ** token 0, even if it scores nothing. Move every head cursor to its
    ** first position at or beyond nSnippet. */
    pIter->iCurrent = 0;
    for(iPhrase=0; iPhrase<pIter->nPhrase; iPhrase++){
      SnippetPhrase *p = &pIter->aPhrase[iPhrase];
      fts3SnippetAdvance(&p->pHead, &p->iHead, pIter->nSnippet);
    }
    return 0;
  }

  /* Find the smallest head position among phrases not yet exhausted. */
  iFirstHead = 0x7FFFFFFF;
  for(iPhrase=0; iPhrase<pIter->nPhrase; iPhrase++){
    SnippetPhrase *p = &pIter->aPhrase[iPhrase];
    if( p->pHead==0 ) continue;
    if( p->iHead<iFirstHead ){
      iFirstHead = p->iHead;
    }
  }

  /* Every head cursor is exhausted: no further candidates. */
  if( iFirstHead==0x7FFFFFFF ){
    return 1;
  }

  /* The next candidate is the window whose final token is iFirstHead. */
  iStart = iFirstHead - pIter->nSnippet + 1;
  pIter->iCurrent = iStart;
  for(iPhrase=0; iPhrase<pIter->nPhrase; iPhrase++){
    SnippetPhrase *p = &pIter->aPhrase[iPhrase];
    fts3SnippetAdvance(&p->pHead, &p->iHead, iFirstHead+1);
    fts3SnippetAdvance(&p->pTail, &p->iTail, iStart);
  }

  return 0;
}
|
|
|
|
/*
|
|
** Retrieve information about the current candidate snippet of snippet
|
|
** iterator pIter.
|
|
*/
|
|
static void fts3SnippetDetails(
|
|
SnippetIter *pIter, /* Snippet iterator */
|
|
u64 mCovered, /* Bitmask of phrases already covered */
|
|
int *piToken, /* OUT: First token of proposed snippet */
|
|
int *piScore, /* OUT: "Score" for this snippet */
|
|
u64 *pmCover, /* OUT: Bitmask of phrases covered */
|
|
u64 *pmHighlight /* OUT: Bitmask of terms to highlight */
|
|
){
|
|
int iStart = pIter->iCurrent; /* First token of snippet */
|
|
int iScore = 0; /* Score of this snippet */
|
|
int i; /* Loop counter */
|
|
u64 mCover = 0; /* Mask of phrases covered by this snippet */
|
|
u64 mHighlight = 0; /* Mask of tokens to highlight in snippet */
|
|
|
|
for(i=0; i<pIter->nPhrase; i++){
|
|
SnippetPhrase *pPhrase = &pIter->aPhrase[i];
|
|
if( pPhrase->pTail ){
|
|
char *pCsr = pPhrase->pTail;
|
|
int iCsr = pPhrase->iTail;
|
|
|
|
while( iCsr<(iStart+pIter->nSnippet) ){
|
|
int j;
|
|
u64 mPhrase = (u64)1 << i;
|
|
u64 mPos = (u64)1 << (iCsr - iStart);
|
|
assert( iCsr>=iStart );
|
|
if( (mCover|mCovered)&mPhrase ){
|
|
iScore++;
|
|
}else{
|
|
iScore += 1000;
|
|
}
|
|
mCover |= mPhrase;
|
|
|
|
for(j=0; j<pPhrase->nToken; j++){
|
|
mHighlight |= (mPos>>j);
|
|
}
|
|
|
|
if( 0==(*pCsr & 0x0FE) ) break;
|
|
fts3GetDeltaPosition(&pCsr, &iCsr);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Set the output variables before returning. */
|
|
*piToken = iStart;
|
|
*piScore = iScore;
|
|
*pmCover = mCover;
|
|
*pmHighlight = mHighlight;
|
|
}
|
|
|
|
/*
|
|
** This function is an fts3ExprIterate() callback used by fts3BestSnippet().
|
|
** Each invocation populates an element of the SnippetIter.aPhrase[] array.
|
|
*/
|
|
static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){
  SnippetIter *pIter = (SnippetIter *)ctx;
  SnippetPhrase *pSlot = &pIter->aPhrase[iPhrase];
  char *pList;                    /* Position list for this phrase/column */
  int rc;                         /* Return code */

  pSlot->nToken = pExpr->pPhrase->nToken;
  rc = sqlite3Fts3EvalPhrasePoslist(pIter->pCsr, pExpr, pIter->iCol, &pList);
  assert( rc==SQLITE_OK || pList==0 );

  if( pList==0 ){
    /* No position list available. The slot is expected to still hold the
    ** all-zero state unless an error occurred. */
    assert( rc!=SQLITE_OK || (
      pSlot->pList==0 && pSlot->pHead==0 && pSlot->pTail==0
    ));
    return rc;
  }

  {
    /* Decode the first position and start both cursors just after it. */
    int iFirst = 0;
    char *pAfterFirst = pList;
    fts3GetDeltaPosition(&pAfterFirst, &iFirst);
    assert( iFirst>=0 );

    pSlot->pList = pList;
    pSlot->pHead = pAfterFirst;
    pSlot->iHead = iFirst;
    pSlot->pTail = pAfterFirst;
    pSlot->iTail = iFirst;
  }

  return rc;
}
|
|
|
|
/*
|
|
** Select the fragment of text consisting of nFragment contiguous tokens
|
|
** from column iCol that represent the "best" snippet. The best snippet
|
|
** is the snippet with the highest score, where scores are calculated
|
|
** by adding:
|
|
**
|
|
** (a) +1 point for each occurrence of a matchable phrase in the snippet.
|
|
**
|
|
** (b) +1000 points for the first occurrence of each matchable phrase in
|
|
** the snippet for which the corresponding mCovered bit is not set.
|
|
**
|
|
** The selected snippet parameters are stored in structure *pFragment before
|
|
** returning. The score of the selected snippet is stored in *piScore
|
|
** before returning.
|
|
*/
|
|
static int fts3BestSnippet(
|
|
int nSnippet, /* Desired snippet length */
|
|
Fts3Cursor *pCsr, /* Cursor to create snippet for */
|
|
int iCol, /* Index of column to create snippet from */
|
|
u64 mCovered, /* Mask of phrases already covered */
|
|
u64 *pmSeen, /* IN/OUT: Mask of phrases seen */
|
|
SnippetFragment *pFragment, /* OUT: Best snippet found */
|
|
int *piScore /* OUT: Score of snippet pFragment */
|
|
){
|
|
int rc; /* Return Code */
|
|
int nList; /* Number of phrases in expression */
|
|
SnippetIter sIter; /* Iterates through snippet candidates */
|
|
int nByte; /* Number of bytes of space to allocate */
|
|
int iBestScore = -1; /* Best snippet score found so far */
|
|
int i; /* Loop counter */
|
|
|
|
memset(&sIter, 0, sizeof(sIter));
|
|
|
|
/* Iterate through the phrases in the expression to count them. The same
|
|
** callback makes sure the doclists are loaded for each phrase.
|
|
*/
|
|
rc = fts3ExprLoadDoclists(pCsr, &nList, 0);
|
|
if( rc!=SQLITE_OK ){
|
|
return rc;
|
|
}
|
|
|
|
/* Now that it is known how many phrases there are, allocate and zero
|
|
** the required space using malloc().
|
|
*/
|
|
nByte = sizeof(SnippetPhrase) * nList;
|
|
sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
|
|
if( !sIter.aPhrase ){
|
|
return SQLITE_NOMEM;
|
|
}
|
|
memset(sIter.aPhrase, 0, nByte);
|
|
|
|
/* Initialize the contents of the SnippetIter object. Then iterate through
|
|
** the set of phrases in the expression to populate the aPhrase[] array.
|
|
*/
|
|
sIter.pCsr = pCsr;
|
|
sIter.iCol = iCol;
|
|
sIter.nSnippet = nSnippet;
|
|
sIter.nPhrase = nList;
|
|
sIter.iCurrent = -1;
|
|
(void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);
|
|
|
|
/* Set the *pmSeen output variable. */
|
|
for(i=0; i<nList; i++){
|
|
if( sIter.aPhrase[i].pHead ){
|
|
*pmSeen |= (u64)1 << i;
|
|
}
|
|
}
|
|
|
|
/* Loop through all candidate snippets. Store the best snippet in
|
|
** *pFragment. Store its associated 'score' in iBestScore.
|
|
*/
|
|
pFragment->iCol = iCol;
|
|
while( !fts3SnippetNextCandidate(&sIter) ){
|
|
int iPos;
|
|
int iScore;
|
|
u64 mCover;
|
|
u64 mHighlight;
|
|
fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight);
|
|
assert( iScore>=0 );
|
|
if( iScore>iBestScore ){
|
|
pFragment->iPos = iPos;
|
|
pFragment->hlmask = mHighlight;
|
|
pFragment->covered = mCover;
|
|
iBestScore = iScore;
|
|
}
|
|
}
|
|
|
|
sqlite3_free(sIter.aPhrase);
|
|
*piScore = iBestScore;
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
|
|
/*
|
|
** Append a string to the string-buffer passed as the first argument.
|
|
**
|
|
** If nAppend is negative, then the length of the string zAppend is
|
|
** determined using strlen().
|
|
*/
|
|
static int fts3StringAppend(
|
|
StrBuffer *pStr, /* Buffer to append to */
|
|
const char *zAppend, /* Pointer to data to append to buffer */
|
|
int nAppend /* Size of zAppend in bytes (or -1) */
|
|
){
|
|
if( nAppend<0 ){
|
|
nAppend = (int)strlen(zAppend);
|
|
}
|
|
|
|
/* If there is insufficient space allocated at StrBuffer.z, use realloc()
|
|
** to grow the buffer until so that it is big enough to accomadate the
|
|
** appended data.
|
|
*/
|
|
if( pStr->n+nAppend+1>=pStr->nAlloc ){
|
|
int nAlloc = pStr->nAlloc+nAppend+100;
|
|
char *zNew = sqlite3_realloc(pStr->z, nAlloc);
|
|
if( !zNew ){
|
|
return SQLITE_NOMEM;
|
|
}
|
|
pStr->z = zNew;
|
|
pStr->nAlloc = nAlloc;
|
|
}
|
|
assert( pStr->z!=0 && (pStr->nAlloc >= pStr->n+nAppend+1) );
|
|
|
|
/* Append the data to the string buffer. */
|
|
memcpy(&pStr->z[pStr->n], zAppend, nAppend);
|
|
pStr->n += nAppend;
|
|
pStr->z[pStr->n] = '\0';
|
|
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
/*
|
|
** The fts3BestSnippet() function often selects snippets that end with a
|
|
** query term. That is, the final term of the snippet is always a term
|
|
** that requires highlighting. For example, if 'X' is a highlighted term
|
|
** and '.' is a non-highlighted term, BestSnippet() may select:
|
|
**
|
|
** ........X.....X
|
|
**
|
|
** This function "shifts" the beginning of the snippet forward in the
|
|
** document so that there are approximately the same number of
|
|
** non-highlighted terms to the right of the final highlighted term as there
|
|
** are to the left of the first highlighted term. For example, to this:
|
|
**
|
|
** ....X.....X....
|
|
**
|
|
** This is done as part of extracting the snippet text, not when selecting
|
|
** the snippet. Snippet selection is done based on doclists only, so there
|
|
** is no way for fts3BestSnippet() to know whether or not the document
|
|
** actually contains terms that follow the final highlighted term.
|
|
*/
|
|
static int fts3SnippetShift(
|
|
Fts3Table *pTab, /* FTS3 table snippet comes from */
|
|
int iLangid, /* Language id to use in tokenizing */
|
|
int nSnippet, /* Number of tokens desired for snippet */
|
|
const char *zDoc, /* Document text to extract snippet from */
|
|
int nDoc, /* Size of buffer zDoc in bytes */
|
|
int *piPos, /* IN/OUT: First token of snippet */
|
|
u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */
|
|
){
|
|
u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */
|
|
|
|
if( hlmask ){
|
|
int nLeft; /* Tokens to the left of first highlight */
|
|
int nRight; /* Tokens to the right of last highlight */
|
|
int nDesired; /* Ideal number of tokens to shift forward */
|
|
|
|
for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++);
|
|
for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++);
|
|
nDesired = (nLeft-nRight)/2;
|
|
|
|
/* Ideally, the start of the snippet should be pushed forward in the
|
|
** document nDesired tokens. This block checks if there are actually
|
|
** nDesired tokens to the right of the snippet. If so, *piPos and
|
|
** *pHlMask are updated to shift the snippet nDesired tokens to the
|
|
** right. Otherwise, the snippet is shifted by the number of tokens
|
|
** available.
|
|
*/
|
|
if( nDesired>0 ){
|
|
int nShift; /* Number of tokens to shift snippet by */
|
|
int iCurrent = 0; /* Token counter */
|
|
int rc; /* Return Code */
|
|
sqlite3_tokenizer_module *pMod;
|
|
sqlite3_tokenizer_cursor *pC;
|
|
pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
|
|
|
|
/* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
|
|
** or more tokens in zDoc/nDoc.
|
|
*/
|
|
rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC);
|
|
if( rc!=SQLITE_OK ){
|
|
return rc;
|
|
}
|
|
while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
|
|
const char *ZDUMMY; int DUMMY1 = 0, DUMMY2 = 0, DUMMY3 = 0;
|
|
rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
|
|
}
|
|
pMod->xClose(pC);
|
|
if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }
|
|
|
|
nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet;
|
|
assert( nShift<=nDesired );
|
|
if( nShift>0 ){
|
|
*piPos += nShift;
|
|
*pHlmask = hlmask >> nShift;
|
|
}
|
|
}
|
|
}
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
/*
|
|
** Extract the snippet text for fragment pFragment from cursor pCsr and
|
|
** append it to string buffer pOut.
|
|
*/
|
|
static int fts3SnippetText(
|
|
Fts3Cursor *pCsr, /* FTS3 Cursor */
|
|
SnippetFragment *pFragment, /* Snippet to extract */
|
|
int iFragment, /* Fragment number */
|
|
int isLast, /* True for final fragment in snippet */
|
|
int nSnippet, /* Number of tokens in extracted snippet */
|
|
const char *zOpen, /* String inserted before highlighted term */
|
|
const char *zClose, /* String inserted after highlighted term */
|
|
const char *zEllipsis, /* String inserted between snippets */
|
|
StrBuffer *pOut /* Write output here */
|
|
){
|
|
Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
|
|
int rc; /* Return code */
|
|
const char *zDoc; /* Document text to extract snippet from */
|
|
int nDoc; /* Size of zDoc in bytes */
|
|
int iCurrent = 0; /* Current token number of document */
|
|
int iEnd = 0; /* Byte offset of end of current token */
|
|
int isShiftDone = 0; /* True after snippet is shifted */
|
|
int iPos = pFragment->iPos; /* First token of snippet */
|
|
u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */
|
|
int iCol = pFragment->iCol+1; /* Query column to extract text from */
|
|
sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
|
|
sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */
|
|
|
|
zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol);
|
|
if( zDoc==0 ){
|
|
if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){
|
|
return SQLITE_NOMEM;
|
|
}
|
|
return SQLITE_OK;
|
|
}
|
|
nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);
|
|
|
|
/* Open a token cursor on the document. */
|
|
pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
|
|
rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC);
|
|
if( rc!=SQLITE_OK ){
|
|
return rc;
|
|
}
|
|
|
|
while( rc==SQLITE_OK ){
|
|
const char *ZDUMMY; /* Dummy argument used with tokenizer */
|
|
int DUMMY1 = -1; /* Dummy argument used with tokenizer */
|
|
int iBegin = 0; /* Offset in zDoc of start of token */
|
|
int iFin = 0; /* Offset in zDoc of end of token */
|
|
int isHighlight = 0; /* True for highlighted terms */
|
|
|
|
/* Variable DUMMY1 is initialized to a negative value above. Elsewhere
|
|
** in the FTS code the variable that the third argument to xNext points to
|
|
** is initialized to zero before the first (*but not necessarily
|
|
** subsequent*) call to xNext(). This is done for a particular application
|
|
** that needs to know whether or not the tokenizer is being used for
|
|
** snippet generation or for some other purpose.
|
|
**
|
|
** Extreme care is required when writing code to depend on this
|
|
** initialization. It is not a documented part of the tokenizer interface.
|
|
** If a tokenizer is used directly by any code outside of FTS, this
|
|
** convention might not be respected. */
|
|
rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);
|
|
if( rc!=SQLITE_OK ){
|
|
if( rc==SQLITE_DONE ){
|
|
/* Special case - the last token of the snippet is also the last token
|
|
** of the column. Append any punctuation that occurred between the end
|
|
** of the previous token and the end of the document to the output.
|
|
** Then break out of the loop. */
|
|
rc = fts3StringAppend(pOut, &zDoc[iEnd], -1);
|
|
}
|
|
break;
|
|
}
|
|
if( iCurrent<iPos ){ continue; }
|
|
|
|
if( !isShiftDone ){
|
|
int n = nDoc - iBegin;
|
|
rc = fts3SnippetShift(
|
|
pTab, pCsr->iLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask
|
|
);
|
|
isShiftDone = 1;
|
|
|
|
/* Now that the shift has been done, check if the initial "..." are
|
|
** required. They are required if (a) this is not the first fragment,
|
|
** or (b) this fragment does not begin at position 0 of its column.
|
|
*/
|
|
if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){
|
|
rc = fts3StringAppend(pOut, zEllipsis, -1);
|
|
}
|
|
if( rc!=SQLITE_OK || iCurrent<iPos ) continue;
|
|
}
|
|
|
|
if( iCurrent>=(iPos+nSnippet) ){
|
|
if( isLast ){
|
|
rc = fts3StringAppend(pOut, zEllipsis, -1);
|
|
}
|
|
break;
|
|
}
|
|
|
|
/* Set isHighlight to true if this term should be highlighted. */
|
|
isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0;
|
|
|
|
if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd);
|
|
if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1);
|
|
if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin);
|
|
if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1);
|
|
|
|
iEnd = iFin;
|
|
}
|
|
|
|
pMod->xClose(pC);
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** This function is used to count the entries in a column-list (a
|
|
** delta-encoded list of term offsets within a single column of a single
|
|
** row). When this function is called, *ppCollist should point to the
|
|
** beginning of the first varint in the column-list (the varint that
|
|
** contains the position of the first matching term in the column data).
|
|
** Before returning, *ppCollist is set to point to the first byte after
|
|
** the last varint in the column-list (either the 0x00 signifying the end
|
|
** of the position-list, or the 0x01 that precedes the column number of
|
|
** the next column in the position-list).
|
|
**
|
|
** The number of elements in the column-list is returned.
|
|
*/
|
|
static int fts3ColumnlistCount(char **ppCollist){
  char *p = *ppCollist;           /* Read pointer */
  char bCont = 0;                 /* 0x80 bit of the previous byte */
  int nCount = 0;                 /* Number of complete varints seen */

  /* Stop at a 0x00 or 0x01 byte, but only if that byte is not the tail of
  ** a multi-byte varint (i.e. the previous byte had no continuation bit). */
  for(;;){
    if( (0xFE & (*p | bCont))==0 ) break;
    bCont = *p & 0x80;
    p++;
    /* A byte without the continuation bit completes one varint. */
    if( bCont==0 ){
      nCount++;
    }
  }

  *ppCollist = p;
  return nCount;
}
|
|
|
|
/*
|
|
** fts3ExprIterate() callback used to collect the "global" matchinfo stats
|
|
** for a single query.
|
|
**
|
|
** fts3ExprIterate() callback to load the 'global' elements of a
|
|
** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements
|
|
** of the matchinfo array that are constant for all rows returned by the
|
|
** current query.
|
|
**
|
|
** Argument pCtx is actually a pointer to a struct of type MatchInfo. This
|
|
** function populates Matchinfo.aMatchinfo[] as follows:
|
|
**
|
|
** for(iCol=0; iCol<nCol; iCol++){
|
|
** aMatchinfo[3*iPhrase*nCol + 3*iCol + 1] = X;
|
|
** aMatchinfo[3*iPhrase*nCol + 3*iCol + 2] = Y;
|
|
** }
|
|
**
|
|
** where X is the number of matches for phrase iPhrase in column iCol of all
|
|
** rows of the table. Y is the number of rows for which column iCol contains
|
|
** at least one instance of phrase iPhrase.
|
|
**
|
|
** If the phrase pExpr consists entirely of deferred tokens, then all X and
|
|
** Y values are set to nDoc, where nDoc is the number of documents in the
|
|
** file system. This is done because the full-text index doclist is required
|
|
** to calculate these values properly, and the full-text index doclist is
|
|
** not available for deferred tokens.
|
|
*/
|
|
static int fts3ExprGlobalHitsCb(
|
|
Fts3Expr *pExpr, /* Phrase expression node */
|
|
int iPhrase, /* Phrase number (numbered from zero) */
|
|
void *pCtx /* Pointer to MatchInfo structure */
|
|
){
|
|
MatchInfo *p = (MatchInfo *)pCtx;
|
|
return sqlite3Fts3EvalPhraseStats(
|
|
p->pCursor, pExpr, &p->aMatchinfo[3*iPhrase*p->nCol]
|
|
);
|
|
}
|
|
|
|
/*
|
|
** fts3ExprIterate() callback used to collect the "local" part of the
|
|
** FTS3_MATCHINFO_HITS array. The local stats are those elements of the
|
|
** array that are different for each row returned by the query.
|
|
*/
|
|
static int fts3ExprLocalHitsCb(
|
|
Fts3Expr *pExpr, /* Phrase expression node */
|
|
int iPhrase, /* Phrase number */
|
|
void *pCtx /* Pointer to MatchInfo structure */
|
|
){
|
|
int rc = SQLITE_OK;
|
|
MatchInfo *p = (MatchInfo *)pCtx;
|
|
int iStart = iPhrase * p->nCol * 3;
|
|
int i;
|
|
|
|
for(i=0; i<p->nCol && rc==SQLITE_OK; i++){
|
|
char *pCsr;
|
|
rc = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i, &pCsr);
|
|
if( pCsr ){
|
|
p->aMatchinfo[iStart+i*3] = fts3ColumnlistCount(&pCsr);
|
|
}else{
|
|
p->aMatchinfo[iStart+i*3] = 0;
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int fts3MatchinfoCheck(
|
|
Fts3Table *pTab,
|
|
char cArg,
|
|
char **pzErr
|
|
){
|
|
if( (cArg==FTS3_MATCHINFO_NPHRASE)
|
|
|| (cArg==FTS3_MATCHINFO_NCOL)
|
|
|| (cArg==FTS3_MATCHINFO_NDOC && pTab->bFts4)
|
|
|| (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bFts4)
|
|
|| (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize)
|
|
|| (cArg==FTS3_MATCHINFO_LCS)
|
|
|| (cArg==FTS3_MATCHINFO_HITS)
|
|
){
|
|
return SQLITE_OK;
|
|
}
|
|
*pzErr = sqlite3_mprintf("unrecognized matchinfo request: %c", cArg);
|
|
return SQLITE_ERROR;
|
|
}
|
|
|
|
/*
** Return the number of integers that matchinfo() request character cArg
** contributes to the output array, given the column and phrase counts
** stored in pInfo.
*/
static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){
  /* Requests that produce a single value. */
  if( cArg==FTS3_MATCHINFO_NDOC
   || cArg==FTS3_MATCHINFO_NPHRASE
   || cArg==FTS3_MATCHINFO_NCOL
  ){
    return 1;
  }

  /* Requests that produce one value per column. */
  if( cArg==FTS3_MATCHINFO_AVGLENGTH
   || cArg==FTS3_MATCHINFO_LENGTH
   || cArg==FTS3_MATCHINFO_LCS
  ){
    return pInfo->nCol;
  }

  /* Anything else must be the hits request: three values for every
  ** (phrase, column) pair. */
  assert( cArg==FTS3_MATCHINFO_HITS );
  return pInfo->nCol * pInfo->nPhrase * 3;
}
|
|
|
|
static int fts3MatchinfoSelectDoctotal(
|
|
Fts3Table *pTab,
|
|
sqlite3_stmt **ppStmt,
|
|
sqlite3_int64 *pnDoc,
|
|
const char **paLen
|
|
){
|
|
sqlite3_stmt *pStmt;
|
|
const char *a;
|
|
sqlite3_int64 nDoc;
|
|
|
|
if( !*ppStmt ){
|
|
int rc = sqlite3Fts3SelectDoctotal(pTab, ppStmt);
|
|
if( rc!=SQLITE_OK ) return rc;
|
|
}
|
|
pStmt = *ppStmt;
|
|
assert( sqlite3_data_count(pStmt)==1 );
|
|
|
|
a = sqlite3_column_blob(pStmt, 0);
|
|
a += sqlite3Fts3GetVarint(a, &nDoc);
|
|
if( nDoc==0 ) return FTS_CORRUPT_VTAB;
|
|
*pnDoc = (u32)nDoc;
|
|
|
|
if( paLen ) *paLen = a;
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
/*
|
|
** An instance of the following structure is used to store state while
|
|
** iterating through a multi-column position-list corresponding to the
|
|
** hits for a single phrase on a single row in order to calculate the
|
|
** values for a matchinfo() FTS3_MATCHINFO_LCS request.
|
|
*/
|
|
typedef struct LcsIterator LcsIterator;
|
|
struct LcsIterator {
|
|
Fts3Expr *pExpr; /* Pointer to phrase expression */
|
|
int iPosOffset; /* Tokens count up to end of this phrase */
|
|
char *pRead; /* Cursor used to iterate through aDoclist */
|
|
int iPos; /* Current position */
|
|
};
|
|
|
|
/*
** Sentinel value intended to mark an LcsIterator that has finished
** iterating through all offsets for all columns.
**
** NOTE(review): the LcsIterator structure has no iCol field, and no use of
** this macro is visible in this part of the file - confirm whether it is
** still required. The definition deliberately has no trailing semicolon so
** that it can be used inside expressions.
*/
#define LCS_ITERATOR_FINISHED 0x7FFFFFFF
|
|
|
|
static int fts3MatchinfoLcsCb(
|
|
Fts3Expr *pExpr, /* Phrase expression node */
|
|
int iPhrase, /* Phrase number (numbered from zero) */
|
|
void *pCtx /* Pointer to MatchInfo structure */
|
|
){
|
|
LcsIterator *aIter = (LcsIterator *)pCtx;
|
|
aIter[iPhrase].pExpr = pExpr;
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
/*
|
|
** Advance the iterator passed as an argument to the next position. Return
|
|
** 1 if the iterator is at EOF or if it now points to the start of the
|
|
** position list for the next column.
|
|
*/
|
|
static int fts3LcsIteratorAdvance(LcsIterator *pIter){
  char *pNext = pIter->pRead;     /* Read pointer */
  sqlite3_int64 iDelta;           /* Varint read from the list */

  pNext += sqlite3Fts3GetVarint(pNext, &iDelta);

  if( iDelta!=0 && iDelta!=1 ){
    /* Ordinary entry: stored as (delta + 2). */
    pIter->iPos += (int)(iDelta-2);
    pIter->pRead = pNext;
    return 0;
  }

  /* 0 or 1: the list for this column has ended. A NULL read pointer
  ** marks the iterator as finished. */
  pIter->pRead = 0;
  return 1;
}
|
|
|
|
/*
|
|
** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag.
|
|
**
|
|
** If the call is successful, the longest-common-substring lengths for each
|
|
** column are written into the first nCol elements of the pInfo->aMatchinfo[]
|
|
** array before returning. SQLITE_OK is returned in this case.
|
|
**
|
|
** Otherwise, if an error occurs, an SQLite error code is returned and the
|
|
** data written to the first nCol elements of pInfo->aMatchinfo[] is
|
|
** undefined.
|
|
*/
|
|
static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){
|
|
LcsIterator *aIter;
|
|
int i;
|
|
int iCol;
|
|
int nToken = 0;
|
|
|
|
/* Allocate and populate the array of LcsIterator objects. The array
|
|
** contains one element for each matchable phrase in the query.
|
|
**/
|
|
aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase);
|
|
if( !aIter ) return SQLITE_NOMEM;
|
|
memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase);
|
|
(void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter);
|
|
|
|
for(i=0; i<pInfo->nPhrase; i++){
|
|
LcsIterator *pIter = &aIter[i];
|
|
nToken -= pIter->pExpr->pPhrase->nToken;
|
|
pIter->iPosOffset = nToken;
|
|
}
|
|
|
|
for(iCol=0; iCol<pInfo->nCol; iCol++){
|
|
int nLcs = 0; /* LCS value for this column */
|
|
int nLive = 0; /* Number of iterators in aIter not at EOF */
|
|
|
|
for(i=0; i<pInfo->nPhrase; i++){
|
|
int rc;
|
|
LcsIterator *pIt = &aIter[i];
|
|
rc = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol, &pIt->pRead);
|
|
if( rc!=SQLITE_OK ) return rc;
|
|
if( pIt->pRead ){
|
|
pIt->iPos = pIt->iPosOffset;
|
|
fts3LcsIteratorAdvance(&aIter[i]);
|
|
nLive++;
|
|
}
|
|
}
|
|
|
|
while( nLive>0 ){
|
|
LcsIterator *pAdv = 0; /* The iterator to advance by one position */
|
|
int nThisLcs = 0; /* LCS for the current iterator positions */
|
|
|
|
for(i=0; i<pInfo->nPhrase; i++){
|
|
LcsIterator *pIter = &aIter[i];
|
|
if( pIter->pRead==0 ){
|
|
/* This iterator is already at EOF for this column. */
|
|
nThisLcs = 0;
|
|
}else{
|
|
if( pAdv==0 || pIter->iPos<pAdv->iPos ){
|
|
pAdv = pIter;
|
|
}
|
|
if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){
|
|
nThisLcs++;
|
|
}else{
|
|
nThisLcs = 1;
|
|
}
|
|
if( nThisLcs>nLcs ) nLcs = nThisLcs;
|
|
}
|
|
}
|
|
if( fts3LcsIteratorAdvance(pAdv) ) nLive--;
|
|
}
|
|
|
|
pInfo->aMatchinfo[iCol] = nLcs;
|
|
}
|
|
|
|
sqlite3_free(aIter);
|
|
return SQLITE_OK;
|
|
}
|
|
|
|
/*
|
|
** Populate the buffer pInfo->aMatchinfo[] with an array of integers to
|
|
** be returned by the matchinfo() function. Argument zArg contains the
|
|
** format string passed as the second argument to matchinfo (or the
|
|
** default value "pcx" if no second argument was specified). The format
|
|
** string has already been validated and the pInfo->aMatchinfo[] array
|
|
** is guaranteed to be large enough for the output.
|
|
**
|
|
** If bGlobal is true, then populate all fields of the matchinfo() output.
|
|
** If it is false, then assume that those fields that do not change between
|
|
** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS)
|
|
** have already been populated.
|
|
**
|
|
** Return SQLITE_OK if successful, or an SQLite error code if an error
|
|
** occurs. If a value other than SQLITE_OK is returned, the state the
|
|
** pInfo->aMatchinfo[] buffer is left in is undefined.
|
|
*/
|
|
static int fts3MatchinfoValues(
|
|
Fts3Cursor *pCsr, /* FTS3 cursor object */
|
|
int bGlobal, /* True to grab the global stats */
|
|
MatchInfo *pInfo, /* Matchinfo context object */
|
|
const char *zArg /* Matchinfo format string */
|
|
){
|
|
int rc = SQLITE_OK;
|
|
int i;
|
|
Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
|
|
sqlite3_stmt *pSelect = 0;
|
|
|
|
for(i=0; rc==SQLITE_OK && zArg[i]; i++){
|
|
|
|
switch( zArg[i] ){
|
|
case FTS3_MATCHINFO_NPHRASE:
|
|
if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase;
|
|
break;
|
|
|
|
case FTS3_MATCHINFO_NCOL:
|
|
if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol;
|
|
break;
|
|
|
|
case FTS3_MATCHINFO_NDOC:
|
|
if( bGlobal ){
|
|
sqlite3_int64 nDoc = 0;
|
|
rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0);
|
|
pInfo->aMatchinfo[0] = (u32)nDoc;
|
|
}
|
|
break;
|
|
|
|
case FTS3_MATCHINFO_AVGLENGTH:
|
|
if( bGlobal ){
|
|
sqlite3_int64 nDoc; /* Number of rows in table */
|
|
const char *a; /* Aggregate column length array */
|
|
|
|
rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a);
|
|
if( rc==SQLITE_OK ){
|
|
int iCol;
|
|
for(iCol=0; iCol<pInfo->nCol; iCol++){
|
|
u32 iVal;
|
|
sqlite3_int64 nToken;
|
|
a += sqlite3Fts3GetVarint(a, &nToken);
|
|
iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc);
|
|
pInfo->aMatchinfo[iCol] = iVal;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
|
|
case FTS3_MATCHINFO_LENGTH: {
|
|
sqlite3_stmt *pSelectDocsize = 0;
|
|
rc = sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize);
|
|
if( rc==SQLITE_OK ){
|
|
int iCol;
|
|
const char *a = sqlite3_column_blob(pSelectDocsize, 0);
|
|
for(iCol=0; iCol<pInfo->nCol; iCol++){
|
|
sqlite3_int64 nToken;
|
|
a += sqlite3Fts3GetVarint(a, &nToken);
|
|
pInfo->aMatchinfo[iCol] = (u32)nToken;
|
|
}
|
|
}
|
|
sqlite3_reset(pSelectDocsize);
|
|
break;
|
|
}
|
|
|
|
case FTS3_MATCHINFO_LCS:
|
|
rc = fts3ExprLoadDoclists(pCsr, 0, 0);
|
|
if( rc==SQLITE_OK ){
|
|
rc = fts3MatchinfoLcs(pCsr, pInfo);
|
|
}
|
|
break;
|
|
|
|
default: {
|
|
Fts3Expr *pExpr;
|
|
assert( zArg[i]==FTS3_MATCHINFO_HITS );
|
|
pExpr = pCsr->pExpr;
|
|
rc = fts3ExprLoadDoclists(pCsr, 0, 0);
|
|
if( rc!=SQLITE_OK ) break;
|
|
if( bGlobal ){
|
|
if( pCsr->pDeferred ){
|
|
rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0);
|
|
if( rc!=SQLITE_OK ) break;
|
|
}
|
|
rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo);
|
|
if( rc!=SQLITE_OK ) break;
|
|
}
|
|
(void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo);
|
|
break;
|
|
}
|
|
}
|
|
|
|
pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]);
|
|
}
|
|
|
|
sqlite3_reset(pSelect);
|
|
return rc;
|
|
}
|
|
|
|
|
|
/*
|
|
** Populate pCsr->aMatchinfo[] with data for the current row. The
|
|
** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32).
|
|
*/
|
|
static int fts3GetMatchinfo(
|
|
Fts3Cursor *pCsr, /* FTS3 Cursor object */
|
|
const char *zArg /* Second argument to matchinfo() function */
|
|
){
|
|
MatchInfo sInfo;
|
|
Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
|
|
int rc = SQLITE_OK;
|
|
int bGlobal = 0; /* Collect 'global' stats as well as local */
|
|
|
|
memset(&sInfo, 0, sizeof(MatchInfo));
|
|
sInfo.pCursor = pCsr;
|
|
sInfo.nCol = pTab->nColumn;
|
|
|
|
/* If there is cached matchinfo() data, but the format string for the
|
|
** cache does not match the format string for this request, discard
|
|
** the cached data. */
|
|
if( pCsr->zMatchinfo && strcmp(pCsr->zMatchinfo, zArg) ){
|
|
assert( pCsr->aMatchinfo );
|
|
sqlite3_free(pCsr->aMatchinfo);
|
|
pCsr->zMatchinfo = 0;
|
|
pCsr->aMatchinfo = 0;
|
|
}
|
|
|
|
/* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
|
|
** matchinfo function has been called for this query. In this case
|
|
** allocate the array used to accumulate the matchinfo data and
|
|
** initialize those elements that are constant for every row.
|
|
*/
|
|
if( pCsr->aMatchinfo==0 ){
|
|
int nMatchinfo = 0; /* Number of u32 elements in match-info */
|
|
int nArg; /* Bytes in zArg */
|
|
int i; /* Used to iterate through zArg */
|
|
|
|
/* Determine the number of phrases in the query */
|
|
pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr);
|
|
sInfo.nPhrase = pCsr->nPhrase;
|
|
|
|
/* Determine the number of integers in the buffer returned by this call. */
|
|
for(i=0; zArg[i]; i++){
|
|
nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]);
|
|
}
|
|
|
|
/* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo. */
|
|
nArg = (int)strlen(zArg);
|
|
pCsr->aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo + nArg + 1);
|
|
if( !pCsr->aMatchinfo ) return SQLITE_NOMEM;
|
|
|
|
pCsr->zMatchinfo = (char *)&pCsr->aMatchinfo[nMatchinfo];
|
|
pCsr->nMatchinfo = nMatchinfo;
|
|
memcpy(pCsr->zMatchinfo, zArg, nArg+1);
|
|
memset(pCsr->aMatchinfo, 0, sizeof(u32)*nMatchinfo);
|
|
pCsr->isMatchinfoNeeded = 1;
|
|
bGlobal = 1;
|
|
}
|
|
|
|
sInfo.aMatchinfo = pCsr->aMatchinfo;
|
|
sInfo.nPhrase = pCsr->nPhrase;
|
|
if( pCsr->isMatchinfoNeeded ){
|
|
rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg);
|
|
pCsr->isMatchinfoNeeded = 0;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Implementation of snippet() function.
|
|
*/
|
|
void sqlite3Fts3Snippet(
|
|
sqlite3_context *pCtx, /* SQLite function call context */
|
|
Fts3Cursor *pCsr, /* Cursor object */
|
|
const char *zStart, /* Snippet start text - "<b>" */
|
|
const char *zEnd, /* Snippet end text - "</b>" */
|
|
const char *zEllipsis, /* Snippet ellipsis text - "<b>...</b>" */
|
|
int iCol, /* Extract snippet from this column */
|
|
int nToken /* Approximate number of tokens in snippet */
|
|
){
|
|
Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
|
|
int rc = SQLITE_OK;
|
|
int i;
|
|
StrBuffer res = {0, 0, 0};
|
|
|
|
/* The returned text includes up to four fragments of text extracted from
|
|
** the data in the current row. The first iteration of the for(...) loop
|
|
** below attempts to locate a single fragment of text nToken tokens in
|
|
** size that contains at least one instance of all phrases in the query
|
|
** expression that appear in the current row. If such a fragment of text
|
|
** cannot be found, the second iteration of the loop attempts to locate
|
|
** a pair of fragments, and so on.
|
|
*/
|
|
int nSnippet = 0; /* Number of fragments in this snippet */
|
|
SnippetFragment aSnippet[4]; /* Maximum of 4 fragments per snippet */
|
|
int nFToken = -1; /* Number of tokens in each fragment */
|
|
|
|
if( !pCsr->pExpr ){
|
|
sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
|
|
return;
|
|
}
|
|
|
|
for(nSnippet=1; 1; nSnippet++){
|
|
|
|
int iSnip; /* Loop counter 0..nSnippet-1 */
|
|
u64 mCovered = 0; /* Bitmask of phrases covered by snippet */
|
|
u64 mSeen = 0; /* Bitmask of phrases seen by BestSnippet() */
|
|
|
|
if( nToken>=0 ){
|
|
nFToken = (nToken+nSnippet-1) / nSnippet;
|
|
}else{
|
|
nFToken = -1 * nToken;
|
|
}
|
|
|
|
for(iSnip=0; iSnip<nSnippet; iSnip++){
|
|
int iBestScore = -1; /* Best score of columns checked so far */
|
|
int iRead; /* Used to iterate through columns */
|
|
SnippetFragment *pFragment = &aSnippet[iSnip];
|
|
|
|
memset(pFragment, 0, sizeof(*pFragment));
|
|
|
|
/* Loop through all columns of the table being considered for snippets.
|
|
** If the iCol argument to this function was negative, this means all
|
|
** columns of the FTS3 table. Otherwise, only column iCol is considered.
|
|
*/
|
|
for(iRead=0; iRead<pTab->nColumn; iRead++){
|
|
SnippetFragment sF = {0, 0, 0, 0};
|
|
int iS;
|
|
if( iCol>=0 && iRead!=iCol ) continue;
|
|
|
|
/* Find the best snippet of nFToken tokens in column iRead. */
|
|
rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
|
|
if( rc!=SQLITE_OK ){
|
|
goto snippet_out;
|
|
}
|
|
if( iS>iBestScore ){
|
|
*pFragment = sF;
|
|
iBestScore = iS;
|
|
}
|
|
}
|
|
|
|
mCovered |= pFragment->covered;
|
|
}
|
|
|
|
/* If all query phrases seen by fts3BestSnippet() are present in at least
|
|
** one of the nSnippet snippet fragments, break out of the loop.
|
|
*/
|
|
assert( (mCovered&mSeen)==mCovered );
|
|
if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
|
|
}
|
|
|
|
assert( nFToken>0 );
|
|
|
|
for(i=0; i<nSnippet && rc==SQLITE_OK; i++){
|
|
rc = fts3SnippetText(pCsr, &aSnippet[i],
|
|
i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res
|
|
);
|
|
}
|
|
|
|
snippet_out:
|
|
sqlite3Fts3SegmentsClose(pTab);
|
|
if( rc!=SQLITE_OK ){
|
|
sqlite3_result_error_code(pCtx, rc);
|
|
sqlite3_free(res.z);
|
|
}else{
|
|
sqlite3_result_text(pCtx, res.z, -1, sqlite3_free);
|
|
}
|
|
}
|
|
|
|
|
|
typedef struct TermOffset TermOffset;
typedef struct TermOffsetCtx TermOffsetCtx;

/*
** Iteration state for one query term within one column.
*/
struct TermOffset {
  char *pList;                    /* Position-list of the term's phrase */
  int iPos;                       /* Position most recently read from pList */
  int iOff;                       /* Subtracted from iPos to get term position */
};
|
|
|
|
struct TermOffsetCtx {
|
|
Fts3Cursor *pCsr;
|
|
int iCol; /* Column of table to populate aTerm for */
|
|
int iTerm;
|
|
sqlite3_int64 iDocid;
|
|
TermOffset *aTerm;
|
|
};
|
|
|
|
/*
|
|
** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets().
|
|
*/
|
|
static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){
|
|
TermOffsetCtx *p = (TermOffsetCtx *)ctx;
|
|
int nTerm; /* Number of tokens in phrase */
|
|
int iTerm; /* For looping through nTerm phrase terms */
|
|
char *pList; /* Pointer to position list for phrase */
|
|
int iPos = 0; /* First position in position-list */
|
|
int rc;
|
|
|
|
UNUSED_PARAMETER(iPhrase);
|
|
rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pList);
|
|
nTerm = pExpr->pPhrase->nToken;
|
|
if( pList ){
|
|
fts3GetDeltaPosition(&pList, &iPos);
|
|
assert( iPos>=0 );
|
|
}
|
|
|
|
for(iTerm=0; iTerm<nTerm; iTerm++){
|
|
TermOffset *pT = &p->aTerm[p->iTerm++];
|
|
pT->iOff = nTerm-iTerm-1;
|
|
pT->pList = pList;
|
|
pT->iPos = iPos;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/*
|
|
** Implementation of offsets() function.
|
|
*/
|
|
void sqlite3Fts3Offsets(
|
|
sqlite3_context *pCtx, /* SQLite function call context */
|
|
Fts3Cursor *pCsr /* Cursor object */
|
|
){
|
|
Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
|
|
sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule;
|
|
int rc; /* Return Code */
|
|
int nToken; /* Number of tokens in query */
|
|
int iCol; /* Column currently being processed */
|
|
StrBuffer res = {0, 0, 0}; /* Result string */
|
|
TermOffsetCtx sCtx; /* Context for fts3ExprTermOffsetInit() */
|
|
|
|
if( !pCsr->pExpr ){
|
|
sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
|
|
return;
|
|
}
|
|
|
|
memset(&sCtx, 0, sizeof(sCtx));
|
|
assert( pCsr->isRequireSeek==0 );
|
|
|
|
/* Count the number of terms in the query */
|
|
rc = fts3ExprLoadDoclists(pCsr, 0, &nToken);
|
|
if( rc!=SQLITE_OK ) goto offsets_out;
|
|
|
|
/* Allocate the array of TermOffset iterators. */
|
|
sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken);
|
|
if( 0==sCtx.aTerm ){
|
|
rc = SQLITE_NOMEM;
|
|
goto offsets_out;
|
|
}
|
|
sCtx.iDocid = pCsr->iPrevId;
|
|
sCtx.pCsr = pCsr;
|
|
|
|
/* Loop through the table columns, appending offset information to
|
|
** string-buffer res for each column.
|
|
*/
|
|
for(iCol=0; iCol<pTab->nColumn; iCol++){
|
|
sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */
|
|
const char *ZDUMMY; /* Dummy argument used with xNext() */
|
|
int NDUMMY = 0; /* Dummy argument used with xNext() */
|
|
int iStart = 0;
|
|
int iEnd = 0;
|
|
int iCurrent = 0;
|
|
const char *zDoc;
|
|
int nDoc;
|
|
|
|
/* Initialize the contents of sCtx.aTerm[] for column iCol. There is
|
|
** no way that this operation can fail, so the return code from
|
|
** fts3ExprIterate() can be discarded.
|
|
*/
|
|
sCtx.iCol = iCol;
|
|
sCtx.iTerm = 0;
|
|
(void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx);
|
|
|
|
/* Retreive the text stored in column iCol. If an SQL NULL is stored
|
|
** in column iCol, jump immediately to the next iteration of the loop.
|
|
** If an OOM occurs while retrieving the data (this can happen if SQLite
|
|
** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM
|
|
** to the caller.
|
|
*/
|
|
zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);
|
|
nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
|
|
if( zDoc==0 ){
|
|
if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){
|
|
continue;
|
|
}
|
|
rc = SQLITE_NOMEM;
|
|
goto offsets_out;
|
|
}
|
|
|
|
/* Initialize a tokenizer iterator to iterate through column iCol. */
|
|
rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid,
|
|
zDoc, nDoc, &pC
|
|
);
|
|
if( rc!=SQLITE_OK ) goto offsets_out;
|
|
|
|
rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
|
|
while( rc==SQLITE_OK ){
|
|
int i; /* Used to loop through terms */
|
|
int iMinPos = 0x7FFFFFFF; /* Position of next token */
|
|
TermOffset *pTerm = 0; /* TermOffset associated with next token */
|
|
|
|
for(i=0; i<nToken; i++){
|
|
TermOffset *pT = &sCtx.aTerm[i];
|
|
if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){
|
|
iMinPos = pT->iPos-pT->iOff;
|
|
pTerm = pT;
|
|
}
|
|
}
|
|
|
|
if( !pTerm ){
|
|
/* All offsets for this column have been gathered. */
|
|
rc = SQLITE_DONE;
|
|
}else{
|
|
assert( iCurrent<=iMinPos );
|
|
if( 0==(0xFE&*pTerm->pList) ){
|
|
pTerm->pList = 0;
|
|
}else{
|
|
fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos);
|
|
}
|
|
while( rc==SQLITE_OK && iCurrent<iMinPos ){
|
|
rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
|
|
}
|
|
if( rc==SQLITE_OK ){
|
|
char aBuffer[64];
|
|
sqlite3_snprintf(sizeof(aBuffer), aBuffer,
|
|
"%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart
|
|
);
|
|
rc = fts3StringAppend(&res, aBuffer, -1);
|
|
}else if( rc==SQLITE_DONE && pTab->zContentTbl==0 ){
|
|
rc = FTS_CORRUPT_VTAB;
|
|
}
|
|
}
|
|
}
|
|
if( rc==SQLITE_DONE ){
|
|
rc = SQLITE_OK;
|
|
}
|
|
|
|
pMod->xClose(pC);
|
|
if( rc!=SQLITE_OK ) goto offsets_out;
|
|
}
|
|
|
|
offsets_out:
|
|
sqlite3_free(sCtx.aTerm);
|
|
assert( rc!=SQLITE_DONE );
|
|
sqlite3Fts3SegmentsClose(pTab);
|
|
if( rc!=SQLITE_OK ){
|
|
sqlite3_result_error_code(pCtx, rc);
|
|
sqlite3_free(res.z);
|
|
}else{
|
|
sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free);
|
|
}
|
|
return;
|
|
}
|
|
|
|
/*
|
|
** Implementation of matchinfo() function.
|
|
*/
|
|
void sqlite3Fts3Matchinfo(
|
|
sqlite3_context *pContext, /* Function call context */
|
|
Fts3Cursor *pCsr, /* FTS3 table cursor */
|
|
const char *zArg /* Second arg to matchinfo() function */
|
|
){
|
|
Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
|
|
int rc;
|
|
int i;
|
|
const char *zFormat;
|
|
|
|
if( zArg ){
|
|
for(i=0; zArg[i]; i++){
|
|
char *zErr = 0;
|
|
if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){
|
|
sqlite3_result_error(pContext, zErr, -1);
|
|
sqlite3_free(zErr);
|
|
return;
|
|
}
|
|
}
|
|
zFormat = zArg;
|
|
}else{
|
|
zFormat = FTS3_MATCHINFO_DEFAULT;
|
|
}
|
|
|
|
if( !pCsr->pExpr ){
|
|
sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC);
|
|
return;
|
|
}
|
|
|
|
/* Retrieve matchinfo() data. */
|
|
rc = fts3GetMatchinfo(pCsr, zFormat);
|
|
sqlite3Fts3SegmentsClose(pTab);
|
|
|
|
if( rc!=SQLITE_OK ){
|
|
sqlite3_result_error_code(pContext, rc);
|
|
}else{
|
|
int n = pCsr->nMatchinfo * sizeof(u32);
|
|
sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT);
|
|
}
|
|
}
|
|
|
|
#endif
|