// Listing metadata: 86 lines, 3.3 KiB, C
// from https://github.com/siyuan-note/sqlite-fts5-siyuan-tokenizer/blob/main/siyuan.c

/*
 * Tokenizer instance. fts5SiYuanCreate() allocates one of these and hands it
 * to the caller cast to Fts5Tokenizer*; fts5SiYuanTokenize() casts it back
 * and dispatches through xTokenize.
 *
 * xTokenize receives the context pointer, the tokenize flags, the input text
 * with its length in bytes, and the xToken callback to invoke once per token.
 */
typedef struct SiYuanTokenizer {
    int (*xTokenize)(void *pCtx, int flags, const char *pText, int nText,
                     int (*xToken)(
                         void *pCtx,         /* Copy of 2nd argument to xTokenize() */
                         int tflags,         /* Mask of FTS5_TOKEN_* flags */
                         const char *pToken, /* Pointer to buffer containing token */
                         int nToken,         /* Size of token in bytes */
                         int iStart,         /* Byte offset of token within input text */
                         int iEnd            /* Byte offset of end of token within input text */
                     ));
} SiYuanTokenizer;
/*
 * Length in bytes of a UTF-8 sequence, indexed by its first byte.
 *
 *   0x00-0x7F -> 1   (ASCII)
 *   0x80-0xBF -> 0   (continuation byte, cannot start a sequence)
 *   0xC0-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4
 *   0xF8-0xFF -> 0   (not a valid lead byte)
 *
 * An entry of 0 marks a byte that cannot begin a character. The table
 * classifies by bit pattern only: it does not reject the overlong lead
 * bytes 0xC0/0xC1 or the out-of-range lead bytes 0xF5-0xF7.
 */
static const char CharacterBytesForUTF8[256] = {
    /* 0x00-0x7F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80-0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xDF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0-0xEF, 0xF0-0xF7, 0xF8-0xFF */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
};
static int siyuanTokenizer(void* pCtx, int flags, const char* pText, int nText, int (*xToken)(void* pCtx, int tflags, const char* pToken, int nToken, int iStart, int iEnd)) {
|
||
int rc = SQLITE_OK;
|
||
|
||
// flags 可能的值有 FTS5_TOKENIZE_QUERY、FTS5_TOKENIZE_PREFIX、FTS5_TOKENIZE_DOCUMENT、FTS5_TOKENIZE_AUX,如果想对不同的输入做不同的处理可以使用这个
|
||
|
||
int iStart = 0;
|
||
int iEnd = 0;
|
||
char* p = pCtx;
|
||
|
||
while (iEnd < nText)
|
||
{
|
||
int length = CharacterBytesForUTF8[(unsigned int)(unsigned char)pText[iStart]];
|
||
iEnd += length;
|
||
if (length == 0 || iEnd > nText)
|
||
return SQLITE_ERROR;
|
||
|
||
// 过滤停止词
|
||
// ...
|
||
|
||
char buf[8]; // UTF-8 最多 4 字节,这里留点余量
|
||
memcpy(buf, pText + iStart, length);
|
||
|
||
// 仅对 ASCII 做小写转换
|
||
if (length == 1) {
|
||
unsigned char c = (unsigned char)buf[0];
|
||
if (c >= 'A' && c <= 'Z') {
|
||
buf[0] = c - 'A' + 'a';
|
||
}
|
||
}
|
||
|
||
rc = xToken(pCtx, 0, buf, length, iStart, iEnd);
|
||
// rc = xToken(pCtx, FTS5_TOKEN_COLOCATED, ..., ..., ..., ...); // 如果需要添加同义词,则第 2 个参数为 FTS5_TOKEN_COLOCATED
|
||
iStart = iEnd;
|
||
}
|
||
|
||
return rc;
|
||
}
/*
 * Allocate a SiYuanTokenizer wired to siyuanTokenizer() and return it
 * through ppOut as an opaque Fts5Tokenizer*.
 *
 * pCtx  - unused.
 * azArg - tokenizer arguments; currently ignored.
 * nArg  - number of entries in azArg; currently ignored.
 * ppOut - receives the new tokenizer on success, 0 on failure.
 *
 * Returns SQLITE_OK on success or SQLITE_NOMEM if the allocation fails.
 * The returned object is released with sqlite3_free() (see fts5SiYuanDelete).
 */
static int fts5SiYuanCreate(void *pCtx, const char **azArg, int nArg, Fts5Tokenizer **ppOut) {
    /*
     * Argument handling would go here. Arguments come from the table
     * definition, e.g.
     *   create virtual table blocks_fts using fts5(content, tokenize = 'siyuan arg1 arg2')
     */
    (void)pCtx;
    (void)azArg;
    (void)nArg;

    SiYuanTokenizer *p = sqlite3_malloc(sizeof *p);
    if (!p) {
        *ppOut = 0;
        return SQLITE_NOMEM;
    }

    p->xTokenize = siyuanTokenizer;
    *ppOut = (Fts5Tokenizer *)p;
    return SQLITE_OK;
}
/*
 * Release a tokenizer with sqlite3_free(), the counterpart of the
 * sqlite3_malloc() call in fts5SiYuanCreate().
 * Presumably registered as the FTS5 xDelete callback; the registration is
 * not part of this listing.
 */
static void fts5SiYuanDelete(Fts5Tokenizer *p) {
    sqlite3_free(p);
}
/*
 * Tokenize pText by dispatching to the xTokenize function stored in the
 * tokenizer instance.
 *
 * tokenizer_ptr - instance created by fts5SiYuanCreate(); cast back to
 *                 SiYuanTokenizer*.
 * pCtx, flags, pText, nText, xToken - forwarded unchanged to xTokenize.
 *
 * Returns whatever xTokenize returns.
 */
static int fts5SiYuanTokenize(Fts5Tokenizer *tokenizer_ptr, void *pCtx, int flags, const char *pText, int nText, int (*xToken)(void *, int, const char *, int, int, int)) {
    SiYuanTokenizer *p = (SiYuanTokenizer *)tokenizer_ptr;
    return p->xTokenize(pCtx, flags, pText, nText, xToken);
}