1
Files
go-sqlite3-fts5/siyuan-tokenizer.h

86 lines
3.3 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// from https://github.com/siyuan-note/sqlite-fts5-siyuan-tokenizer/blob/main/siyuan.c
/*
 * Tokenizer instance handed to FTS5 as an opaque Fts5Tokenizer pointer.
 * It holds a single function pointer, xTokenize, which performs the actual
 * tokenization and reports each token through the xToken callback.
 */
typedef struct SiYuanTokenizer {
    int (*xTokenize)(
        void* pCtx,            /* Opaque context forwarded unchanged to xToken() */
        int flags,             /* Mask of FTS5_TOKENIZE_* flags */
        const char* pText,     /* Input text to tokenize */
        int nText,             /* Size of the input text in bytes */
        int (*xToken)(
            void *pCtx,         /* Context pointer passed through from xTokenize() */
            int tflags,         /* Mask of FTS5_TOKEN_* flags */
            const char *pToken, /* Pointer to buffer containing token */
            int nToken,         /* Size of token in bytes */
            int iStart,         /* Byte offset of token within input text */
            int iEnd            /* Byte offset of end of token within input text */
        )
    );
} SiYuanTokenizer;
/*
 * Maps a UTF-8 lead byte to the number of bytes in the sequence it starts.
 * A value of 0 marks a byte that cannot start a sequence in this table:
 * 0x80-0xBF and 0xF8-0xFF.
 */
static const char CharacterBytesForUTF8[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80-0xBF: not a lead byte */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xDF: two-byte sequences */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0-0xEF: three-byte sequences */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xF0-0xF7: four-byte sequences; 0xF8-0xFF: rejected */
    4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
};
/*
 * Splits pText into one token per UTF-8 character and reports each token
 * through xToken. Single-byte characters 'A'-'Z' are lower-cased; every
 * other character is passed through unchanged.
 *
 * pCtx   - opaque context, forwarded unchanged to xToken.
 * flags  - may be FTS5_TOKENIZE_QUERY, FTS5_TOKENIZE_PREFIX,
 *          FTS5_TOKENIZE_DOCUMENT or FTS5_TOKENIZE_AUX; currently unused,
 *          available if different inputs ever need different handling.
 * pText  - input text, nText bytes long.
 * xToken - callback invoked once per token with the token bytes and its
 *          [iStart, iEnd) byte range within pText.
 *
 * Returns SQLITE_OK when all of the input was tokenized, SQLITE_ERROR when
 * a byte cannot start a sequence according to CharacterBytesForUTF8 or a
 * sequence runs past nText, or the first non-SQLITE_OK value returned by
 * xToken.
 */
static int siyuanTokenizer(void* pCtx, int flags, const char* pText, int nText, int (*xToken)(void* pCtx, int tflags, const char* pToken, int nToken, int iStart, int iEnd)) {
    int rc = SQLITE_OK;
    int iStart = 0;

    /* Stop as soon as xToken reports an error so that its return code is
     * propagated instead of being overwritten by later tokens. */
    while (rc == SQLITE_OK && iStart < nText) {
        int length = CharacterBytesForUTF8[(unsigned char)pText[iStart]];
        int iEnd = iStart + length;
        if (length == 0 || iEnd > nText) {
            return SQLITE_ERROR;
        }

        /* Stop-word filtering could be added here. */

        char buf[8]; /* UTF-8 sequences here are at most 4 bytes; keep some headroom */
        memcpy(buf, pText + iStart, length);

        /* Lower-case ASCII letters only. */
        if (length == 1) {
            unsigned char c = (unsigned char)buf[0];
            if (c >= 'A' && c <= 'Z') {
                buf[0] = (char)(c - 'A' + 'a');
            }
        }

        /* To emit synonyms, call xToken again with FTS5_TOKEN_COLOCATED as
         * the second argument. */
        rc = xToken(pCtx, 0, buf, length, iStart, iEnd);
        iStart = iEnd;
    }
    return rc;
}
/*
 * Allocates a SiYuanTokenizer with sqlite3_malloc, points its xTokenize
 * member at siyuanTokenizer and stores it in *ppOut.
 *
 * pCtx   - unused.
 * azArg  - tokenizer arguments, e.g. arg1 and arg2 in
 *          create virtual table blocks_fts using fts5(content, tokenize = 'siyuan arg1 arg2');
 *          currently accepted and ignored.
 * nArg   - number of entries in azArg.
 * ppOut  - receives the new tokenizer on success; left untouched on failure.
 *
 * Returns SQLITE_OK on success, or SQLITE_NOMEM if the allocation fails.
 */
static int fts5SiYuanCreate(void* pCtx, const char** azArg, int nArg, Fts5Tokenizer** ppOut) {
    (void)pCtx;
    (void)azArg;
    (void)nArg;

    SiYuanTokenizer* p = (SiYuanTokenizer*)sqlite3_malloc(sizeof(*p));
    if (!p) {
        /* Report allocation failure with the dedicated out-of-memory code. */
        return SQLITE_NOMEM;
    }
    p->xTokenize = siyuanTokenizer;
    *ppOut = (Fts5Tokenizer*)p;
    return SQLITE_OK;
}
/*
 * Releases a tokenizer by handing its pointer to sqlite3_free, the
 * counterpart of the sqlite3_malloc call in fts5SiYuanCreate.
 */
static void fts5SiYuanDelete(Fts5Tokenizer* p) {
    void* allocation = p;
    sqlite3_free(allocation);
}
/*
 * Treats tokenizer_ptr as a SiYuanTokenizer and forwards all remaining
 * arguments to its xTokenize member, returning whatever that call returns.
 */
static int fts5SiYuanTokenize(Fts5Tokenizer* tokenizer_ptr, void* pCtx, int flags, const char* pText, int nText, int (*xToken)(void*, int, const char*, int, int, int)) {
    return ((SiYuanTokenizer*)tokenizer_ptr)->xTokenize(pCtx, flags, pText, nText, xToken);
}