// from https://github.com/siyuan-note/sqlite-fts5-siyuan-tokenizer/blob/main/siyuan.c typedef struct SiYuanTokenizer { int (*xTokenize)(void* pCtx, int flags, const char* pText, int nText, int (*xToken)( void *pCtx, /* Copy of 2nd argument to xTokenize() */ int tflags, /* Mask of FTS5_TOKEN_* flags */ const char *pToken, /* Pointer to buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Byte offset of token within input text */ int iEnd /* Byte offset of end of token within input text */ )); }SiYuanTokenizer; static const char CharacterBytesForUTF8[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0 }; static int siyuanTokenizer(void* pCtx, int flags, const char* pText, int nText, int (*xToken)(void* pCtx, int tflags, const char* pToken, int nToken, int iStart, int iEnd)) { int rc = SQLITE_OK; // flags 可能的值有 FTS5_TOKENIZE_QUERY、FTS5_TOKENIZE_PREFIX、FTS5_TOKENIZE_DOCUMENT、FTS5_TOKENIZE_AUX,如果想对不同的输入做不同的处理可以使用这个 int iStart = 0; int iEnd = 0; char* p = pCtx; while (iEnd < nText) { int length = CharacterBytesForUTF8[(unsigned int)(unsigned char)pText[iStart]]; iEnd += length; if (length == 0 || iEnd > nText) return SQLITE_ERROR; // 过滤停止词 // ... char buf[8]; // UTF-8 最多 4 字节,这里留点余量 memcpy(buf, pText + iStart, length); // 仅对 ASCII 做小写转换 if (length == 1) { unsigned char c = (unsigned char)buf[0]; if (c >= 'A' && c <= 'Z') { buf[0] = c - 'A' + 'a'; } } rc = xToken(pCtx, 0, buf, length, iStart, iEnd); // rc = xToken(pCtx, FTS5_TOKEN_COLOCATED, ..., ..., ..., ...); // 如果需要添加同义词,则第 2 个参数为 FTS5_TOKEN_COLOCATED iStart = iEnd; } return rc; } static int fts5SiYuanCreate(void* pCtx, const char** azArg, int nArg, Fts5Tokenizer** ppOut) { for (int i = 0; i != nArg; ++i) { // 对传入参数做处理 // 如:create virtual table blocks_fts using fts5(content, tokenize = 'siyuan arg1 arg2') } SiYuanTokenizer* p = (SiYuanTokenizer*)sqlite3_malloc(sizeof(SiYuanTokenizer)); if (p) { p->xTokenize = siyuanTokenizer; *ppOut = (Fts5Tokenizer*)(p); return SQLITE_OK; } return SQLITE_ERROR; } static void fts5SiYuanDelete(Fts5Tokenizer* p) { sqlite3_free(p); } static int fts5SiYuanTokenize(Fts5Tokenizer* tokenizer_ptr, void* pCtx, int flags, const char* pText, int nText, int (*xToken)(void*, int, const char*, int, int, int)) { SiYuanTokenizer* p = (SiYuanTokenizer*)tokenizer_ptr; return p->xTokenize(pCtx, flags, pText, nText, xToken); }