src/third_party/sqlite/src/ext/fts2/fts2_icu.c

   1 /*
   2 ** 2007 June 22
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 *************************************************************************
  12 ** This file implements a tokenizer for fts2 based on the ICU library.
  13 **
  14 ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
  15 */
  16
  17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
  18 #ifdef SQLITE_ENABLE_ICU
  19
  20 #include <assert.h>
  21 #include <string.h>
  22 #include "fts2_tokenizer.h"
  23
  24 #include <unicode/ubrk.h>
  25 #include <unicode/ucol.h>
  26 #include <unicode/ustring.h>
  27 #include <unicode/utf16.h>
  28
  29 typedef struct IcuTokenizer IcuTokenizer;
  30 typedef struct IcuCursor IcuCursor;
  31
  32 struct IcuTokenizer {
  33   sqlite3_tokenizer base;
  34   char *zLocale;
  35 };
  36
  37 struct IcuCursor {
  38   sqlite3_tokenizer_cursor base;
  39
  40   UBreakIterator *pIter;      /* ICU break-iterator object */
  41   int nChar;                  /* Number of UChar elements in pInput */
  42   UChar *aChar;               /* Copy of input using utf-16 encoding */
  43   int *aOffset;               /* Offsets of each character in utf-8 input */
  44
  45   int nBuffer;
  46   char *zBuffer;
  47
  48   int iToken;
  49 };
  50
  51 /*
  52 ** Create a new tokenizer instance.
  53 */
  54 static int icuCreate(
  55   int argc,                            /* Number of entries in argv[] */
  56   const char * const *argv,            /* Tokenizer creation arguments */
  57   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
  58 ){
  59   IcuTokenizer *p;
  60   int n = 0;
  61
  62   if( argc>0 ){
  63     n = strlen(argv[0])+1;
  64   }
  65   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
  66   if( !p ){
  67     return SQLITE_NOMEM;
  68   }
  69   memset(p, 0, sizeof(IcuTokenizer));
  70
  71   if( n ){
  72     p->zLocale = (char *)&p[1];
  73     memcpy(p->zLocale, argv[0], n);
  74   }
  75
  76   *ppTokenizer = (sqlite3_tokenizer *)p;
  77
  78   return SQLITE_OK;
  79 }
  80
  81 /*
  82 ** Destroy a tokenizer
  83 */
  84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
  85   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
  86   sqlite3_free(p);
  87   return SQLITE_OK;
  88 }
  89
  90 /*
  91 ** Prepare to begin tokenizing a particular string.  The input
  92 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
  93 ** used to incrementally tokenize this string is returned in
  94 ** *ppCursor.
  95 */
  96 static int icuOpen(
  97   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  98   const char *zInput,                    /* Input string */
  99   int nInput,                            /* Length of zInput in bytes */
 100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
 101 ){
 102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
 103   IcuCursor *pCsr;
 104
 105   const int32_t opt = U_FOLD_CASE_DEFAULT;
 106   UErrorCode status = U_ZERO_ERROR;
 107   int nChar;
 108
 109   UChar32 c;
 110   int iInput = 0;
 111   int iOut = 0;
 112
 113   *ppCursor = 0;
 114
 115   if( nInput<0 ){
 116     nInput = strlen(zInput);
 117   }
 118   nChar = nInput+1;
 119   pCsr = (IcuCursor *)sqlite3_malloc(
 120       sizeof(IcuCursor) +                /* IcuCursor */
 121       (nChar+1) * sizeof(int) +          /* IcuCursor.aOffset[] */
 122       nChar * sizeof(UChar)              /* IcuCursor.aChar[] */
 123   );
 124   if( !pCsr ){
 125     return SQLITE_NOMEM;
 126   }
 127   memset(pCsr, 0, sizeof(IcuCursor));
 128   pCsr->aOffset = (int *)&pCsr[1];
 129   pCsr->aChar = (UChar *)&pCsr->aOffset[nChar+1];
 130
 131   pCsr->aOffset[iOut] = iInput;
 132   U8_NEXT(zInput, iInput, nInput, c);
 133   while( c>0 ){
 134     int isError = 0;
 135     c = u_foldCase(c, opt);
 136     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
 137     if( isError ){
 138       sqlite3_free(pCsr);
 139       return SQLITE_ERROR;
 140     }
 141     pCsr->aOffset[iOut] = iInput;
 142
 143     if( iInput<nInput ){
 144       U8_NEXT(zInput, iInput, nInput, c);
 145     }else{
 146       c = 0;
 147     }
 148   }
 149
 150   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
 151   if( !U_SUCCESS(status) ){
 152     sqlite3_free(pCsr);
 153     return SQLITE_ERROR;
 154   }
 155   pCsr->nChar = iOut;
 156
 157   ubrk_first(pCsr->pIter);
 158   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
 159   return SQLITE_OK;
 160 }
 161
 162 /*
 163 ** Close a tokenization cursor previously opened by a call to icuOpen().
 164 */
 165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
 166   IcuCursor *pCsr = (IcuCursor *)pCursor;
 167   ubrk_close(pCsr->pIter);
 168   sqlite3_free(pCsr->zBuffer);
 169   sqlite3_free(pCsr);
 170   return SQLITE_OK;
 171 }
 172
 173 /*
 174 ** Extract the next token from a tokenization cursor.
 175 */
 176 static int icuNext(
 177   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
 178   const char **ppToken,               /* OUT: *ppToken is the token text */
 179   int *pnBytes,                       /* OUT: Number of bytes in token */
 180   int *piStartOffset,                 /* OUT: Starting offset of token */
 181   int *piEndOffset,                   /* OUT: Ending offset of token */
 182   int *piPosition                     /* OUT: Position integer of token */
 183 ){
 184   IcuCursor *pCsr = (IcuCursor *)pCursor;
 185
 186   int iStart = 0;
 187   int iEnd = 0;
 188   int nByte = 0;
 189
 190   while( iStart==iEnd ){
 191     UChar32 c;
 192
 193     iStart = ubrk_current(pCsr->pIter);
 194     iEnd = ubrk_next(pCsr->pIter);
 195     if( iEnd==UBRK_DONE ){
 196       return SQLITE_DONE;
 197     }
 198
 199     while( iStart<iEnd ){
 200       int iWhite = iStart;
 201       U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
 202       if( u_isspace(c) ){
 203         iStart = iWhite;
 204       }else{
 205         break;
 206       }
 207     }
 208     assert(iStart<=iEnd);
 209   }
 210
 211   do {
 212     UErrorCode status = U_ZERO_ERROR;
 213     if( nByte ){
 214       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
 215       if( !zNew ){
 216         return SQLITE_NOMEM;
 217       }
 218       pCsr->zBuffer = zNew;
 219       pCsr->nBuffer = nByte;
 220     }
 221
 222     u_strToUTF8(
 223         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
 224         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
 225         &status                                  /* Output success/failure */
 226     );
 227   } while( nByte>pCsr->nBuffer );
 228
 229   *ppToken = pCsr->zBuffer;
 230   *pnBytes = nByte;
 231   *piStartOffset = pCsr->aOffset[iStart];
 232   *piEndOffset = pCsr->aOffset[iEnd];
 233   *piPosition = pCsr->iToken++;
 234
 235   return SQLITE_OK;
 236 }
 237
 238 /*
 239 ** The set of routines that implement the simple tokenizer
 240 */
 241 static const sqlite3_tokenizer_module icuTokenizerModule = {
 242   0,                           /* iVersion */
 243   icuCreate,                   /* xCreate  */
 244   icuDestroy,                  /* xCreate  */
 245   icuOpen,                     /* xOpen    */
 246   icuClose,                    /* xClose   */
 247   icuNext,                     /* xNext    */
 248 };
 249
 250 /*
 251 ** Set *ppModule to point at the implementation of the ICU tokenizer.
 252 */
 253 void sqlite3Fts2IcuTokenizerModule(
 254   sqlite3_tokenizer_module const**ppModule
 255 ){
 256   *ppModule = &icuTokenizerModule;
 257 }
 258
 259 #endif /* defined(SQLITE_ENABLE_ICU) */
 260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */