Fix FSF address (Tobias Mueller, #470445)
[platform/upstream/evolution-data-server.git] / camel / camel-text-index.c
1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8; fill-column: 160 -*- */
2 /*
3  *  Copyright (C) 2001-2003 Ximian Inc.
4  *
5  *  Authors: Michael Zucchi <notzed@ximian.com>
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of version 2 of the GNU Lesser General Public
9  * License as published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this program; if not, write to the
18  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19  * Boston, MA 02110-1301, USA.
20  */
21
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <ctype.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33 #include <sys/stat.h>
34 #include <sys/types.h>
35
36 #include <glib.h>
37 #include <glib/gstdio.h>
38
39 #include <libedataserver/e-memory.h>
40 #include <libedataserver/e-msgport.h>
41
42 #include "camel-block-file.h"
43 #include "camel-object.h"
44 #include "camel-partition-table.h"
45 #include "camel-private.h"
46 #include "camel-text-index.h"
47
48
49 #define w(x)
50 #define io(x) 
51 #define d(x) /*(printf("%s(%d):%s: ",  __FILE__, __LINE__, __PRETTY_FUNCTION__),(x))*/
52
53 /* cursor debug */
54 #define c(x)
55
56 #define CAMEL_TEXT_INDEX_MAX_WORDLEN  (36)
57
58 #define CAMEL_TEXT_INDEX_LOCK(kf, lock) \
59         (g_static_rec_mutex_lock(&((CamelTextIndex *)kf)->priv->lock))
60 #define CAMEL_TEXT_INDEX_UNLOCK(kf, lock) \
61         (g_static_rec_mutex_unlock(&((CamelTextIndex *)kf)->priv->lock))
62
63 static int text_index_compress_nosync(CamelIndex *idx);
64
65 /* ********************************************************************** */
66
67 /* "private" data, shared between index/cursor/name classes */
68 typedef struct _CamelTextIndexNamePrivate CamelTextIndexNamePrivate;
69
70 struct _CamelTextIndexNamePrivate {
71         GString *buffer;
72         camel_key_t nameid;
73         EMemPool *pool;
74 };
75
76 CamelTextIndexName *camel_text_index_name_new(CamelTextIndex *idx, const char *name, camel_key_t nameid);
77
78 /* ****************************** */
79
80 typedef struct _CamelTextIndexCursorPrivate CamelTextIndexCursorPrivate;
81
82 struct _CamelTextIndexCursorPrivate {
83         camel_block_t first;
84         camel_block_t next;
85
86         int record_index;
87
88         size_t record_count;
89         camel_key_t *records;
90
91         char *current;
92 };
93
94 CamelTextIndexCursor *camel_text_index_cursor_new(CamelTextIndex *idx, camel_block_t data);
95
96 /* ****************************** */
97 typedef struct _CamelTextIndexKeyCursorPrivate CamelTextIndexKeyCursorPrivate;
98
99 struct _CamelTextIndexKeyCursorPrivate {
100         CamelKeyTable *table;
101
102         camel_key_t keyid;
103         unsigned int flags;
104         camel_block_t data;
105         char *current;
106 };
107
108 CamelTextIndexKeyCursor *camel_text_index_key_cursor_new(CamelTextIndex *idx, CamelKeyTable *table);
109
110 /* ********************************************************************** */
111
112 #define CAMEL_TEXT_INDEX_VERSION "TEXT.000"
113 #define CAMEL_TEXT_INDEX_KEY_VERSION "KEYS.000"
114
115 struct _CamelTextIndexPrivate {
116         CamelBlockFile *blocks;
117         CamelKeyFile *links;
118
119         CamelKeyTable *word_index;
120         CamelPartitionTable *word_hash;
121
122         CamelKeyTable *name_index;
123         CamelPartitionTable *name_hash;
124
125         /* Cache of words to write */
126         int word_cache_limit;
127         int word_cache_count;
128         EDList word_cache;
129         GHashTable *words;
130         GStaticRecMutex lock;
131 };
132
133 /* Root block of text index */
134 struct _CamelTextIndexRoot {
135         struct _CamelBlockRoot root;
136
137         /* FIXME: the index root could contain a pointer to the hash root */
138         camel_block_t word_index_root; /* a keyindex containing the keyid -> word mapping */
139         camel_block_t word_hash_root; /* a partitionindex containing word -> keyid mapping */
140
141         camel_block_t name_index_root; /* same, for names */
142         camel_block_t name_hash_root;
143
144         guint32 words;          /* total words */
145         guint32 names;          /* total names */
146         guint32 deleted;        /* deleted names */
147         guint32 keys;           /* total key 'chunks' written, used with deleted to determine fragmentation */
148 };
149
150 struct _CamelTextIndexWord {
151         struct _CamelTextIndexWord *next;
152         struct _CamelTextIndexWord *prev;
153
154         camel_block_t data;     /* where the data starts */
155         camel_key_t wordid;
156         char *word;
157         unsigned int used;
158         camel_key_t names[32];
159 };
160
161 #define CTI_PRIVATE(o) (((CamelTextIndex *)(o))->priv)
162
163 #define CI_CLASS(o) ((CamelTextIndexClass *)(((CamelObject *)o)->classfuncs))
164
165 /* ********************************************************************** */
166 /* CamelTextIndex */
167 /* ********************************************************************** */
168
169 static CamelObjectClass *camel_text_index_parent;
170
171 /* call locked */
172 static void
173 text_index_add_name_to_word(CamelIndex *idx, const char *word, camel_key_t nameid)
174 {
175         struct _CamelTextIndexWord *w, *wp, *ww;
176         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
177         camel_key_t wordid;
178         camel_block_t data;
179         struct _CamelTextIndexRoot *rb = (struct _CamelTextIndexRoot *)p->blocks->root;
180
181         w = g_hash_table_lookup(p->words, word);
182         if (w == NULL) {
183                 wordid = camel_partition_table_lookup(p->word_hash, word);
184                 if (wordid == 0) {
185                         data = 0;
186                         wordid = camel_key_table_add(p->word_index, word, 0, 0);
187                         if (wordid == 0){
188                                 g_warning ("Could not create key entry for word '%s': %s\n",
189                                            word, strerror (errno));
190                                 return;
191                         }
192                         if (camel_partition_table_add(p->word_hash, word, wordid) == -1) {
193                                 g_warning ("Could not create hash entry for word '%s': %s\n",
194                                            word, strerror (errno));
195                                 return;
196                         }
197                         rb->words++;
198                         camel_block_file_touch_block(p->blocks, p->blocks->root_block);
199                 } else {
200                         data = camel_key_table_lookup(p->word_index, wordid, NULL, 0);
201                         if (data == 0) {
202                                 g_warning ("Could not find key entry for word '%s': %s\n",
203                                            word, strerror (errno));
204                                 return;
205                         }
206                 }
207                 
208                 w = g_malloc0(sizeof(*w));
209                 w->word = g_strdup(word);
210                 w->wordid = wordid;
211                 w->used = 1;
212                 w->data = data;
213
214                 w->names[0] = nameid;
215                 g_hash_table_insert(p->words, w->word, w);
216                 e_dlist_addhead(&p->word_cache, (EDListNode *)w);
217                 p->word_cache_count++;
218                 ww = (struct _CamelTextIndexWord *)p->word_cache.tailpred;
219                 wp = ww->prev;
220                 while (wp && p->word_cache_count > p->word_cache_limit) {
221                         io(printf("writing key file entry '%s' [%x]\n", ww->word, ww->data));
222                         if (camel_key_file_write(p->links, &ww->data, ww->used, ww->names) != -1) {
223                                 io(printf("  new data [%x]\n", ww->data));
224                                 rb->keys++;
225                                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
226                                 /* if this call fails - we still point to the old data - not fatal */
227                                 camel_key_table_set_data(p->word_index, ww->wordid, ww->data);
228                                 e_dlist_remove((EDListNode *)ww);
229                                 g_hash_table_remove(p->words, ww->word);
230                                 g_free(ww->word);
231                                 g_free(ww);
232                                 p->word_cache_count--;
233                         }
234                         ww = wp;
235                         wp = wp->prev;
236                 }
237         } else {
238                 e_dlist_remove((EDListNode *)w);
239                 e_dlist_addhead(&p->word_cache, (EDListNode *)w);
240                 w->names[w->used] = nameid;
241                 w->used++;
242                 if (w->used == sizeof(w->names)/sizeof(w->names[0])) {
243                         io(printf("writing key file entry '%s' [%x]\n", w->word, w->data));
244                         if (camel_key_file_write(p->links, &w->data, w->used, w->names) != -1) {
245                                 rb->keys++;
246                                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
247                                 /* if this call fails - we still point to the old data - not fatal */
248                                 camel_key_table_set_data(p->word_index, w->wordid, w->data);
249                         }
250                         /* FIXME: what to on error?  lost data? */
251                         w->used = 0;
252                 }
253         }
254 }
255
256 static int
257 text_index_sync(CamelIndex *idx)
258 {
259         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
260         struct _CamelTextIndexWord *ww;
261         struct _CamelTextIndexRoot *rb;
262         int ret = 0, wfrag, nfrag, work = FALSE;
263
264         d(printf("sync: blocks = %p\n", p->blocks));
265
266         if (p->blocks == NULL)
267                 return 0;
268
269         rb = (struct _CamelTextIndexRoot *)p->blocks->root;
270
271         /* sync/flush word cache */
272
273         CAMEL_TEXT_INDEX_LOCK(idx, lock);
274
275         /* we sync, bump down the cache limits since we dont need them for reading */
276         p->blocks->block_cache_limit = 128;
277         /* this doesn't really need to be dropped, its only used in updates anyway */
278         p->word_cache_limit = 1024;
279
280         work = !e_dlist_empty(&p->word_cache);
281
282         while ( (ww = (struct _CamelTextIndexWord *)e_dlist_remhead(&p->word_cache)) ) {
283                 if (ww->used > 0) {
284                         io(printf("writing key file entry '%s' [%x]\n", ww->word, ww->data));
285                         if (camel_key_file_write(p->links, &ww->data, ww->used, ww->names) != -1) {
286                                 io(printf("  new data [%x]\n", ww->data));
287                                 rb->keys++;
288                                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
289                                 camel_key_table_set_data(p->word_index, ww->wordid, ww->data);
290                         } else {
291                                 ret = -1;
292                         }
293                         ww->used = 0;
294                 }
295                 g_hash_table_remove(p->words, ww->word);
296                 g_free(ww->word);
297                 g_free(ww);
298         }       
299
300         if (camel_key_table_sync(p->word_index) == -1
301             || camel_key_table_sync(p->name_index) == -1
302             || camel_partition_table_sync(p->word_hash) == -1
303             || camel_partition_table_sync(p->name_hash) == -1)
304                 ret = -1;
305
306         /* only do the frag/compress check if we did some new writes on this index */
307         wfrag = rb->words ? (((rb->keys - rb->words) * 100)/ rb->words) : 0;
308         nfrag = rb->names ? ((rb->deleted * 100) / rb->names) : 0;
309         d(printf("wfrag = %d, nfrag = %d, work = %s, ret = %d\n", wfrag, nfrag, work?"true":"false", ret));
310         d(printf("  words = %d, keys = %d\n", rb->words, rb->keys));
311
312         if (ret == 0) {
313                 if (wfrag > 30 || nfrag > 20)
314                         ret = text_index_compress_nosync(idx);
315         }
316
317         ret = camel_block_file_sync(p->blocks);
318
319         CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
320
321         return ret;
322 }
323
324 static void tmp_name(const char *in, char *o)
325 {
326         char *s;
327
328         s = strrchr(in, '/');
329         if (s) {
330                 memcpy(o, in, s-in+1);
331                 memcpy(o+(s-in+1), ".#", 2);
332                 strcpy(o+(s-in+3), s+1);
333         } else {
334                 sprintf(o, ".#%s", in);
335         }
336 }
337
338 static int
339 text_index_compress(CamelIndex *idx)
340 {
341         int ret;
342
343         CAMEL_TEXT_INDEX_LOCK(idx, lock);
344
345         ret = camel_index_sync(idx);
346         if (ret != -1)
347                 ret = text_index_compress_nosync(idx);
348
349         CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
350
351         return ret;
352 }
353
354 /* Attempt to recover index space by compressing the indices */
355 static int
356 text_index_compress_nosync(CamelIndex *idx)
357 {
358         CamelTextIndex *newidx;
359         struct _CamelTextIndexPrivate *newp, *oldp;
360         camel_key_t oldkeyid, newkeyid;
361         GHashTable *remap;
362         unsigned int deleted;
363         camel_block_t data, newdata;
364         int i, ret = -1;
365         char *name = NULL;
366         unsigned int flags;
367         char *newpath, *savepath, *oldpath;
368         size_t count, newcount;
369         camel_key_t *records, newrecords[256];
370         struct _CamelTextIndexRoot *rb;
371
372         i = strlen(idx->path)+16;
373         oldpath = alloca(i);
374         newpath = alloca(i);
375         savepath = alloca(i);
376
377         strcpy(oldpath, idx->path);
378         oldpath[strlen(oldpath)-strlen(".index")] = 0;
379
380         tmp_name(oldpath, newpath);
381         sprintf(savepath, "%s~", oldpath);
382
383         d(printf("Old index: %s\n", idx->path));
384         d(printf("Old path: %s\n", oldpath));
385         d(printf("New: %s\n", newpath));
386         d(printf("Save: %s\n", savepath));
387
388         newidx = camel_text_index_new(newpath, O_RDWR|O_CREAT);
389         if (newidx == NULL)
390                 return -1;
391
392         newp = CTI_PRIVATE(newidx);
393         oldp = CTI_PRIVATE(idx);
394
395         CAMEL_TEXT_INDEX_LOCK(idx, lock);
396
397         rb = (struct _CamelTextIndexRoot *)newp->blocks->root;
398
399         rb->words = 0;
400         rb->names = 0;
401         rb->deleted = 0;
402         rb->keys = 0;
403
404         /* Process:
405            For each name we still have:
406            Add it to the new index & setup remap table
407
408            For each word:
409            Copy word's data to a new file
410            Add new word to index(*) (can we just copy blocks?) */
411
412         /* Copy undeleted names to new index file, creating new indices */
413         io(printf("Copying undeleted names to new file\n"));
414         remap = g_hash_table_new(NULL, NULL);
415         oldkeyid = 0;
416         deleted = 0;
417         while ( (oldkeyid = camel_key_table_next(oldp->name_index, oldkeyid, &name, &flags, &data)) ) {
418                 if ((flags&1) == 0) {
419                         io(printf("copying name '%s'\n", name));
420                         newkeyid = camel_key_table_add(newp->name_index, name, data, flags);
421                         if (newkeyid == 0)
422                                 goto fail;
423                         rb->names++;
424                         camel_partition_table_add(newp->name_hash, name, newkeyid);
425                         g_hash_table_insert(remap, GINT_TO_POINTER(oldkeyid), GINT_TO_POINTER(newkeyid));
426                 } else
427                         io(printf("deleted name '%s'\n", name));
428                 g_free(name);
429                 name = NULL;
430                 deleted |= flags;
431         }
432
433         /* Copy word data across, remapping/deleting and create new index for it */
434         /* We re-block the data into 256 entry lots while we're at it, since we only
435            have to do 1 at a time and its cheap */
436         oldkeyid = 0;
437         while ( (oldkeyid = camel_key_table_next(oldp->word_index, oldkeyid, &name, &flags, &data)) ) {
438                 io(printf("copying word '%s'\n", name));
439                 newdata = 0;
440                 newcount = 0;
441                 if (data) {
442                         rb->words++;
443                         rb->keys++;
444                 }
445                 while (data) {
446                         if (camel_key_file_read(oldp->links, &data, &count, &records) == -1) {
447                                 io(printf("could not read from old keys at %d for word '%s'\n", (int)data, name));
448                                 goto fail;
449                         }
450                         for (i=0;i<count;i++) {
451                                 newkeyid = (camel_key_t)GPOINTER_TO_INT(g_hash_table_lookup(remap, GINT_TO_POINTER(records[i])));
452                                 if (newkeyid) {
453                                         newrecords[newcount++] = newkeyid;
454                                         if (newcount == sizeof(newrecords)/sizeof(newrecords[0])) {
455                                                 if (camel_key_file_write(newp->links, &newdata, newcount, newrecords) == -1) {
456                                                         g_free(records);
457                                                         goto fail;
458                                                 }
459                                                 newcount = 0;
460                                         }
461                                 }
462                         }
463                         g_free(records);
464                 }
465                 
466                 if (newcount > 0) {
467                         if (camel_key_file_write(newp->links, &newdata, newcount, newrecords) == -1)
468                                 goto fail;
469                 }
470
471                 if (newdata != 0) {
472                         newkeyid = camel_key_table_add(newp->word_index, name, newdata, flags);
473                         if (newkeyid == 0)
474                                 goto fail;
475                         camel_partition_table_add(newp->word_hash, name, newkeyid);
476                 }
477                 g_free(name);
478                 name = NULL;
479         }
480
481         camel_block_file_touch_block(newp->blocks, newp->blocks->root_block);
482         
483         if (camel_index_sync((CamelIndex *)newidx) == -1)
484                 goto fail;
485
486         /* Rename underlying files to match */
487         ret = camel_index_rename(idx, savepath);
488         if (ret == -1)
489                 goto fail;
490
491         /* If this fails, we'll pick up something during restart? */
492         ret = camel_index_rename((CamelIndex *)newidx, oldpath);
493
494 #define myswap(a, b) { void *tmp = a; a = b; b = tmp; }
495         /* Poke the private data across to the new object */
496         /* And change the fd's over, etc? */
497         /* Yes: This is a hack */
498         myswap(newp->blocks, oldp->blocks);
499         myswap(newp->links, oldp->links);
500         myswap(newp->word_index, oldp->word_index);
501         myswap(newp->word_hash, oldp->word_hash);
502         myswap(newp->name_index, oldp->name_index);
503         myswap(newp->name_hash, oldp->name_hash);
504         myswap(((CamelIndex *)newidx)->path, ((CamelIndex *)idx)->path);
505 #undef myswap
506
507         ret = 0;
508 fail:
509         CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
510
511         camel_index_delete((CamelIndex *)newidx);
512
513         camel_object_unref((CamelObject *)newidx);
514         g_free(name);
515         g_hash_table_destroy(remap);
516
517         /* clean up temp files always */
518         sprintf(savepath, "%s~.index", oldpath);
519         g_unlink(savepath);
520         sprintf(newpath, "%s.data", savepath);
521         g_unlink(newpath);
522
523         return ret;
524 }
525
526 static int
527 text_index_delete(CamelIndex *idx)
528 {
529         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
530         int ret = 0;
531
532         if (camel_block_file_delete(p->blocks) == -1)
533                 ret = -1;
534         if (camel_key_file_delete(p->links) == -1)
535                 ret = -1;
536
537         return ret;
538 }
539
540 static int
541 text_index_rename(CamelIndex *idx, const char *path)
542 {
543         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
544         char *newlink, *newblock;
545         int err, ret;
546
547         CAMEL_TEXT_INDEX_LOCK(idx, lock);
548
549         newblock = alloca(strlen(path)+8);
550         sprintf(newblock, "%s.index", path);
551         ret = camel_block_file_rename(p->blocks, newblock);
552         if (ret == -1) {
553                 CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
554                 return -1;
555         }
556
557         newlink = alloca(strlen(path)+16);
558         sprintf(newlink, "%s.index.data", path);
559         ret = camel_key_file_rename(p->links, newlink);
560         if (ret == -1) {
561                 err = errno;
562                 camel_block_file_rename(p->blocks, idx->path);
563                 CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
564                 errno = err;
565                 return -1;
566         }
567
568         g_free(idx->path);
569         idx->path = g_strdup(newblock);
570
571         CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
572
573         return 0;
574 }
575
576 static int
577 text_index_has_name(CamelIndex *idx, const char *name)
578 {
579         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
580
581         return camel_partition_table_lookup(p->name_hash, name) != 0;
582 }
583
584 static CamelIndexName *
585 text_index_add_name(CamelIndex *idx, const char *name)
586 {
587         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
588         camel_key_t keyid;
589         CamelIndexName *idn;
590         struct _CamelTextIndexRoot *rb = (struct _CamelTextIndexRoot *)p->blocks->root;
591
592         CAMEL_TEXT_INDEX_LOCK(idx, lock);
593
594         /* if we're adding words, up the cache limits a lot */
595         if (p->blocks) {
596                 p->blocks->block_cache_limit = 1024;
597                 p->word_cache_limit = 8192;
598         }
599
600         /* If we have it already replace it */
601         keyid = camel_partition_table_lookup(p->name_hash, name);
602         if (keyid != 0) {
603                 /* TODO: We could just update the partition table's
604                    key pointer rather than having to delete it */
605                 rb->deleted++;
606                 camel_key_table_set_flags(p->name_index, keyid, 1, 1);
607                 camel_partition_table_remove(p->name_hash, name);
608         }
609         
610         keyid = camel_key_table_add(p->name_index, name, 0, 0);
611         if (keyid != 0) {
612                 camel_partition_table_add(p->name_hash, name, keyid);
613                 rb->names++;
614         }
615
616         camel_block_file_touch_block(p->blocks, p->blocks->root_block);
617
618         /* TODO: if keyid == 0, we had a failure, we should somehow flag that, but for
619            now just return a valid object but discard its results, see text_index_write_name */
620
621         CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
622
623         idn = (CamelIndexName *)camel_text_index_name_new((CamelTextIndex *)idx, name, keyid);
624
625         return idn;
626 }
627
628 /* call locked */
629 static void
630 hash_write_word(char *word, void *data, CamelIndexName *idn)
631 {
632         CamelTextIndexName *tin = (CamelTextIndexName *)idn;
633
634         text_index_add_name_to_word(idn->index, word, tin->priv->nameid);
635 }
636
637 static int
638 text_index_write_name(CamelIndex *idx, CamelIndexName *idn)
639 {
640         /* force 'flush' of any outstanding data */
641         camel_index_name_add_buffer(idn, NULL, 0);
642
643         /* see text_index_add_name for when this can be 0 */
644         if (((CamelTextIndexName *)idn)->priv->nameid != 0) {
645                 CAMEL_TEXT_INDEX_LOCK(idx, lock);
646                 
647                 g_hash_table_foreach(idn->words, (GHFunc)hash_write_word, idn);
648                 
649                 CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
650         }
651
652         return 0;
653 }
654
655 static CamelIndexCursor *
656 text_index_find_name(CamelIndex *idx, const char *name)
657 {
658         /* what was this for, umm */
659         return NULL;
660 }
661
662 static void
663 text_index_delete_name(CamelIndex *idx, const char *name)
664 {
665         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
666         camel_key_t keyid;
667         struct _CamelTextIndexRoot *rb = (struct _CamelTextIndexRoot *)p->blocks->root;
668
669         d(printf("Delete name: %s\n", name));
670
671         /* probably doesn't really need locking, but oh well */
672         CAMEL_TEXT_INDEX_LOCK(idx, lock);
673
674         /* We just mark the key deleted, and remove it from the hash table */
675         keyid = camel_partition_table_lookup(p->name_hash, name);
676         if (keyid != 0) {
677                 rb->deleted++;
678                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
679                 camel_key_table_set_flags(p->name_index, keyid, 1, 1);
680                 camel_partition_table_remove(p->name_hash, name);
681         }
682
683         CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
684 }
685
686 static CamelIndexCursor *
687 text_index_find(CamelIndex *idx, const char *word)
688 {
689         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
690         camel_key_t keyid;
691         camel_block_t data = 0;
692         unsigned int flags;
693         CamelIndexCursor *idc;
694
695         CAMEL_TEXT_INDEX_LOCK(idx, lock);
696
697         keyid = camel_partition_table_lookup(p->word_hash, word);
698         if (keyid != 0) {
699                 data = camel_key_table_lookup(p->word_index, keyid, NULL, &flags);
700                 if (flags & 1)
701                         data = 0;
702         }
703
704         CAMEL_TEXT_INDEX_UNLOCK(idx, lock);
705
706         idc = (CamelIndexCursor *)camel_text_index_cursor_new((CamelTextIndex *)idx, data);
707
708         return idc;
709 }
710
711 static CamelIndexCursor *
712 text_index_words(CamelIndex *idx)
713 {
714         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
715
716         return (CamelIndexCursor *)camel_text_index_key_cursor_new((CamelTextIndex *)idx, p->word_index);
717 }
718
719 static CamelIndexCursor *
720 text_index_names(CamelIndex *idx)
721 {
722         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
723
724         return (CamelIndexCursor *)camel_text_index_key_cursor_new((CamelTextIndex *)idx, p->name_index);
725 }
726
727 static void
728 camel_text_index_class_init(CamelTextIndexClass *klass)
729 {
730         CamelIndexClass *iklass = (CamelIndexClass *)klass;
731
732         camel_text_index_parent = CAMEL_OBJECT_CLASS(camel_type_get_global_classfuncs(camel_object_get_type()));
733
734         iklass->sync = text_index_sync;
735         iklass->compress = text_index_compress;
736         iklass->delete = text_index_delete;
737
738         iklass->rename = text_index_rename;
739
740         iklass->has_name = text_index_has_name;
741         iklass->add_name = text_index_add_name;
742         iklass->write_name = text_index_write_name;
743         iklass->find_name = text_index_find_name;
744         iklass->delete_name = text_index_delete_name;
745         iklass->find = text_index_find;
746
747         iklass->words = text_index_words;
748         iklass->names = text_index_names;
749 }
750
751 static void
752 camel_text_index_init(CamelTextIndex *idx)
753 {
754         struct _CamelTextIndexPrivate *p;
755
756         p = CTI_PRIVATE(idx) = g_malloc0(sizeof(*p));
757
758         e_dlist_init(&p->word_cache);
759         p->words = g_hash_table_new(g_str_hash, g_str_equal);
760         p->word_cache_count = 0;
761         /* this cache size and the block cache size have been tuned for about the best
762            with moderate memory usage.  Doubling the memory usage barely affects performance. */
763         p->word_cache_limit = 4096; /* 1024 = 128K */
764         
765         g_static_rec_mutex_init(&p->lock);
766 }
767
768 static void
769 camel_text_index_finalise(CamelTextIndex *idx)
770 {
771         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
772
773         camel_index_sync((CamelIndex *)idx);
774
775         g_assert(e_dlist_empty(&p->word_cache));
776         g_assert(g_hash_table_size(p->words) == 0);
777
778         g_hash_table_destroy(p->words);
779
780         if (p->word_index)
781                 camel_object_unref((CamelObject *)p->word_index);
782         if (p->word_hash)
783                 camel_object_unref((CamelObject *)p->word_hash);
784         if (p->name_index)
785                 camel_object_unref((CamelObject *)p->name_index);
786         if (p->name_hash)
787                 camel_object_unref((CamelObject *)p->name_hash);
788
789         if (p->blocks)
790                 camel_object_unref((CamelObject *)p->blocks);
791         if (p->links)
792                 camel_object_unref((CamelObject *)p->links);
793         
794         g_static_rec_mutex_free(&p->lock);
795         
796         g_free(p);
797 }
798
799 CamelType
800 camel_text_index_get_type(void)
801 {
802         static CamelType type = CAMEL_INVALID_TYPE;
803         
804         if (type == CAMEL_INVALID_TYPE) {
805                 type = camel_type_register(camel_index_get_type(), "CamelTextIndex",
806                                            sizeof (CamelTextIndex),
807                                            sizeof (CamelTextIndexClass),
808                                            (CamelObjectClassInitFunc) camel_text_index_class_init,
809                                            NULL,
810                                            (CamelObjectInitFunc) camel_text_index_init,
811                                            (CamelObjectFinalizeFunc) camel_text_index_finalise);
812         }
813         
814         return type;
815 }
816
817 static char *
818 text_index_normalise(CamelIndex *idx, const char *in, void *data)
819 {
820         char *word;
821
822         /* Sigh, this is really expensive */
823         /*g_utf8_normalize(in, strlen(in), G_NORMALIZE_ALL);*/
824         word = g_utf8_strdown(in, -1);
825
826         return word;
827 }
828
829 CamelTextIndex *
830 camel_text_index_new(const char *path, int flags)
831 {
832         CamelTextIndex *idx = (CamelTextIndex *)camel_object_new(camel_text_index_get_type());
833         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
834         struct _CamelTextIndexRoot *rb;
835         char *link;
836         CamelBlock *bl;
837
838         camel_index_construct((CamelIndex *)idx, path, flags);
839         camel_index_set_normalise((CamelIndex *)idx, text_index_normalise, NULL);
840
841         p->blocks = camel_block_file_new(idx->parent.path, flags, CAMEL_TEXT_INDEX_VERSION, CAMEL_BLOCK_SIZE);
842         link = alloca(strlen(idx->parent.path)+7);
843         sprintf(link, "%s.data", idx->parent.path);
844         p->links = camel_key_file_new(link, flags, CAMEL_TEXT_INDEX_KEY_VERSION);
845
846         if (p->blocks == NULL || p->links == NULL)
847                 goto fail;
848
849         rb = (struct _CamelTextIndexRoot *)p->blocks->root;
850
851         if (rb->word_index_root == 0) {
852                 bl = camel_block_file_new_block(p->blocks);
853
854                 if (bl == NULL)
855                         goto fail;
856
857                 rb->word_index_root = bl->id;
858                 camel_block_file_unref_block(p->blocks, bl);
859                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
860         }       
861
862         if (rb->word_hash_root == 0) {
863                 bl = camel_block_file_new_block(p->blocks);
864
865                 if (bl == NULL)
866                         goto fail;
867
868                 rb->word_hash_root = bl->id;
869                 camel_block_file_unref_block(p->blocks, bl);
870                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
871         }       
872
873         if (rb->name_index_root == 0) {
874                 bl = camel_block_file_new_block(p->blocks);
875
876                 if (bl == NULL)
877                         goto fail;
878
879                 rb->name_index_root = bl->id;
880                 camel_block_file_unref_block(p->blocks, bl);
881                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
882         }       
883
884         if (rb->name_hash_root == 0) {
885                 bl = camel_block_file_new_block(p->blocks);
886
887                 if (bl == NULL)
888                         goto fail;
889
890                 rb->name_hash_root = bl->id;
891                 camel_block_file_unref_block(p->blocks, bl);
892                 camel_block_file_touch_block(p->blocks, p->blocks->root_block);
893         }       
894
895         p->word_index = camel_key_table_new(p->blocks, rb->word_index_root);
896         p->word_hash = camel_partition_table_new(p->blocks, rb->word_hash_root);
897         p->name_index = camel_key_table_new(p->blocks, rb->name_index_root);
898         p->name_hash = camel_partition_table_new(p->blocks, rb->name_hash_root);
899
900         if (p->word_index == NULL || p->word_hash == NULL
901             || p->name_index == NULL || p->name_hash == NULL) {
902                 camel_object_unref((CamelObject *)idx);
903                 idx = NULL;
904         }
905
906         return idx;
907
908 fail:
909         camel_object_unref((CamelObject *)idx);
910         return NULL;
911 }
912
913 /* returns 0 if the index exists, is valid, and synced, -1 otherwise */
914 int
915 camel_text_index_check(const char *path)
916 {
917         char *block, *key;
918         CamelBlockFile *blocks;
919         CamelKeyFile *keys;
920
921         block = alloca(strlen(path)+7);
922         sprintf(block, "%s.index", path);
923         blocks = camel_block_file_new(block, O_RDONLY, CAMEL_TEXT_INDEX_VERSION, CAMEL_BLOCK_SIZE);
924         if (blocks == NULL) {
925                 io(printf("Check failed: No block file: %s\n", strerror (errno)));
926                 return -1;
927         }
928         key = alloca(strlen(path)+12);
929         sprintf(key, "%s.index.data", path);
930         keys = camel_key_file_new(key, O_RDONLY, CAMEL_TEXT_INDEX_KEY_VERSION);
931         if (keys == NULL) {
932                 io(printf("Check failed: No key file: %s\n", strerror (errno)));
933                 camel_object_unref((CamelObject *)blocks);
934                 return -1;
935         }
936
937         camel_object_unref((CamelObject *)keys);
938         camel_object_unref((CamelObject *)blocks);
939
940         return 0;
941 }
942
943 int
944 camel_text_index_rename(const char *old, const char *new)
945 {
946         char *oldname, *newname;
947         int err;
948
949         /* TODO: camel_text_index_rename should find out if we have an active index and use that instead */
950
951         oldname = alloca(strlen(old)+12);
952         newname = alloca(strlen(new)+12);
953         sprintf(oldname, "%s.index", old);
954         sprintf(newname, "%s.index", new);
955
956         if (g_rename(oldname, newname) == -1 && errno != ENOENT)
957                 return -1;
958
959         sprintf(oldname, "%s.index.data", old);
960         sprintf(newname, "%s.index.data", new);
961
962         if (g_rename(oldname, newname) == -1 && errno != ENOENT) {
963                 err = errno;
964                 sprintf(oldname, "%s.index", old);
965                 sprintf(newname, "%s.index", new);
966                 g_rename(newname, oldname);
967                 errno = err;
968                 return -1;
969         }
970
971         return 0;
972 }
973
974 int
975 camel_text_index_remove(const char *old)
976 {
977         char *block, *key;
978         int ret = 0;
979
980         /* TODO: needs to poke any active indices to remain unlinked */
981
982         block = alloca(strlen(old)+12);
983         key = alloca(strlen(old)+12);
984         sprintf(block, "%s.index", old);
985         sprintf(key, "%s.index.data", old);
986
987         if (g_unlink(block) == -1 && errno != ENOENT)
988                 ret = -1;
989         if (g_unlink(key) == -1 && errno != ENOENT)
990                 ret = -1;
991
992         return ret;
993 }
994
995 /* Debug */
996 void
997 camel_text_index_info(CamelTextIndex *idx)
998 {
999         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
1000         struct _CamelTextIndexRoot *rb = (struct _CamelTextIndexRoot *)p->blocks->root;
1001         int frag;
1002
1003         printf("Path: '%s'\n", idx->parent.path);
1004         printf("Version: %u\n", idx->parent.version);
1005         printf("Flags: %08x\n", idx->parent.flags);
1006         printf("Total words: %u\n", rb->words);
1007         printf("Total names: %u\n", rb->names);
1008         printf("Total deleted: %u\n", rb->deleted);
1009         printf("Total key blocks: %u\n", rb->keys);
1010
1011         if (rb->words > 0) {
1012                 frag = ((rb->keys - rb->words) * 100)/ rb->words;
1013                 printf("Word fragmentation: %d%%\n", frag);
1014         }
1015
1016         if (rb->names > 0) {
1017                 frag = (rb->deleted * 100)/ rb->names;
1018                 printf("Name fragmentation: %d%%\n", frag);
1019         }
1020 }
1021
1022 /* #define DUMP_RAW */
1023
1024 #ifdef DUMP_RAW
1025 enum { KEY_ROOT = 1, KEY_DATA = 2, PARTITION_MAP = 4, PARTITION_DATA = 8 };
1026
1027 static void
1028 add_type(GHashTable *map, camel_block_t id, int type)
1029 {
1030         camel_block_t old;
1031
1032         old = g_hash_table_lookup(map, id);
1033         if (old == type)
1034                 return;
1035
1036         if (old != 0 && old != type)
1037                 g_warning("block %x redefined as type %d, already type %d\n", id, type, old);
1038         g_hash_table_insert(map, id, GINT_TO_POINTER (type|old));
1039 }
1040
1041 static void
1042 add_partition(GHashTable *map, CamelBlockFile *blocks, camel_block_t id)
1043 {
1044         CamelBlock *bl;
1045         CamelPartitionMapBlock *pm;
1046         int i;
1047
1048         while (id) {
1049                 add_type(map, id, PARTITION_MAP);
1050                 bl = camel_block_file_get_block(blocks, id);
1051                 if (bl == NULL) {
1052                         g_warning("couldn't get parition: %x\n", id);
1053                         return;
1054                 }
1055
1056                 pm = (CamelPartitionMapBlock *)&bl->data;
1057                 if (pm->used > sizeof(pm->partition)/sizeof(pm->partition[0])) {
1058                         g_warning("Partition block %x invalid\n", id);
1059                         camel_block_file_unref_block(blocks, bl);
1060                         return;
1061                 }
1062
1063                 for (i=0;i<pm->used;i++)
1064                         add_type(map, pm->partition[i].blockid, PARTITION_DATA);
1065
1066                 id = pm->next;
1067                 camel_block_file_unref_block(blocks, bl);
1068         }
1069 }
1070
1071 static void
1072 add_keys(GHashTable *map, CamelBlockFile *blocks, camel_block_t id)
1073 {
1074         CamelBlock *rbl, *bl;
1075         CamelKeyRootBlock *root;
1076         CamelKeyBlock *kb;
1077
1078         add_type(map, id, KEY_ROOT);
1079         rbl = camel_block_file_get_block(blocks, id);
1080         if (rbl == NULL) {
1081                 g_warning("couldn't get key root: %x\n", id);
1082                 return;
1083         }
1084         root = (CamelKeyRootBlock *)&rbl->data;
1085         id = root->first;
1086
1087         while (id) {
1088                 add_type(map, id, KEY_DATA);
1089                 bl = camel_block_file_get_block(blocks, id);
1090                 if (bl == NULL) {
1091                         g_warning("couldn't get key: %x\n", id);
1092                         break;
1093                 }
1094
1095                 kb = (CamelKeyBlock *)&bl->data;
1096                 id = kb->next;
1097                 camel_block_file_unref_block(blocks, bl);
1098         }
1099
1100         camel_block_file_unref_block(blocks, rbl);
1101 }
1102
1103 static void
1104 dump_raw(GHashTable *map, char *path)
1105 {
1106         char buf[1024];
1107         char line[256];
1108         char *p, c, *e, *a, *o;
1109         int v, n, len, i, type;
1110         char hex[16] = "0123456789ABCDEF";
1111         int fd;
1112         camel_block_t id, total;
1113
1114         fd = g_open(path, O_RDONLY|O_BINARY, 0);
1115         if (fd == -1)
1116                 return;
1117
1118         total = 0;
1119         while ((len = read(fd, buf, 1024)) == 1024) {
1120                 id = total;
1121
1122
1123
1124                 type = g_hash_table_lookup(map, id);
1125                 switch(type) {
1126                 case 0:
1127                         printf(" - unknown -\n");
1128                         break;
1129                 default:
1130                         printf(" - invalid -\n");
1131                         break;
1132                 case KEY_ROOT: {
1133                         CamelKeyRootBlock *r = (CamelKeyRootBlock *)buf;
1134                         printf("Key root:\n");
1135                         printf("First: %08x     Last: %08x     Free: %08x\n", r->first, r->last, r->free);
1136                 } break;
1137                 case KEY_DATA: {
1138                         CamelKeyBlock *k = (CamelKeyBlock *)buf;
1139                         printf("Key data:\n");
1140                         printf("Next: %08x      Used: %u\n", k->next, k->used);
1141                         for (i=0;i<k->used;i++) {
1142                                 if (i == 0)
1143                                         len = sizeof(k->u.keydata);
1144                                 else
1145                                         len = k->u.keys[i-1].offset;
1146                                 len -= k->u.keys[i].offset;
1147                                 printf("[%03d]: %08x %5d %06x %3d '%.*s'\n", i,
1148                                        k->u.keys[i].data, k->u.keys[i].offset, k->u.keys[i].flags,
1149                                        len, len, k->u.keydata+k->u.keys[i].offset);
1150                         }
1151                 } break;
1152                 case PARTITION_MAP: {
1153                         CamelPartitionMapBlock *m = (CamelPartitionMapBlock *)buf;
1154                         printf("Partition map\n");
1155                         printf("Next: %08x      Used: %u\n", m->next, m->used);
1156                         for (i=0;i<m->used;i++) {
1157                                 printf("[%03d]: %08x -> %08x\n", i, m->partition[i].hashid, m->partition[i].blockid);
1158                         }
1159                 } break;
1160                 case PARTITION_DATA: {
1161                         CamelPartitionKeyBlock *k = (CamelPartitionKeyBlock *)buf;
1162                         printf("Partition data\n");
1163                         printf("Used: %u\n", k->used);
1164                 } break;
1165                 }
1166
1167
1168                 printf("--raw--\n");
1169
1170                 len = 1024;
1171                 p = buf;
1172                 do {
1173                         sprintf(line, "%08x:                                                                      ", total);
1174                         total += 16;
1175                         o = line+10;
1176                         a = o+16*2+2;
1177                         i = 0;
1178                         while (len && i<16) {
1179                                 c = *p++;
1180                                 *a++ = isprint(c)?c:'.';
1181                                 *o++ = hex[(c>>4)&0x0f];
1182                                 *o++ = hex[c&0x0f];
1183                                 i++;
1184                                 if (i==8)
1185                                         *o++ = ' ';
1186                                 len--;
1187                         }
1188                         *a = 0;
1189                         printf("%s\n", line);
1190                 } while (len);
1191                 printf("\n");
1192         }
1193         close (fd);
1194 }
1195 #endif
1196
1197 /* Debug */
1198 void
1199 camel_text_index_dump(CamelTextIndex *idx)
1200 {
1201         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
1202 #ifndef DUMP_RAW
1203         camel_key_t keyid;
1204         char *word;
1205         const char *name;
1206         unsigned int flags;
1207         camel_block_t data;
1208
1209         /* Iterate over all names in the file first */
1210
1211         printf("UID's in index\n");
1212
1213         keyid = 0;
1214         while ( (keyid = camel_key_table_next(p->name_index, keyid, &word, &flags, &data)) ) {
1215                 if ((flags & 1) == 0)
1216                         printf(" %s\n", word);
1217                 else
1218                         printf(" %s (deleted)\n", word);
1219                 g_free(word);
1220         }
1221
1222         printf("Word's in index\n");
1223
1224         keyid = 0;
1225         while ( (keyid = camel_key_table_next(p->word_index, keyid, &word, &flags, &data)) ) {
1226                 CamelIndexCursor *idc;
1227
1228                 printf("Word: '%s':\n", word);
1229                 
1230                 idc = camel_index_find((CamelIndex *)idx, word);
1231                 while ( (name = camel_index_cursor_next(idc)) ) {
1232                         printf(" %s", name);
1233                 }
1234                 printf("\n");
1235                 camel_object_unref((CamelObject *)idc);
1236                 g_free(word);
1237         }
1238 #else
1239         /* a more low-level dump routine */
1240         GHashTable *block_type = g_hash_table_new(NULL, NULL);
1241         camel_block_t id;
1242         struct stat st;
1243         int type;
1244
1245         add_keys(block_type, p->blocks, p->word_index->rootid);
1246         add_keys(block_type, p->blocks, p->name_index->rootid);
1247
1248         add_partition(block_type, p->blocks, p->word_hash->rootid);
1249         add_partition(block_type, p->blocks, p->name_hash->rootid);
1250
1251         dump_raw(block_type, p->blocks->path);
1252         g_hash_table_destroy(block_type);
1253 #endif
1254 }
1255
1256 /* more debug stuff */
1257 void
1258 camel_text_index_validate(CamelTextIndex *idx)
1259 {
1260         struct _CamelTextIndexPrivate *p = CTI_PRIVATE(idx);
1261         camel_key_t keyid;
1262         char *word;
1263         const char *name;
1264         unsigned int flags;
1265         camel_block_t data;
1266         char *oldword;
1267         camel_key_t *records;
1268         size_t count;
1269
1270         GHashTable *names, *deleted, *words, *keys, *name_word, *word_word;
1271
1272         names = g_hash_table_new(NULL, NULL);
1273         deleted = g_hash_table_new(NULL, NULL);
1274
1275         name_word = g_hash_table_new(g_str_hash, g_str_equal);
1276
1277         words = g_hash_table_new(NULL, NULL);
1278         keys = g_hash_table_new(NULL, NULL);
1279
1280         word_word = g_hash_table_new(g_str_hash, g_str_equal);
1281
1282         /* Iterate over all names in the file first */
1283
1284         printf("Checking UID consistency\n");
1285
1286         keyid = 0;
1287         while ( (keyid = camel_key_table_next(p->name_index, keyid, &word, &flags, &data)) ) {
1288                 if ((oldword = g_hash_table_lookup(names, GINT_TO_POINTER(keyid))) != NULL
1289                     || (oldword = g_hash_table_lookup(deleted, GINT_TO_POINTER(keyid))) != NULL) {
1290                         printf("Warning, name '%s' duplicates key (%x) with name '%s'\n", word, keyid, oldword);
1291                         g_free(word);
1292                 } else {
1293                         g_hash_table_insert(name_word, word, GINT_TO_POINTER (1));
1294                         if ((flags & 1) == 0) {
1295                                 g_hash_table_insert(names, GINT_TO_POINTER(keyid), word);
1296                         } else {
1297                                 g_hash_table_insert(deleted, GINT_TO_POINTER(keyid), word);
1298                         }
1299                 }
1300         }
1301
1302         printf("Checking WORD member consistency\n");
1303
1304         keyid = 0;
1305         while ( (keyid = camel_key_table_next(p->word_index, keyid, &word, &flags, &data)) ) {
1306                 CamelIndexCursor *idc;
1307                 GHashTable *used;
1308
1309                 /* first, check for duplicates of keyid, and data */
1310                 if ((oldword = g_hash_table_lookup(words, GINT_TO_POINTER(keyid))) != NULL) {
1311                         printf("Warning, word '%s' duplicates key (%x) with name '%s'\n", word, keyid, oldword);
1312                         g_free(word);
1313                         word = oldword;
1314                 } else {
1315                         g_hash_table_insert(words, GINT_TO_POINTER(keyid), word);
1316                 }
1317
1318                 if (data == 0) {
1319                         /* This may not be an issue if things have been removed over time,
1320                            though it is a problem if its a fresh index */
1321                         printf("Word '%s' has no data associated with it\n", word);
1322                 } else {
1323                         if ((oldword = g_hash_table_lookup(keys, GUINT_TO_POINTER(data))) != NULL) {
1324                                 printf("Warning, word '%s' duplicates data (%x) with name '%s'\n", word, data, oldword);
1325                         } else {
1326                                 g_hash_table_insert(keys, GUINT_TO_POINTER(data), word);
1327                         }
1328                 }
1329
1330                 if ((oldword = g_hash_table_lookup(word_word, word)) != NULL) {
1331                         printf("Warning, word '%s' occurs more than once\n", word);
1332                 } else {
1333                         g_hash_table_insert(word_word, word, word);
1334                 }
1335
1336                 used = g_hash_table_new(g_str_hash, g_str_equal);
1337
1338                 idc = camel_index_find((CamelIndex *)idx, word);
1339                 while ( (name = camel_index_cursor_next(idc)) ) {
1340                         if (g_hash_table_lookup(name_word, name) == NULL) {
1341                                 printf("word '%s' references non-existant name '%s'\n", word, name);
1342                         }
1343                         if (g_hash_table_lookup(used, name) != NULL) {
1344                                 printf("word '%s' uses word '%s' more than once\n", word, name);
1345                         } else {
1346                                 g_hash_table_insert(used, g_strdup(name), (void *)1);
1347                         }
1348                 }
1349                 camel_object_unref((CamelObject *)idc);
1350
1351                 g_hash_table_foreach(used, (GHFunc)g_free, NULL);
1352                 g_hash_table_destroy(used);
1353
1354                 printf("word '%s'\n", word);
1355
1356                 while (data) {
1357                         printf(" data %x ", data);
1358                         if (camel_key_file_read(p->links, &data, &count, &records) == -1) {
1359                                 printf("Warning, read failed for word '%s', at data '%u'\n", word, data);
1360                                 data = 0;
1361                         } else {
1362                                 printf("(%d)\n", (int)count);
1363                                 g_free(records);
1364                         }
1365                 }
1366         }
1367
1368         g_hash_table_destroy(names);
1369         g_hash_table_destroy(deleted);
1370         g_hash_table_destroy(words);
1371         g_hash_table_destroy(keys);
1372
1373         g_hash_table_foreach(name_word, (GHFunc)g_free, NULL);
1374         g_hash_table_destroy(name_word);
1375
1376         g_hash_table_foreach(word_word, (GHFunc)g_free, NULL);
1377         g_hash_table_destroy(word_word);
1378 }
1379
1380 /* ********************************************************************** */
1381 /* CamelTextIndexName */
1382 /* ********************************************************************** */
1383
1384 static CamelIndexNameClass *camel_text_index_name_parent;
1385
1386 #define CIN_CLASS(o) ((CamelTextIndexNameClass *)(((CamelObject *)o)->classfuncs))
1387 #define CIN_PRIVATE(o) (((CamelTextIndexName *)(o))->priv)
1388
1389 static void
1390 text_index_name_add_word(CamelIndexName *idn, const char *word)
1391 {
1392         struct _CamelTextIndexNamePrivate *p = ((CamelTextIndexName *)idn)->priv;
1393
1394         if (g_hash_table_lookup(idn->words, word) == NULL) {
1395                 char *w = e_mempool_strdup(p->pool, word);
1396
1397                 g_hash_table_insert(idn->words, w, w);
1398         }
1399 }
1400
1401 /* Why?
1402    Because it doesn't hang/loop forever on bad data
1403    Used to clean up utf8 before it gets further */
1404
1405 static inline guint32
1406 camel_utf8_next(const unsigned char **ptr, const unsigned char *ptrend)
1407 {
1408         register unsigned char *p = (unsigned char *)*ptr;
1409         register unsigned int c;
1410         register guint32 v;
1411         int l;
1412
1413         if (p == ptrend)
1414                 return 0;
1415
1416         while ( (c = *p++) ) {
1417                 if (c < 0x80) {
1418                         *ptr = p;
1419                         return c;
1420                 } else if ((c&0xe0) == 0xc0) {
1421                         v = c & 0x1f;
1422                         l = 1;
1423                 } else if ((c&0xf0) == 0xe0) {
1424                         v = c & 0x0f;
1425                         l = 2;
1426                 } else if ((c&0xf8) == 0xf0) {
1427                         v = c & 0x07;
1428                         l = 3;
1429                 } else if ((c&0xfc) == 0xf8) {
1430                         v = c & 0x03;
1431                         l = 4;
1432                 } else if ((c&0xfe) == 0xfc) {
1433                         v = c & 0x01;
1434                         l = 5;
1435                 } else
1436                         /* Invalid, ignore and look for next start char if room */
1437                         if (p == ptrend) {
1438                                 return 0;
1439                         } else {
1440                                 continue;
1441                         }
1442
1443                 /* bad data or truncated buffer */
1444                 if (p + l > ptrend)
1445                         return 0;
1446
1447                 while (l && ((c = *p) & 0xc0) == 0x80) {
1448                         p++;
1449                         l--;
1450                         v = (v << 6) | (c & 0x3f);
1451                 }
1452
1453                 /* valid char */
1454                 if (l == 0) {
1455                         *ptr = p;
1456                         return v;
1457                 }
1458
1459                 /* else look for a start char again */
1460         }
1461
1462         return 0;
1463 }
1464
1465 static size_t
1466 text_index_name_add_buffer(CamelIndexName *idn, const char *buffer, size_t len)
1467 {
1468         CamelTextIndexNamePrivate *p = CIN_PRIVATE(idn);
1469         const unsigned char *ptr, *ptrend;
1470         guint32 c;
1471         unsigned char utf8[8];
1472         size_t utf8len;
1473
1474         if (buffer == NULL) {
1475                 if (p->buffer->len) {
1476                         camel_index_name_add_word(idn, p->buffer->str);
1477                         g_string_truncate(p->buffer, 0);
1478                 }
1479                 return 0;
1480         }
1481
1482         ptr = buffer;
1483         ptrend = buffer+len;
1484         while ((c = camel_utf8_next(&ptr, ptrend))) {
1485                 if (g_unichar_isalnum(c)) {
1486                         c = g_unichar_tolower(c);
1487                         utf8len = g_unichar_to_utf8(c, utf8);
1488                         utf8[utf8len] = 0;
1489                         g_string_append(p->buffer, utf8);
1490                 } else {
1491                         if (p->buffer->len > 0 && p->buffer->len <= CAMEL_TEXT_INDEX_MAX_WORDLEN) {
1492                                 text_index_name_add_word(idn, p->buffer->str);
1493                                 /*camel_index_name_add_word(idn, p->buffer->str);*/
1494                         }
1495                         
1496                         g_string_truncate (p->buffer, 0);
1497                 }
1498         }
1499
1500         return 0;
1501 }
1502
1503 static void
1504 camel_text_index_name_class_init(CamelTextIndexNameClass *klass)
1505 {
1506         CamelIndexNameClass *nklass = (CamelIndexNameClass *)klass;
1507
1508         camel_text_index_name_parent = CAMEL_INDEX_NAME_CLASS(camel_type_get_global_classfuncs(camel_index_name_get_type()));
1509
1510         nklass->add_word = text_index_name_add_word;
1511         nklass->add_buffer = text_index_name_add_buffer;
1512 }
1513
1514 static void
1515 camel_text_index_name_init(CamelTextIndexName *idn)
1516 {
1517         CamelTextIndexNamePrivate *p;
1518
1519         idn->parent.words = g_hash_table_new(g_str_hash, g_str_equal);
1520
1521         p = idn->priv = g_malloc0(sizeof(*idn->priv));
1522         p->buffer = g_string_new("");
1523         p->pool = e_mempool_new(256, 128, E_MEMPOOL_ALIGN_BYTE);
1524 }
1525
1526 static void
1527 camel_text_index_name_finalise(CamelTextIndexName *idn)
1528 {
1529         CamelTextIndexNamePrivate *p = CIN_PRIVATE(idn);
1530         
1531         g_hash_table_destroy(idn->parent.words);
1532
1533         g_string_free(p->buffer, TRUE);
1534         e_mempool_destroy(p->pool);
1535
1536         g_free(p);
1537 }
1538
1539 CamelType
1540 camel_text_index_name_get_type(void)
1541 {
1542         static CamelType type = CAMEL_INVALID_TYPE;
1543         
1544         if (type == CAMEL_INVALID_TYPE) {
1545                 type = camel_type_register(camel_index_name_get_type(), "CamelTextIndexName",
1546                                            sizeof (CamelTextIndexName),
1547                                            sizeof (CamelTextIndexNameClass),
1548                                            (CamelObjectClassInitFunc) camel_text_index_name_class_init,
1549                                            NULL,
1550                                            (CamelObjectInitFunc) camel_text_index_name_init,
1551                                            (CamelObjectFinalizeFunc) camel_text_index_name_finalise);
1552         }
1553         
1554         return type;
1555 }
1556
1557 CamelTextIndexName *
1558 camel_text_index_name_new(CamelTextIndex *idx, const char *name, camel_key_t nameid)
1559 {
1560         CamelTextIndexName *idn = (CamelTextIndexName *)camel_object_new(camel_text_index_name_get_type());
1561         CamelIndexName *cin = &idn->parent;
1562         CamelTextIndexNamePrivate *p = CIN_PRIVATE(idn);
1563
1564         cin->index = (CamelIndex *)idx;
1565         camel_object_ref((CamelObject *)idx);
1566         cin->name = e_mempool_strdup(p->pool, name);
1567         p->nameid = nameid;
1568
1569         return idn;
1570 }
1571
1572 /* ********************************************************************** */
1573 /* CamelTextIndexCursor */
1574 /* ********************************************************************** */
1575
1576 static CamelIndexCursorClass *camel_text_index_cursor_parent;
1577
1578 #define CIC_CLASS(o) ((CamelTextIndexCursorClass *)(((CamelObject *)o)->classfuncs))
1579 #define CIC_PRIVATE(o) (((CamelTextIndexCursor *)(o))->priv)
1580
1581 static const char *
1582 text_index_cursor_next(CamelIndexCursor *idc)
1583 {
1584         struct _CamelTextIndexCursorPrivate *p = CIC_PRIVATE(idc);
1585         struct _CamelTextIndexPrivate *tip = CTI_PRIVATE(idc->index);
1586         unsigned int flags;
1587
1588         c(printf("Going to next cursor for word with data '%08x' next %08x\n", p->first, p->next));
1589
1590         do {
1591                 while (p->record_index >= p->record_count) {
1592                         g_free(p->records);
1593                         p->records = NULL;
1594                         p->record_index = 0;
1595                         p->record_count = 0;
1596                         if (p->next == 0)
1597                                 return NULL;
1598                         if (camel_key_file_read(tip->links, &p->next, &p->record_count, &p->records) == -1)
1599                                 return NULL;
1600                 }
1601
1602                 g_free(p->current);
1603                 camel_key_table_lookup(tip->name_index, p->records[p->record_index], &p->current, &flags);
1604                 if (flags & 1) {
1605                         g_free(p->current);
1606                         p->current = NULL;
1607                 }               
1608                 p->record_index++;
1609         } while (p->current == NULL);
1610
1611         return p->current;
1612 }
1613
1614 static void
1615 text_index_cursor_reset(CamelIndexCursor *idc)
1616 {
1617         struct _CamelTextIndexCursorPrivate *p = CIC_PRIVATE(idc);
1618
1619         g_free(p->records);
1620         p->records = NULL;
1621         g_free(p->current);
1622         p->current = NULL;
1623         p->record_count = 0;
1624         p->record_index = 0;
1625         p->next = p->first;
1626 }
1627
1628 static void
1629 camel_text_index_cursor_class_init(CamelTextIndexCursorClass *klass)
1630 {
1631         CamelIndexCursorClass *cklass = (CamelIndexCursorClass *)klass;
1632
1633         camel_text_index_cursor_parent = CAMEL_INDEX_CURSOR_CLASS(camel_type_get_global_classfuncs(camel_index_cursor_get_type()));
1634
1635         cklass->next = text_index_cursor_next;
1636         cklass->reset = text_index_cursor_reset;
1637 }
1638
1639 static void
1640 camel_text_index_cursor_init(CamelTextIndexCursor *idc)
1641 {
1642         CIC_PRIVATE(idc) = g_malloc0(sizeof(struct _CamelTextIndexCursorPrivate));
1643 }
1644
1645 static void
1646 camel_text_index_cursor_finalise(CamelTextIndexCursor *idc)
1647 {
1648         struct _CamelTextIndexCursorPrivate *p = CIC_PRIVATE(idc);
1649
1650         g_free(p->records);
1651         g_free(p->current);
1652         g_free(p);
1653 }
1654
1655 CamelType
1656 camel_text_index_cursor_get_type(void)
1657 {
1658         static CamelType type = CAMEL_INVALID_TYPE;
1659         
1660         if (type == CAMEL_INVALID_TYPE) {
1661                 type = camel_type_register(camel_index_cursor_get_type(), "CamelTextIndexCursor",
1662                                            sizeof (CamelTextIndexCursor),
1663                                            sizeof (CamelTextIndexCursorClass),
1664                                            (CamelObjectClassInitFunc) camel_text_index_cursor_class_init,
1665                                            NULL,
1666                                            (CamelObjectInitFunc) camel_text_index_cursor_init,
1667                                            (CamelObjectFinalizeFunc) camel_text_index_cursor_finalise);
1668         }
1669         
1670         return type;
1671 }
1672
1673 CamelTextIndexCursor *
1674 camel_text_index_cursor_new(CamelTextIndex *idx, camel_block_t data)
1675 {
1676         CamelTextIndexCursor *idc = (CamelTextIndexCursor *)camel_object_new(camel_text_index_cursor_get_type());
1677         CamelIndexCursor *cic = &idc->parent;
1678         struct _CamelTextIndexCursorPrivate *p = CIC_PRIVATE(idc);
1679
1680         cic->index = (CamelIndex *)idx;
1681         camel_object_ref((CamelObject *)idx);
1682         p->first = data;
1683         p->next = data;
1684         p->record_count = 0;
1685         p->record_index = 0;
1686
1687         return idc;
1688 }
1689
1690 /* ********************************************************************** */
1691 /* CamelTextIndexKeyCursor */
1692 /* ********************************************************************** */
1693
1694 static CamelIndexCursorClass *camel_text_index_key_cursor_parent;
1695
1696 #define CIKC_CLASS(o) ((CamelTextIndexKeyCursorClass *)(((CamelObject *)o)->classfuncs))
1697 #define CIKC_PRIVATE(o) (((CamelTextIndexKeyCursor *)(o))->priv)
1698
1699 static const char *
1700 text_index_key_cursor_next(CamelIndexCursor *idc)
1701 {
1702         struct _CamelTextIndexKeyCursorPrivate *p = CIKC_PRIVATE(idc);
1703
1704         c(printf("Going to next cursor for keyid %08x\n", p->keyid));
1705
1706         g_free(p->current);
1707         p->current = NULL;
1708
1709         while ( (p->keyid = camel_key_table_next(p->table, p->keyid, &p->current, &p->flags, &p->data)) ) {
1710                 if ((p->flags & 1) == 0) {
1711                         return p->current;
1712                 } else {
1713                         g_free(p->current);
1714                         p->current = NULL;
1715                 }
1716         }
1717
1718         return NULL;
1719 }
1720
1721 static void
1722 text_index_key_cursor_reset(CamelIndexCursor *idc)
1723 {
1724         struct _CamelTextIndexKeyCursorPrivate *p = CIKC_PRIVATE(idc);
1725
1726         p->keyid = 0;
1727         p->flags = 0;
1728         p->data = 0;
1729         g_free(p->current);
1730         p->current = NULL;
1731 }
1732
1733 static void
1734 camel_text_index_key_cursor_class_init(CamelTextIndexKeyCursorClass *klass)
1735 {
1736         CamelIndexCursorClass *cklass = (CamelIndexCursorClass *)klass;
1737
1738         camel_text_index_key_cursor_parent = CAMEL_INDEX_CURSOR_CLASS(camel_type_get_global_classfuncs(camel_index_cursor_get_type()));
1739
1740         cklass->next = text_index_key_cursor_next;
1741         cklass->reset = text_index_key_cursor_reset;
1742 }
1743
1744 static void
1745 camel_text_index_key_cursor_init(CamelTextIndexKeyCursor *idc)
1746 {
1747         struct _CamelTextIndexKeyCursorPrivate *p;
1748
1749         p = idc->priv = g_malloc0(sizeof(struct _CamelTextIndexKeyCursorPrivate));
1750         p->keyid = 0;
1751         p->flags = 0;
1752         p->data = 0;
1753         p->current = NULL;
1754 }
1755
1756 static void
1757 camel_text_index_key_cursor_finalise(CamelTextIndexKeyCursor *idc)
1758 {
1759         struct _CamelTextIndexKeyCursorPrivate *p = CIKC_PRIVATE(idc);
1760
1761         g_free(p->current);
1762         if (p->table)
1763                 camel_object_unref((CamelObject *)p->table);
1764         g_free(p);
1765 }
1766
1767 CamelType
1768 camel_text_index_key_cursor_get_type(void)
1769 {
1770         static CamelType type = CAMEL_INVALID_TYPE;
1771         
1772         if (type == CAMEL_INVALID_TYPE) {
1773                 type = camel_type_register(camel_index_cursor_get_type(), "CamelTextIndexKeyCursor",
1774                                            sizeof (CamelTextIndexKeyCursor),
1775                                            sizeof (CamelTextIndexKeyCursorClass),
1776                                            (CamelObjectClassInitFunc) camel_text_index_key_cursor_class_init,
1777                                            NULL,
1778                                            (CamelObjectInitFunc) camel_text_index_key_cursor_init,
1779                                            (CamelObjectFinalizeFunc) camel_text_index_key_cursor_finalise);
1780         }
1781         
1782         return type;
1783 }
1784
1785 CamelTextIndexKeyCursor *
1786 camel_text_index_key_cursor_new(CamelTextIndex *idx, CamelKeyTable *table)
1787 {
1788         CamelTextIndexKeyCursor *idc = (CamelTextIndexKeyCursor *)camel_object_new(camel_text_index_key_cursor_get_type());
1789         CamelIndexCursor *cic = &idc->parent;
1790         struct _CamelTextIndexKeyCursorPrivate *p = CIKC_PRIVATE(idc);
1791
1792         cic->index = (CamelIndex *)idx;
1793         camel_object_ref((CamelObject *)idx);
1794         p->table = table;
1795         camel_object_ref((CamelObject *)table);
1796
1797         return idc;
1798 }
1799
1800 /* ********************************************************************** */
1801
1802 #define m(x)
1803
1804 #if 0
1805
1806 struct _CamelIndexRoot {
1807         struct _CamelBlockRoot root;
1808
1809         camel_block_t word_root; /* a keyindex containing the keyid -> word mapping */
1810         camel_block_t word_hash_root; /* a partitionindex containing word -> keyid mapping */
1811
1812         camel_block_t name_root; /* same, for names */
1813         camel_block_t name_hash_root;
1814 };
1815
1816 char wordbuffer[] = "This is a buffer of multiple words.  Some of the words are duplicates"
1817 " while other words are the same, some are in difFerenT Different different case cAsE casE,"
1818 " with,with:with;with-with'with\"'\"various punctuation as well.  So much for those Words. and 10"
1819 " numbers in a row too 1,2,3,4,5,6,7,8,9,10!  Yay!.";
1820
1821 int main(int argc, char **argv)
1822 {
1823 #if 0
1824         CamelBlockFile *bs;
1825         CamelKeyTable *ki;
1826         CamelPartitionTable *cpi;
1827         CamelBlock *keyroot, *partroot;
1828         struct _CamelIndexRoot *root;
1829         FILE *fp;
1830         char line[256], *key;
1831         camel_key_t keyid;
1832         int index = 0, flags, data;
1833 #endif
1834         CamelIndex *idx;
1835         CamelIndexName *idn;
1836         CamelIndexCursor *idc;
1837         const char *word;
1838         int i;
1839
1840         printf("Camel text index tester!\n");
1841
1842         g_thread_init(NULL);
1843         camel_init(NULL, 0);
1844
1845         idx = (CamelIndex *)camel_text_index_new("textindex", O_CREAT|O_RDWR|O_TRUNC);
1846
1847
1848 #if 1
1849         camel_index_compress(idx);
1850
1851         return 0;
1852 #endif
1853
1854         for (i=0;i<100;i++) {
1855                 char name[16];
1856
1857                 sprintf(name, "%d", i);
1858                 printf("Adding words to name '%s'\n", name);
1859                 idn = camel_index_add_name(idx, name);
1860                 camel_index_name_add_buffer(idn, wordbuffer, sizeof(wordbuffer)-1);
1861                 camel_index_write_name(idx, idn);
1862                 camel_object_unref((CamelObject *)idn);
1863         }
1864
1865         printf("Looking up which names contain word 'word'\n");
1866         idc = camel_index_find(idx, "words");
1867         while ( (word = camel_index_cursor_next(idc)) != NULL ) {
1868                 printf(" name is '%s'\n", word);
1869         }
1870         camel_object_unref((CamelObject *)idc);
1871         printf("done.\n");
1872
1873         printf("Looking up which names contain word 'truncate'\n");
1874         idc = camel_index_find(idx, "truncate");
1875         while ( (word = camel_index_cursor_next(idc)) != NULL ) {
1876                 printf(" name is '%s'\n", word);
1877         }
1878         camel_object_unref((CamelObject *)idc);
1879         printf("done.\n");
1880
1881         camel_index_sync(idx);
1882         camel_object_unref((CamelObject *)idx);
1883
1884 #if 0
1885         bs = camel_block_file_new("blocks", "TESTINDX", CAMEL_BLOCK_SIZE);
1886
1887         root = (struct _CamelIndexRoot *)bs->root;
1888         if (root->word_root == 0) {
1889                 keyroot = camel_block_file_new_block(bs);
1890                 root->word_root = keyroot->id;
1891                 camel_block_file_touch_block(bs, bs->root_block);
1892         }
1893         if (root->word_hash_root == 0) {
1894                 partroot = camel_block_file_new_block(bs);
1895                 root->word_hash_root = partroot->id;
1896                 camel_block_file_touch_block(bs, bs->root_block);
1897         }
1898
1899         ki = camel_key_table_new(bs, root->word_root);
1900         cpi = camel_partition_table_new(bs, root->word_hash_root);
1901
1902         fp = fopen("/usr/dict/words", "r");
1903         if (fp == NULL) {
1904                 perror("fopen");
1905                 return 1;
1906         }
1907
1908         while (fgets(line, sizeof(line), fp) != NULL) {
1909                 line[strlen(line)-1] = 0;
1910
1911                 /* see if its already there */
1912                 keyid = camel_partition_table_lookup(cpi, line);                
1913                 if (keyid == 0) {
1914                         m(printf("Adding word '%s' %d\n", line, index));
1915
1916                         keyid = camel_key_table_add(ki, line, index, 0);
1917                         m(printf(" key = %08x\n", keyid));
1918
1919                         camel_partition_table_add(cpi, line, keyid);
1920
1921                         m(printf("Lookup word '%s'\n", line));
1922                         keyid = camel_partition_table_lookup(cpi, line);                
1923                         m(printf(" key = %08x\n", keyid));
1924                 }
1925
1926                 m(printf("Lookup key %08x\n", keyid));
1927
1928                 camel_key_table_set_flags(ki, keyid, index, 1);
1929
1930                 data = camel_key_table_lookup(ki, keyid, &key, &flags);
1931                 m(printf(" word = '%s' %d %04x\n", key, data, flags));
1932
1933                 g_assert(data == index && strcmp(key, line) == 0);
1934
1935                 g_free(key);
1936
1937                 index++;
1938         }
1939
1940         printf("Scanning again\n");
1941         fseek(fp, SEEK_SET, 0);
1942         index = 0;
1943         while (fgets(line, sizeof(line), fp) != NULL) {
1944                 line[strlen(line)-1] = 0;
1945                 m(printf("Lookup word '%s' %d\n", line, index));
1946                 keyid = camel_partition_table_lookup(cpi, line);                
1947                 m(printf(" key = %08d\n", keyid));
1948
1949                 m(printf("Lookup key %08x\n", keyid));
1950                 data = camel_key_table_lookup(ki, keyid, &key, &flags);
1951                 m(printf(" word = '%s' %d\n", key, data));
1952
1953                 g_assert(data == index && strcmp(key, line) == 0);
1954
1955                 g_free(key);
1956
1957                 index++;
1958         }
1959         fclose (fp);
1960
1961         printf("Freeing partition index\n");
1962         camel_partition_table_free(cpi);
1963
1964         printf("Syncing block file\n");
1965         camel_block_file_sync(bs);
1966 #endif
1967         return 0;
1968 }
1969
1970 #endif