1 /* Fast fuzzy searching among messages.
2 Copyright (C) 2006, 2008, 2011, 2015 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2006.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>. */
23 #include "msgl-fsearch.h"
29 #include "po-charset.h"
32 /* Fuzzy searching of L strings in a large set of N messages (assuming
33 they have all the same small size) takes O(L * N) when using repeated
34 fstrcmp() calls. When for example L = 800 and N = 69000, this is slow.
36 So we preprocess the set of N messages, yielding a data structure
37 that allows to select the similar messages for any given string, and
38 apply fstrcmp() only to the search result. This allows to reduce the
39 time to something between O(L * 1) and O(L * N) - depending on the
40 structure of the strings. In the example with L = 800 and N = 69000,
41 memory use increases by a factor of 2 and the time decreases from
44 The data structure is a hash table mapping each n-gram (here with n=4)
45 occurring in the message strings to the list of messages that contains
46 it. For example, if the message list is
49 the index is a hash table mapping
54 Searching the similar messages to, say, "closest", is done by looking up
55 all its 4-grams in the hash table and summing up the results:
60 => total: { 2x0, 1x1 }
61 The list is sorted according to decreasing frequency: { 0, 1, ... }
62 and only the first few messages from this frequency list are passed to
65 The n-gram similarity and the fstrcmp() similarity are quite different
66 metrics. For example, "close" and "c l o s e" have no n-grams in common
67 (even for n as small as n = 2); however, fstrcmp() will find that they
68 are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and
69 "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
70 only 0.02. Also, repeated n-grams don't have the same effect on the
71 two similarity measures. But all this doesn't matter much in practice.
73 We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
74 lists are likely too long. (Well, with ideographic languages like Chinese,
75 n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case
78 The units are characters in the current encoding. Not just bytes,
79 because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */
81 /* Each message is represented by its index in the message list. */
82 typedef unsigned int index_ty;
84 /* An index list has its allocated size and length tacked at the front.
85 The indices are sorted in ascending order. */
86 typedef index_ty *index_list_ty;
87 #define IL_ALLOCATED 0
90 /* Create a new index list containing only a given index. */
91 static inline index_list_ty
92 new_index (index_ty idx)
94 index_ty *list = XNMALLOC (2 + 1, index_ty);
95 list[IL_ALLOCATED] = 1;
102 /* Add a given index, greater or equal than all previous indices, to an
104 Return the new index list, if it had to be reallocated, or NULL if it
106 static inline index_list_ty
107 addlast_index (index_list_ty list, index_ty idx)
109 index_list_ty result;
110 size_t length = list[IL_LENGTH];
112 /* Look whether it should be inserted. */
113 if (length > 0 && list[2 + (length - 1)] == idx)
116 /* Now make room for one more list element. */
118 if (length == list[IL_ALLOCATED])
120 size_t new_allocated = 2 * length - (length >> 6);
121 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
122 list[IL_ALLOCATED] = new_allocated;
125 list[2 + length] = idx;
126 list[IL_LENGTH] = length + 1;
130 /* Add a given index to an index list.
131 Return the new index list, if it had to be reallocated, or NULL if it
133 static inline index_list_ty
134 add_index (index_list_ty list, index_ty idx)
136 index_list_ty result;
137 size_t length = list[IL_LENGTH];
139 /* Look where it should be inserted. */
144 /* Here we know that idx must be inserted at a position >= lo, <= hi. */
145 size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
146 index_ty val = list[2 + mid];
155 /* Now make room for one more list element. */
157 if (length == list[IL_ALLOCATED])
159 size_t new_allocated = 2 * length - (length >> 6);
160 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
161 list[IL_ALLOCATED] = new_allocated;
164 list[IL_LENGTH] = length + 1;
165 for (; length > hi; length--)
166 list[2 + length] = list[1 + length];
167 list[2 + length] = idx;
171 /* We use 4-grams, therefore strings with less than 4 characters cannot be
172 handled through the 4-grams table and need to be handled specially.
173 Since every character occupies at most 4 bytes (see po-charset.c),
174 this means the size of such short strings is bounded by: */
175 #define SHORT_STRING_MAX_CHARACTERS (4 - 1)
176 #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
178 /* Such short strings are handled by direct comparison with all messages
179 of appropriate size. Note that for two strings of length 0 <= l1 <= l2,
180 fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in
181 fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
182 limit the search to lengths l' in the range
183 l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
184 Thus we need the list of the short strings up to length: */
185 #if !defined __SUNPRO_C
186 # define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))
188 /* Sun C on Solaris 8 cannot compute this constant expression. */
189 # define SHORT_MSG_MAX 28
192 /* A fuzzy index contains a hash table mapping all n-grams to their
194 struct message_fuzzy_index_ty
196 message_ty **messages;
197 character_iterator_t iterator;
200 message_list_ty **short_messages;
203 /* Allocate a fuzzy index corresponding to a given list of messages.
204 The list of messages and the msgctxt and msgid fields of the messages
205 inside it must not be modified while the returned fuzzy index is in use. */
206 message_fuzzy_index_ty *
207 message_fuzzy_index_alloc (const message_list_ty *mlp,
208 const char *canon_charset)
210 message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty);
211 size_t count = mlp->nitems;
215 findex->messages = mlp->item;
216 findex->iterator = po_charset_character_iterator (canon_charset);
218 /* Setup hash table. */
219 if (hash_init (&findex->gram4, 10 * count) < 0)
221 for (j = 0; j < count; j++)
223 message_ty *mp = mlp->item[j];
225 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
227 const char *str = mp->msgid;
229 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
230 const char *p0 = str;
233 const char *p1 = p0 + findex->iterator (p0);
236 const char *p2 = p1 + findex->iterator (p1);
239 const char *p3 = p2 + findex->iterator (p2);
242 const char *p4 = p3 + findex->iterator (p3);
245 /* The segment from p0 to p4 is a 4-gram of
246 characters. Add a hash table entry that maps
247 it to the index j, or extend the existing
248 hash table entry accordingly. */
251 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
254 index_list_ty list = (index_list_ty) found;
255 list = addlast_index (list, j);
257 hash_set_value (&findex->gram4, p0, p4 - p0,
261 hash_insert_entry (&findex->gram4, p0, p4 - p0,
271 p4 = p4 + findex->iterator (p4);
280 /* Shrink memory used by the hash table. */
288 while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
291 index_list_ty list = (index_list_ty) *valuep;
292 index_ty length = list[IL_LENGTH];
294 if (length < list[IL_ALLOCATED])
296 list[IL_ALLOCATED] = length;
297 *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
302 findex->firstfew = (int) sqrt ((double) count);
303 if (findex->firstfew < 10)
304 findex->firstfew = 10;
306 /* Setup lists of short messages. */
307 findex->short_messages = XNMALLOC (SHORT_MSG_MAX + 1, message_list_ty *);
308 for (l = 0; l <= SHORT_MSG_MAX; l++)
309 findex->short_messages[l] = message_list_alloc (false);
310 for (j = 0; j < count; j++)
312 message_ty *mp = mlp->item[j];
314 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
316 const char *str = mp->msgid;
317 size_t len = strlen (str);
319 if (len <= SHORT_MSG_MAX)
320 message_list_append (findex->short_messages[len], mp);
324 /* Shrink memory used by the lists of short messages. */
325 for (l = 0; l <= SHORT_MSG_MAX; l++)
327 message_list_ty *mlp = findex->short_messages[l];
329 if (mlp->nitems < mlp->nitems_max)
331 mlp->nitems_max = mlp->nitems;
334 xrealloc (mlp->item, mlp->nitems_max * sizeof (message_ty *));
341 /* An index with multiplicity. */
348 /* A list of indices with multiplicity.
349 The indices are sorted in ascending order. */
350 struct mult_index_list
352 struct mult_index *item;
356 struct mult_index *item2;
360 /* Initialize an empty list of indices with multiplicity. */
362 mult_index_list_init (struct mult_index_list *accu)
365 accu->nitems_max = 0;
367 accu->nitems2_max = 0;
371 /* Add an index list to a list of indices with multiplicity. */
373 mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
375 size_t len1 = accu->nitems;
376 size_t len2 = list[IL_LENGTH];
377 size_t need = len1 + len2;
378 struct mult_index *ptr1;
379 struct mult_index *ptr1_end;
382 struct mult_index *destptr;
384 /* Make the work area large enough. */
385 if (accu->nitems2_max < need)
387 size_t new_max = 2 * accu->nitems2_max + 1;
391 if (accu->item2 != NULL)
393 accu->item2 = XNMALLOC (new_max, struct mult_index);
394 accu->nitems2_max = new_max;
397 /* Make a linear pass through accu and list simultaneously. */
399 ptr1_end = ptr1 + len1;
401 ptr2_end = ptr2 + len2;
402 destptr = accu->item2;
403 while (ptr1 < ptr1_end && ptr2 < ptr2_end)
405 if (ptr1->index < *ptr2)
410 else if (ptr1->index > *ptr2)
412 destptr->index = *ptr2;
416 else /* ptr1->index == list[2 + i2] */
418 destptr->index = ptr1->index;
419 destptr->count = ptr1->count + 1;
425 while (ptr1 < ptr1_end)
431 while (ptr2 < ptr2_end)
433 destptr->index = *ptr2;
439 /* Swap accu->item and accu->item2. */
441 struct mult_index *dest = accu->item2;
442 size_t dest_max = accu->nitems2_max;
444 accu->item2 = accu->item;
445 accu->nitems2_max = accu->nitems_max;
448 accu->nitems = destptr - dest;
449 accu->nitems_max = dest_max;
453 /* Compares two indices with multiplicity, according to their multiplicity. */
455 mult_index_compare (const void *p1, const void *p2)
457 const struct mult_index *ptr1 = (const struct mult_index *) p1;
458 const struct mult_index *ptr2 = (const struct mult_index *) p2;
460 if (ptr1->count < ptr2->count)
462 if (ptr1->count > ptr2->count)
464 /* For reproduceable results, also take into account the indices. */
465 if (ptr1->index > ptr2->index)
467 if (ptr1->index < ptr2->index)
472 /* Sort a list of indices with multiplicity according to decreasing
475 mult_index_list_sort (struct mult_index_list *accu)
477 if (accu->nitems > 1)
478 qsort (accu->item, accu->nitems, sizeof (struct mult_index),
482 /* Frees a list of indices with multiplicity. */
484 mult_index_list_free (struct mult_index_list *accu)
486 if (accu->item != NULL)
488 if (accu->item2 != NULL)
492 /* Find a good match for the given msgctxt and msgid in the given fuzzy index.
493 The match does not need to be optimal.
494 Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
495 LOWER_BOUND must be >= FUZZY_THRESHOLD.
496 If HEURISTIC is true, only the few best messages among the list - according
497 to a certain heuristic - are considered. If HEURISTIC is false, all
498 messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
499 like in message_list_search_fuzzy (except that in ambiguous cases where
500 several best matches exist, message_list_search_fuzzy chooses the one with
501 the smallest index whereas message_fuzzy_index_search makes a better
504 message_fuzzy_index_search (message_fuzzy_index_ty *findex,
505 const char *msgctxt, const char *msgid,
509 const char *str = msgid;
511 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
512 const char *p0 = str;
515 const char *p1 = p0 + findex->iterator (p0);
518 const char *p2 = p1 + findex->iterator (p1);
521 const char *p3 = p2 + findex->iterator (p2);
524 const char *p4 = p3 + findex->iterator (p3);
525 struct mult_index_list accu;
527 mult_index_list_init (&accu);
530 /* The segment from p0 to p4 is a 4-gram of
531 characters. Get the hash table entry containing
532 a list of indices, and add it to the accu. */
535 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
538 index_list_ty list = (index_list_ty) found;
539 mult_index_list_accumulate (&accu, list);
549 p4 = p4 + findex->iterator (p4);
552 /* Sort in decreasing count order. */
553 mult_index_list_sort (&accu);
555 /* Iterate over this sorted list, and maximize the
556 fuzzy_search_goal_function() result.
557 If HEURISTIC is true, take only the first few messages.
558 If HEURISTIC is false, consider all messages - to match
559 the behaviour of message_list_search_fuzzy -, but process
560 them in the order of the sorted list. This increases
561 the chances that the later calls to fstrcmp_bounded() (via
562 fuzzy_search_goal_function()) terminate quickly, thanks
563 to the best_weight which will be quite high already after
564 the first few messages. */
567 struct mult_index *ptr;
574 if (count > findex->firstfew)
575 count = findex->firstfew;
578 best_weight = lower_bound;
580 for (ptr = accu.item; count > 0; ptr++, count--)
582 message_ty *mp = findex->messages[ptr->index];
584 fuzzy_search_goal_function (mp, msgctxt, msgid,
587 if (weight > best_weight)
589 best_weight = weight;
594 mult_index_list_free (&accu);
603 /* The string had less than 4 characters. */
605 size_t l = strlen (str);
610 if (!(l <= SHORT_STRING_MAX_BYTES))
613 /* Walk through those short messages which have an appropriate length.
614 See the comment before SHORT_MSG_MAX. */
615 lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
616 lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
617 if (!(lmax <= SHORT_MSG_MAX))
620 best_weight = lower_bound;
622 for (l = lmin; l <= lmax; l++)
624 message_list_ty *mlp = findex->short_messages[l];
627 for (j = 0; j < mlp->nitems; j++)
629 message_ty *mp = mlp->item[j];
631 fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight);
633 if (weight > best_weight)
635 best_weight = weight;
645 /* Free a fuzzy index. */
647 message_fuzzy_index_free (message_fuzzy_index_ty *findex)
655 /* Free the short lists. */
656 for (l = 0; l <= SHORT_MSG_MAX; l++)
657 message_list_free (findex->short_messages[l], 1);
658 free (findex->short_messages);
660 /* Free the index lists occurring as values in the hash tables. */
662 while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
663 free ((index_list_ty *) data);
664 /* Free the hash table itself. */
665 hash_destroy (&findex->gram4);