1 /* Fast fuzzy searching among messages.
2 Copyright (C) 2006, 2008, 2011, 2013, 2023 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2006.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "msgl-fsearch.h"
24
25 #include <math.h>
26 #include <stdlib.h>
27
28 #include "xalloc.h"
29 #include "po-charset.h"
30
31
32 /* Fuzzy searching of L strings in a large set of N messages (assuming
33 they have all the same small size) takes O(L * N) when using repeated
34 fstrcmp() calls. When for example L = 800 and N = 69000, this is slow.
35
36 So we preprocess the set of N messages, yielding a data structure
37 that allows to select the similar messages for any given string, and
38 apply fstrcmp() only to the search result. This allows to reduce the
39 time to something between O(L * 1) and O(L * N) - depending on the
40 structure of the strings. In the example with L = 800 and N = 69000,
41 memory use increases by a factor of 2 and the time decreases from
42 9068 s to 230 s.
43
44 The data structure is a hash table mapping each n-gram (here with n=4)
45 occurring in the message strings to the list of messages that contains
46 it. For example, if the message list is
47 [0] "close"
48 [1] "losers"
49 the index is a hash table mapping
50 "clos" -> { 0 }
51 "lose" -> { 0, 1 }
52 "oser" -> { 1 }
53 "sers" -> { 1 }
54 Searching the similar messages to, say, "closest", is done by looking up
55 all its 4-grams in the hash table and summing up the results:
56 "clos" -> { 0 }
57 "lose" -> { 0, 1 }
58 "oses" -> {}
59 "sest" -> {}
60 => total: { 2x0, 1x1 }
61 The list is sorted according to decreasing frequency: { 0, 1, ... }
62 and only the first few messages from this frequency list are passed to
63 fstrcmp().
64
65 The n-gram similarity and the fstrcmp() similarity are quite different
66 metrics. For example, "close" and "c l o s e" have no n-grams in common
67 (even for n as small as n = 2); however, fstrcmp() will find that they
68 are quite similar (= 0.71). Conversely, "AAAA BBBB ... ZZZZ" and
69 "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
70 only 0.02. Also, repeated n-grams don't have the same effect on the
71 two similarity measures. But all this doesn't matter much in practice.
72
73 We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
74 lists are likely too long. (Well, with ideographic languages like Chinese,
75 n = 3 might actually be quite good. Anyway, n = 4 is not bad in this case
76 either.)
77
78 The units are characters in the current encoding. Not just bytes,
79 because 4 consecutive bytes in UTF-8 or GB18030 don't mean much. */
80
81 /* Each message is represented by its index in the message list. */
82 typedef unsigned int index_ty;
83
84 /* An index list has its allocated size and length tacked at the front.
85 The indices are sorted in ascending order. */
86 typedef index_ty *index_list_ty;
87 #define IL_ALLOCATED 0
88 #define IL_LENGTH 1
89
90 /* Create a new index list containing only a given index. */
91 static inline index_list_ty
92 new_index (index_ty idx)
93 {
94 index_ty *list = XNMALLOC (2 + 1, index_ty);
95 list[IL_ALLOCATED] = 1;
96 list[IL_LENGTH] = 1;
97 list[2] = idx;
98
99 return list;
100 }
101
102 /* Add a given index, greater or equal than all previous indices, to an
103 index list.
104 Return the new index list, if it had to be reallocated, or NULL if it
105 didn't change. */
106 static inline index_list_ty
107 addlast_index (index_list_ty list, index_ty idx)
108 {
109 index_list_ty result;
110 size_t length = list[IL_LENGTH];
111
112 /* Look whether it should be inserted. */
113 if (length > 0 && list[2 + (length - 1)] == idx)
114 return NULL;
115
116 /* Now make room for one more list element. */
117 result = NULL;
118 if (length == list[IL_ALLOCATED])
119 {
120 size_t new_allocated = 2 * length - (length >> 6);
121 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
122 list[IL_ALLOCATED] = new_allocated;
123 result = list;
124 }
125 list[2 + length] = idx;
126 list[IL_LENGTH] = length + 1;
127 return result;
128 }
129
130 /* Add a given index to an index list.
131 Return the new index list, if it had to be reallocated, or NULL if it
132 didn't change. */
133 static inline index_list_ty
134 add_index (index_list_ty list, index_ty idx)
135 {
136 index_list_ty result;
137 size_t length = list[IL_LENGTH];
138
139 /* Look where it should be inserted. */
140 size_t lo = 0;
141 size_t hi = length;
142 while (lo < hi)
143 {
144 /* Here we know that idx must be inserted at a position >= lo, <= hi. */
145 size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
146 index_ty val = list[2 + mid];
147 if (val < idx)
148 lo = mid + 1;
149 else if (val > idx)
150 hi = mid;
151 else
152 return NULL;
153 }
154
155 /* Now make room for one more list element. */
156 result = NULL;
157 if (length == list[IL_ALLOCATED])
158 {
159 size_t new_allocated = 2 * length - (length >> 6);
160 list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
161 list[IL_ALLOCATED] = new_allocated;
162 result = list;
163 }
164 list[IL_LENGTH] = length + 1;
165 for (; length > hi; length--)
166 list[2 + length] = list[1 + length];
167 list[2 + length] = idx;
168 return result;
169 }
170
171 /* We use 4-grams, therefore strings with less than 4 characters cannot be
172 handled through the 4-grams table and need to be handled specially.
173 Since every character occupies at most 4 bytes (see po-charset.c),
174 this means the size of such short strings is bounded by: */
175 #define SHORT_STRING_MAX_CHARACTERS (4 - 1)
176 #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
177
178 /* Such short strings are handled by direct comparison with all messages
179 of appropriate size. Note that for two strings of length 0 <= l1 <= l2,
180 fstrcmp() is <= 2 * l1 / (l1 + l2). Since we are only interested in
181 fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
182 limit the search to lengths l' in the range
183 l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
184 Thus we need the list of the short strings up to length: */
185 #if !defined __SUNPRO_C
186 # define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))
187 #else
188 /* Sun C on Solaris 8 cannot compute this constant expression. */
189 # define SHORT_MSG_MAX 28
190 #endif
191
192 /* A fuzzy index contains a hash table mapping all n-grams to their
193 occurrences list. */
194 struct message_fuzzy_index_ty
195 {
196 message_ty **messages;
197 character_iterator_t iterator;
198 hash_table gram4;
199 size_t firstfew;
200 message_list_ty **short_messages;
201 };
202
203 /* Allocate a fuzzy index corresponding to a given list of messages.
204 The list of messages and the msgctxt and msgid fields of the messages
205 inside it must not be modified while the returned fuzzy index is in use. */
206 message_fuzzy_index_ty *
207 message_fuzzy_index_alloc (const message_list_ty *mlp,
208 const char *canon_charset)
209 {
210 message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty);
211 size_t count = mlp->nitems;
212 size_t j;
213 size_t l;
214
215 findex->messages = mlp->item;
216 findex->iterator = po_charset_character_iterator (canon_charset);
217
218 /* Setup hash table. */
219 hash_init (&findex->gram4, 10 * count);
220 for (j = 0; j < count; j++)
221 {
222 message_ty *mp = mlp->item[j];
223
224 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
225 {
226 const char *str = mp->msgid;
227
228 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
229 const char *p0 = str;
230 if (*p0 != '\0')
231 {
232 const char *p1 = p0 + findex->iterator (p0);
233 if (*p1 != '\0')
234 {
235 const char *p2 = p1 + findex->iterator (p1);
236 if (*p2 != '\0')
237 {
238 const char *p3 = p2 + findex->iterator (p2);
239 if (*p3 != '\0')
240 {
241 const char *p4 = p3 + findex->iterator (p3);
242 for (;;)
243 {
244 /* The segment from p0 to p4 is a 4-gram of
245 characters. Add a hash table entry that maps
246 it to the index j, or extend the existing
247 hash table entry accordingly. */
248 void *found;
249
250 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
251 &found) == 0)
252 {
253 index_list_ty list = (index_list_ty) found;
254 list = addlast_index (list, j);
255 if (list != NULL)
256 hash_set_value (&findex->gram4, p0, p4 - p0,
257 list);
258 }
259 else
260 hash_insert_entry (&findex->gram4, p0, p4 - p0,
261 new_index (j));
262
263 /* Advance. */
264 if (*p4 == '\0')
265 break;
266 p0 = p1;
267 p1 = p2;
268 p2 = p3;
269 p3 = p4;
270 p4 = p4 + findex->iterator (p4);
271 }
272 }
273 }
274 }
275 }
276 }
277 }
278
279 /* Shrink memory used by the hash table. */
280 {
281 void *iter;
282 const void *key;
283 size_t keylen;
284 void **valuep;
285
286 iter = NULL;
287 while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
288 == 0)
289 {
290 index_list_ty list = (index_list_ty) *valuep;
291 index_ty length = list[IL_LENGTH];
292
293 if (length < list[IL_ALLOCATED])
294 {
295 list[IL_ALLOCATED] = length;
296 *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
297 }
298 }
299 }
300
301 findex->firstfew = (int) sqrt ((double) count);
302 if (findex->firstfew < 10)
303 findex->firstfew = 10;
304
305 /* Setup lists of short messages. */
306 findex->short_messages = XNMALLOC (SHORT_MSG_MAX + 1, message_list_ty *);
307 for (l = 0; l <= SHORT_MSG_MAX; l++)
308 findex->short_messages[l] = message_list_alloc (false);
309 for (j = 0; j < count; j++)
310 {
311 message_ty *mp = mlp->item[j];
312
313 if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
314 {
315 const char *str = mp->msgid;
316 size_t len = strlen (str);
317
318 if (len <= SHORT_MSG_MAX)
319 message_list_append (findex->short_messages[len], mp);
320 }
321 }
322
323 /* Shrink memory used by the lists of short messages. */
324 for (l = 0; l <= SHORT_MSG_MAX; l++)
325 {
326 message_list_ty *smlp = findex->short_messages[l];
327
328 if (smlp->nitems < smlp->nitems_max)
329 {
330 smlp->nitems_max = smlp->nitems;
331 smlp->item =
332 (message_ty **)
333 xrealloc (smlp->item, smlp->nitems_max * sizeof (message_ty *));
334 }
335 }
336
337 return findex;
338 }
339
340 /* An index with multiplicity. */
341 struct mult_index
342 {
343 index_ty index;
344 unsigned int count;
345 };
346
347 /* A list of indices with multiplicity.
348 The indices are sorted in ascending order. */
349 struct mult_index_list
350 {
351 struct mult_index *item;
352 size_t nitems;
353 size_t nitems_max;
354 /* Work area. */
355 struct mult_index *item2;
356 size_t nitems2_max;
357 };
358
359 /* Initialize an empty list of indices with multiplicity. */
360 static inline void
361 mult_index_list_init (struct mult_index_list *accu)
362 {
363 accu->nitems = 0;
364 accu->nitems_max = 0;
365 accu->item = NULL;
366 accu->nitems2_max = 0;
367 accu->item2 = NULL;
368 }
369
370 /* Add an index list to a list of indices with multiplicity. */
371 static inline void
372 mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
373 {
374 size_t len1 = accu->nitems;
375 size_t len2 = list[IL_LENGTH];
376 size_t need = len1 + len2;
377 struct mult_index *ptr1;
378 struct mult_index *ptr1_end;
379 index_ty *ptr2;
380 index_ty *ptr2_end;
381 struct mult_index *destptr;
382
383 /* Make the work area large enough. */
384 if (accu->nitems2_max < need)
385 {
386 size_t new_max = 2 * accu->nitems2_max + 1;
387
388 if (new_max < need)
389 new_max = need;
390 if (accu->item2 != NULL)
391 free (accu->item2);
392 accu->item2 = XNMALLOC (new_max, struct mult_index);
393 accu->nitems2_max = new_max;
394 }
395
396 /* Make a linear pass through accu and list simultaneously. */
397 ptr1 = accu->item;
398 ptr1_end = ptr1 + len1;
399 ptr2 = list + 2;
400 ptr2_end = ptr2 + len2;
401 destptr = accu->item2;
402 while (ptr1 < ptr1_end && ptr2 < ptr2_end)
403 {
404 if (ptr1->index < *ptr2)
405 {
406 *destptr = *ptr1;
407 ptr1++;
408 }
409 else if (ptr1->index > *ptr2)
410 {
411 destptr->index = *ptr2;
412 destptr->count = 1;
413 ptr2++;
414 }
415 else /* ptr1->index == list[2 + i2] */
416 {
417 destptr->index = ptr1->index;
418 destptr->count = ptr1->count + 1;
419 ptr1++;
420 ptr2++;
421 }
422 destptr++;
423 }
424 while (ptr1 < ptr1_end)
425 {
426 *destptr = *ptr1;
427 ptr1++;
428 destptr++;
429 }
430 while (ptr2 < ptr2_end)
431 {
432 destptr->index = *ptr2;
433 destptr->count = 1;
434 ptr2++;
435 destptr++;
436 }
437
438 /* Swap accu->item and accu->item2. */
439 {
440 struct mult_index *dest = accu->item2;
441 size_t dest_max = accu->nitems2_max;
442
443 accu->item2 = accu->item;
444 accu->nitems2_max = accu->nitems_max;
445
446 accu->item = dest;
447 accu->nitems = destptr - dest;
448 accu->nitems_max = dest_max;
449 }
450 }
451
452 /* Compares two indices with multiplicity, according to their multiplicity. */
453 static int
454 mult_index_compare (const void *p1, const void *p2)
455 {
456 const struct mult_index *ptr1 = (const struct mult_index *) p1;
457 const struct mult_index *ptr2 = (const struct mult_index *) p2;
458
459 if (ptr1->count < ptr2->count)
460 return 1;
461 if (ptr1->count > ptr2->count)
462 return -1;
463 /* For reproduceable results, also take into account the indices. */
464 if (ptr1->index > ptr2->index)
465 return 1;
466 if (ptr1->index < ptr2->index)
467 return -1;
468 return 0;
469 }
470
471 /* Sort a list of indices with multiplicity according to decreasing
472 multiplicity. */
473 static inline void
474 mult_index_list_sort (struct mult_index_list *accu)
475 {
476 if (accu->nitems > 1)
477 qsort (accu->item, accu->nitems, sizeof (struct mult_index),
478 mult_index_compare);
479 }
480
481 /* Frees a list of indices with multiplicity. */
482 static inline void
483 mult_index_list_free (struct mult_index_list *accu)
484 {
485 if (accu->item != NULL)
486 free (accu->item);
487 if (accu->item2 != NULL)
488 free (accu->item2);
489 }
490
491 /* Find a good match for the given msgctxt and msgid in the given fuzzy index.
492 The match does not need to be optimal.
493 Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
494 LOWER_BOUND must be >= FUZZY_THRESHOLD.
495 If HEURISTIC is true, only the few best messages among the list - according
496 to a certain heuristic - are considered. If HEURISTIC is false, all
497 messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
498 like in message_list_search_fuzzy (except that in ambiguous cases where
499 several best matches exist, message_list_search_fuzzy chooses the one with
500 the smallest index whereas message_fuzzy_index_search makes a better
501 choice). */
502 message_ty *
503 message_fuzzy_index_search (message_fuzzy_index_ty *findex,
504 const char *msgctxt, const char *msgid,
505 double lower_bound,
506 bool heuristic)
507 {
508 const char *str = msgid;
509
510 /* Let p0 < p1 < p2 < p3 < p4 walk through the string. */
511 const char *p0 = str;
512 if (*p0 != '\0')
513 {
514 const char *p1 = p0 + findex->iterator (p0);
515 if (*p1 != '\0')
516 {
517 const char *p2 = p1 + findex->iterator (p1);
518 if (*p2 != '\0')
519 {
520 const char *p3 = p2 + findex->iterator (p2);
521 if (*p3 != '\0')
522 {
523 const char *p4 = p3 + findex->iterator (p3);
524 struct mult_index_list accu;
525
526 mult_index_list_init (&accu);
527 for (;;)
528 {
529 /* The segment from p0 to p4 is a 4-gram of
530 characters. Get the hash table entry containing
531 a list of indices, and add it to the accu. */
532 void *found;
533
534 if (hash_find_entry (&findex->gram4, p0, p4 - p0,
535 &found) == 0)
536 {
537 index_list_ty list = (index_list_ty) found;
538 mult_index_list_accumulate (&accu, list);
539 }
540
541 /* Advance. */
542 if (*p4 == '\0')
543 break;
544 p0 = p1;
545 p1 = p2;
546 p2 = p3;
547 p3 = p4;
548 p4 = p4 + findex->iterator (p4);
549 }
550
551 /* Sort in decreasing count order. */
552 mult_index_list_sort (&accu);
553
554 /* Iterate over this sorted list, and maximize the
555 fuzzy_search_goal_function() result.
556 If HEURISTIC is true, take only the first few messages.
557 If HEURISTIC is false, consider all messages - to match
558 the behaviour of message_list_search_fuzzy -, but process
559 them in the order of the sorted list. This increases
560 the chances that the later calls to fstrcmp_bounded() (via
561 fuzzy_search_goal_function()) terminate quickly, thanks
562 to the best_weight which will be quite high already after
563 the first few messages. */
564 {
565 size_t count;
566 struct mult_index *ptr;
567 message_ty *best_mp;
568 double best_weight;
569
570 count = accu.nitems;
571 if (heuristic)
572 {
573 if (count > findex->firstfew)
574 count = findex->firstfew;
575 }
576
577 best_weight = lower_bound;
578 best_mp = NULL;
579 for (ptr = accu.item; count > 0; ptr++, count--)
580 {
581 message_ty *mp = findex->messages[ptr->index];
582 double weight =
583 fuzzy_search_goal_function (mp, msgctxt, msgid,
584 best_weight);
585
586 if (weight > best_weight)
587 {
588 best_weight = weight;
589 best_mp = mp;
590 }
591 }
592
593 mult_index_list_free (&accu);
594
595 return best_mp;
596 }
597 }
598 }
599 }
600 }
601
602 /* The string had less than 4 characters. */
603 {
604 size_t l = strlen (str);
605 size_t lmin, lmax;
606 message_ty *best_mp;
607 double best_weight;
608
609 if (!(l <= SHORT_STRING_MAX_BYTES))
610 abort ();
611
612 /* Walk through those short messages which have an appropriate length.
613 See the comment before SHORT_MSG_MAX. */
614 lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
615 lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
616 if (!(lmax <= SHORT_MSG_MAX))
617 abort ();
618
619 best_weight = lower_bound;
620 best_mp = NULL;
621 for (l = lmin; l <= lmax; l++)
622 {
623 message_list_ty *mlp = findex->short_messages[l];
624 size_t j;
625
626 for (j = 0; j < mlp->nitems; j++)
627 {
628 message_ty *mp = mlp->item[j];
629 double weight =
630 fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight);
631
632 if (weight > best_weight)
633 {
634 best_weight = weight;
635 best_mp = mp;
636 }
637 }
638 }
639
640 return best_mp;
641 }
642 }
643
644 /* Free a fuzzy index. */
645 void
646 message_fuzzy_index_free (message_fuzzy_index_ty *findex)
647 {
648 size_t l;
649 void *iter;
650 const void *key;
651 size_t keylen;
652 void *data;
653
654 /* Free the short lists. */
655 for (l = 0; l <= SHORT_MSG_MAX; l++)
656 message_list_free (findex->short_messages[l], 1);
657 free (findex->short_messages);
658
659 /* Free the index lists occurring as values in the hash tables. */
660 iter = NULL;
661 while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
662 free ((index_list_ty *) data);
663 /* Free the hash table itself. */
664 hash_destroy (&findex->gram4);
665
666 free (findex);
667 }