(root)/
gettext-0.22.4/
gettext-tools/
src/
msgl-fsearch.c
       1  /* Fast fuzzy searching among messages.
       2     Copyright (C) 2006, 2008, 2011, 2013, 2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2006.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #ifdef HAVE_CONFIG_H
      19  # include <config.h>
      20  #endif
      21  
      22  /* Specification.  */
      23  #include "msgl-fsearch.h"
      24  
      25  #include <math.h>
      26  #include <stdlib.h>
      27  
      28  #include "xalloc.h"
      29  #include "po-charset.h"
      30  
      31  
      32  /* Fuzzy searching of L strings in a large set of N messages (assuming
      33     they have all the same small size) takes O(L * N) when using repeated
      34     fstrcmp() calls.  When for example L = 800 and N = 69000, this is slow.
      35  
      36     So we preprocess the set of N messages, yielding a data structure
      37     that allows to select the similar messages for any given string, and
      38     apply fstrcmp() only to the search result.  This allows to reduce the
      39     time to something between O(L * 1) and O(L * N) - depending on the
      40     structure of the strings.  In the example with L = 800 and N = 69000,
      41     memory use increases by a factor of 2 and the time decreases from
      42     9068 s to 230 s.
      43  
      44     The data structure is a hash table mapping each n-gram (here with n=4)
      45     occurring in the message strings to the list of messages that contains
      46     it.  For example, if the message list is
      47        [0] "close"
      48        [1] "losers"
      49     the index is a hash table mapping
      50        "clos" -> { 0 }
      51        "lose" -> { 0, 1 }
      52        "oser" -> { 1 }
      53        "sers" -> { 1 }
      54     Searching the similar messages to, say, "closest", is done by looking up
      55     all its 4-grams in the hash table and summing up the results:
      56        "clos" -> { 0 }
      57        "lose" -> { 0, 1 }
      58        "oses" -> {}
      59        "sest" -> {}
      60     => total: { 2x0, 1x1 }
      61     The list is sorted according to decreasing frequency: { 0, 1, ... }
      62     and only the first few messages from this frequency list are passed to
      63     fstrcmp().
      64  
      65     The n-gram similarity and the fstrcmp() similarity are quite different
      66     metrics.  For example, "close" and "c l o s e" have no n-grams in common
      67     (even for n as small as n = 2); however, fstrcmp() will find that they
      68     are quite similar (= 0.71).  Conversely, "AAAA BBBB ... ZZZZ" and
      69     "ZZZZ ... BBBB AAAA" have many 4-grams in common, but their fstrcmp() is
      70     only 0.02.  Also, repeated n-grams don't have the same effect on the
      71     two similarity measures.  But all this doesn't matter much in practice.
      72  
      73     We chose n = 4 because for alphabetic languages, with n = 3 the occurrence
      74     lists are likely too long.  (Well, with ideographic languages like Chinese,
      75     n = 3 might actually be quite good.  Anyway, n = 4 is not bad in this case
      76     either.)
      77  
      78     The units are characters in the current encoding.  Not just bytes,
      79     because 4 consecutive bytes in UTF-8 or GB18030 don't mean much.  */
      80  
      81  /* Each message is represented by its index in the message list.  */
      82  typedef unsigned int index_ty;
      83  
      84  /* An index list has its allocated size and length tacked at the front.
      85     The indices are sorted in ascending order.  */
      86  typedef index_ty *index_list_ty;
      87  #define IL_ALLOCATED 0
      88  #define IL_LENGTH 1
      89  
      90  /* Create a new index list containing only a given index.  */
      91  static inline index_list_ty
      92  new_index (index_ty idx)
      93  {
      94    index_ty *list = XNMALLOC (2 + 1, index_ty);
      95    list[IL_ALLOCATED] = 1;
      96    list[IL_LENGTH] = 1;
      97    list[2] = idx;
      98  
      99    return list;
     100  }
     101  
     102  /* Add a given index, greater or equal than all previous indices, to an
     103     index list.
     104     Return the new index list, if it had to be reallocated, or NULL if it
     105     didn't change.  */
     106  static inline index_list_ty
     107  addlast_index (index_list_ty list, index_ty idx)
     108  {
     109    index_list_ty result;
     110    size_t length = list[IL_LENGTH];
     111  
     112    /* Look whether it should be inserted.  */
     113    if (length > 0 && list[2 + (length - 1)] == idx)
     114      return NULL;
     115  
     116    /* Now make room for one more list element.  */
     117    result = NULL;
     118    if (length == list[IL_ALLOCATED])
     119      {
     120        size_t new_allocated = 2 * length - (length >> 6);
     121        list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
     122        list[IL_ALLOCATED] = new_allocated;
     123        result = list;
     124      }
     125    list[2 + length] = idx;
     126    list[IL_LENGTH] = length + 1;
     127    return result;
     128  }
     129  
     130  /* Add a given index to an index list.
     131     Return the new index list, if it had to be reallocated, or NULL if it
     132     didn't change.  */
     133  static inline index_list_ty
     134  add_index (index_list_ty list, index_ty idx)
     135  {
     136    index_list_ty result;
     137    size_t length = list[IL_LENGTH];
     138  
     139    /* Look where it should be inserted.  */
     140    size_t lo = 0;
     141    size_t hi = length;
     142    while (lo < hi)
     143      {
     144        /* Here we know that idx must be inserted at a position >= lo, <= hi.  */
     145        size_t mid = (lo + hi) / 2; /* lo <= mid < hi */
     146        index_ty val = list[2 + mid];
     147        if (val < idx)
     148          lo = mid + 1;
     149        else if (val > idx)
     150          hi = mid;
     151        else
     152          return NULL;
     153      }
     154  
     155    /* Now make room for one more list element.  */
     156    result = NULL;
     157    if (length == list[IL_ALLOCATED])
     158      {
     159        size_t new_allocated = 2 * length - (length >> 6);
     160        list = (index_ty *) xrealloc (list, (2 + new_allocated) * sizeof (index_ty));
     161        list[IL_ALLOCATED] = new_allocated;
     162        result = list;
     163      }
     164    list[IL_LENGTH] = length + 1;
     165    for (; length > hi; length--)
     166      list[2 + length] = list[1 + length];
     167    list[2 + length] = idx;
     168    return result;
     169  }
     170  
     171  /* We use 4-grams, therefore strings with less than 4 characters cannot be
     172     handled through the 4-grams table and need to be handled specially.
     173     Since every character occupies at most 4 bytes (see po-charset.c),
     174     this means the size of such short strings is bounded by:  */
     175  #define SHORT_STRING_MAX_CHARACTERS (4 - 1)
     176  #define SHORT_STRING_MAX_BYTES (SHORT_STRING_MAX_CHARACTERS * 4)
     177  
     178  /* Such short strings are handled by direct comparison with all messages
     179     of appropriate size.  Note that for two strings of length 0 <= l1 <= l2,
     180     fstrcmp() is <= 2 * l1 / (l1 + l2).  Since we are only interested in
     181     fstrcmp() values >= FUZZY_THRESHOLD, we can for a given string of length l
     182     limit the search to lengths l' in the range
     183       l / (2 / FUZZY_THRESHOLD - 1) <= l' <= l * (2 / FUZZY_THRESHOLD - 1)
     184     Thus we need the list of the short strings up to length:  */
     185  #if !defined __SUNPRO_C
     186  # define SHORT_MSG_MAX (int) (SHORT_STRING_MAX_BYTES * (2 / FUZZY_THRESHOLD - 1))
     187  #else
     188  /* Sun C on Solaris 8 cannot compute this constant expression.  */
     189  # define SHORT_MSG_MAX 28
     190  #endif
     191  
     192  /* A fuzzy index contains a hash table mapping all n-grams to their
     193     occurrences list.  */
     194  struct message_fuzzy_index_ty
     195  {
     196    message_ty **messages;
     197    character_iterator_t iterator;
     198    hash_table gram4;
     199    size_t firstfew;
     200    message_list_ty **short_messages;
     201  };
     202  
     203  /* Allocate a fuzzy index corresponding to a given list of messages.
     204     The list of messages and the msgctxt and msgid fields of the messages
     205     inside it must not be modified while the returned fuzzy index is in use.  */
     206  message_fuzzy_index_ty *
     207  message_fuzzy_index_alloc (const message_list_ty *mlp,
     208                             const char *canon_charset)
     209  {
     210    message_fuzzy_index_ty *findex = XMALLOC (message_fuzzy_index_ty);
     211    size_t count = mlp->nitems;
     212    size_t j;
     213    size_t l;
     214  
     215    findex->messages = mlp->item;
     216    findex->iterator = po_charset_character_iterator (canon_charset);
     217  
     218    /* Setup hash table.  */
     219    hash_init (&findex->gram4, 10 * count);
     220    for (j = 0; j < count; j++)
     221      {
     222        message_ty *mp = mlp->item[j];
     223  
     224        if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
     225          {
     226            const char *str = mp->msgid;
     227  
     228            /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
     229            const char *p0 = str;
     230            if (*p0 != '\0')
     231              {
     232                const char *p1 = p0 + findex->iterator (p0);
     233                if (*p1 != '\0')
     234                  {
     235                    const char *p2 = p1 + findex->iterator (p1);
     236                    if (*p2 != '\0')
     237                      {
     238                        const char *p3 = p2 + findex->iterator (p2);
     239                        if (*p3 != '\0')
     240                          {
     241                            const char *p4 = p3 + findex->iterator (p3);
     242                            for (;;)
     243                              {
     244                                /* The segment from p0 to p4 is a 4-gram of
     245                                   characters.  Add a hash table entry that maps
     246                                   it to the index j, or extend the existing
     247                                   hash table entry accordingly.  */
     248                                void *found;
     249  
     250                                if (hash_find_entry (&findex->gram4, p0, p4 - p0,
     251                                                     &found) == 0)
     252                                  {
     253                                    index_list_ty list = (index_list_ty) found;
     254                                    list = addlast_index (list, j);
     255                                    if (list != NULL)
     256                                      hash_set_value (&findex->gram4, p0, p4 - p0,
     257                                                      list);
     258                                  }
     259                                else
     260                                  hash_insert_entry (&findex->gram4, p0, p4 - p0,
     261                                                     new_index (j));
     262  
     263                                /* Advance.  */
     264                                if (*p4 == '\0')
     265                                  break;
     266                                p0 = p1;
     267                                p1 = p2;
     268                                p2 = p3;
     269                                p3 = p4;
     270                                p4 = p4 + findex->iterator (p4);
     271                              }
     272                          }
     273                      }
     274                  }
     275              }
     276          }
     277      }
     278  
     279    /* Shrink memory used by the hash table.  */
     280    {
     281      void *iter;
     282      const void *key;
     283      size_t keylen;
     284      void **valuep;
     285  
     286      iter = NULL;
     287      while (hash_iterate_modify (&findex->gram4, &iter, &key, &keylen, &valuep)
     288             == 0)
     289        {
     290          index_list_ty list = (index_list_ty) *valuep;
     291          index_ty length = list[IL_LENGTH];
     292  
     293          if (length < list[IL_ALLOCATED])
     294            {
     295              list[IL_ALLOCATED] = length;
     296              *valuep = xrealloc (list, (2 + length) * sizeof (index_ty));
     297            }
     298        }
     299    }
     300  
     301    findex->firstfew = (int) sqrt ((double) count);
     302    if (findex->firstfew < 10)
     303      findex->firstfew = 10;
     304  
     305    /* Setup lists of short messages.  */
     306    findex->short_messages = XNMALLOC (SHORT_MSG_MAX + 1, message_list_ty *);
     307    for (l = 0; l <= SHORT_MSG_MAX; l++)
     308      findex->short_messages[l] = message_list_alloc (false);
     309    for (j = 0; j < count; j++)
     310      {
     311        message_ty *mp = mlp->item[j];
     312  
     313        if (mp->msgstr != NULL && mp->msgstr[0] != '\0')
     314          {
     315            const char *str = mp->msgid;
     316            size_t len = strlen (str);
     317  
     318            if (len <= SHORT_MSG_MAX)
     319              message_list_append (findex->short_messages[len], mp);
     320          }
     321      }
     322  
     323    /* Shrink memory used by the lists of short messages.  */
     324    for (l = 0; l <= SHORT_MSG_MAX; l++)
     325      {
     326        message_list_ty *smlp = findex->short_messages[l];
     327  
     328        if (smlp->nitems < smlp->nitems_max)
     329          {
     330            smlp->nitems_max = smlp->nitems;
     331            smlp->item =
     332              (message_ty **)
     333              xrealloc (smlp->item, smlp->nitems_max * sizeof (message_ty *));
     334          }
     335      }
     336  
     337    return findex;
     338  }
     339  
     340  /* An index with multiplicity.  */
     341  struct mult_index
     342  {
     343    index_ty index;
     344    unsigned int count;
     345  };
     346  
     347  /* A list of indices with multiplicity.
     348     The indices are sorted in ascending order.  */
     349  struct mult_index_list
     350  {
     351    struct mult_index *item;
     352    size_t nitems;
     353    size_t nitems_max;
     354    /* Work area.  */
     355    struct mult_index *item2;
     356    size_t nitems2_max;
     357  };
     358  
     359  /* Initialize an empty list of indices with multiplicity.  */
     360  static inline void
     361  mult_index_list_init (struct mult_index_list *accu)
     362  {
     363    accu->nitems = 0;
     364    accu->nitems_max = 0;
     365    accu->item = NULL;
     366    accu->nitems2_max = 0;
     367    accu->item2 = NULL;
     368  }
     369  
     370  /* Add an index list to a list of indices with multiplicity.  */
     371  static inline void
     372  mult_index_list_accumulate (struct mult_index_list *accu, index_list_ty list)
     373  {
     374    size_t len1 = accu->nitems;
     375    size_t len2 = list[IL_LENGTH];
     376    size_t need = len1 + len2;
     377    struct mult_index *ptr1;
     378    struct mult_index *ptr1_end;
     379    index_ty *ptr2;
     380    index_ty *ptr2_end;
     381    struct mult_index *destptr;
     382  
     383    /* Make the work area large enough.  */
     384    if (accu->nitems2_max < need)
     385      {
     386        size_t new_max = 2 * accu->nitems2_max + 1;
     387  
     388        if (new_max < need)
     389          new_max = need;
     390        if (accu->item2 != NULL)
     391          free (accu->item2);
     392        accu->item2 = XNMALLOC (new_max, struct mult_index);
     393        accu->nitems2_max = new_max;
     394      }
     395  
     396    /* Make a linear pass through accu and list simultaneously.  */
     397    ptr1 = accu->item;
     398    ptr1_end = ptr1 + len1;
     399    ptr2 = list + 2;
     400    ptr2_end = ptr2 + len2;
     401    destptr = accu->item2;
     402    while (ptr1 < ptr1_end && ptr2 < ptr2_end)
     403      {
     404        if (ptr1->index < *ptr2)
     405          {
     406            *destptr = *ptr1;
     407            ptr1++;
     408          }
     409        else if (ptr1->index > *ptr2)
     410          {
     411            destptr->index = *ptr2;
     412            destptr->count = 1;
     413            ptr2++;
     414          }
     415        else /* ptr1->index == list[2 + i2] */
     416          {
     417            destptr->index = ptr1->index;
     418            destptr->count = ptr1->count + 1;
     419            ptr1++;
     420            ptr2++;
     421          }
     422        destptr++;
     423      }
     424    while (ptr1 < ptr1_end)
     425      {
     426        *destptr = *ptr1;
     427        ptr1++;
     428        destptr++;
     429      }
     430    while (ptr2 < ptr2_end)
     431      {
     432        destptr->index = *ptr2;
     433        destptr->count = 1;
     434        ptr2++;
     435        destptr++;
     436      }
     437  
     438    /* Swap accu->item and accu->item2.  */
     439    {
     440      struct mult_index *dest = accu->item2;
     441      size_t dest_max = accu->nitems2_max;
     442  
     443      accu->item2 = accu->item;
     444      accu->nitems2_max = accu->nitems_max;
     445  
     446      accu->item = dest;
     447      accu->nitems = destptr - dest;
     448      accu->nitems_max = dest_max;
     449    }
     450  }
     451  
     452  /* Compares two indices with multiplicity, according to their multiplicity.  */
     453  static int
     454  mult_index_compare (const void *p1, const void *p2)
     455  {
     456    const struct mult_index *ptr1 = (const struct mult_index *) p1;
     457    const struct mult_index *ptr2 = (const struct mult_index *) p2;
     458  
     459    if (ptr1->count < ptr2->count)
     460      return 1;
     461    if (ptr1->count > ptr2->count)
     462      return -1;
     463    /* For reproduceable results, also take into account the indices.  */
     464    if (ptr1->index > ptr2->index)
     465      return 1;
     466    if (ptr1->index < ptr2->index)
     467      return -1;
     468    return 0;
     469  }
     470  
     471  /* Sort a list of indices with multiplicity according to decreasing
     472     multiplicity.  */
     473  static inline void
     474  mult_index_list_sort (struct mult_index_list *accu)
     475  {
     476    if (accu->nitems > 1)
     477      qsort (accu->item, accu->nitems, sizeof (struct mult_index),
     478             mult_index_compare);
     479  }
     480  
     481  /* Frees a list of indices with multiplicity.  */
     482  static inline void
     483  mult_index_list_free (struct mult_index_list *accu)
     484  {
     485    if (accu->item != NULL)
     486      free (accu->item);
     487    if (accu->item2 != NULL)
     488      free (accu->item2);
     489  }
     490  
     491  /* Find a good match for the given msgctxt and msgid in the given fuzzy index.
     492     The match does not need to be optimal.
     493     Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
     494     LOWER_BOUND must be >= FUZZY_THRESHOLD.
     495     If HEURISTIC is true, only the few best messages among the list - according
     496     to a certain heuristic - are considered.  If HEURISTIC is false, all
     497     messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
     498     like in message_list_search_fuzzy (except that in ambiguous cases where
     499     several best matches exist, message_list_search_fuzzy chooses the one with
     500     the smallest index whereas message_fuzzy_index_search makes a better
     501     choice).  */
     502  message_ty *
     503  message_fuzzy_index_search (message_fuzzy_index_ty *findex,
     504                              const char *msgctxt, const char *msgid,
     505                              double lower_bound,
     506                              bool heuristic)
     507  {
     508    const char *str = msgid;
     509  
     510    /* Let p0 < p1 < p2 < p3 < p4 walk through the string.  */
     511    const char *p0 = str;
     512    if (*p0 != '\0')
     513      {
     514        const char *p1 = p0 + findex->iterator (p0);
     515        if (*p1 != '\0')
     516          {
     517            const char *p2 = p1 + findex->iterator (p1);
     518            if (*p2 != '\0')
     519              {
     520                const char *p3 = p2 + findex->iterator (p2);
     521                if (*p3 != '\0')
     522                  {
     523                    const char *p4 = p3 + findex->iterator (p3);
     524                    struct mult_index_list accu;
     525  
     526                    mult_index_list_init (&accu);
     527                    for (;;)
     528                      {
     529                        /* The segment from p0 to p4 is a 4-gram of
     530                           characters.  Get the hash table entry containing
     531                           a list of indices, and add it to the accu.  */
     532                        void *found;
     533  
     534                        if (hash_find_entry (&findex->gram4, p0, p4 - p0,
     535                                             &found) == 0)
     536                          {
     537                            index_list_ty list = (index_list_ty) found;
     538                            mult_index_list_accumulate (&accu, list);
     539                          }
     540  
     541                        /* Advance.  */
     542                        if (*p4 == '\0')
     543                          break;
     544                        p0 = p1;
     545                        p1 = p2;
     546                        p2 = p3;
     547                        p3 = p4;
     548                        p4 = p4 + findex->iterator (p4);
     549                      }
     550  
     551                    /* Sort in decreasing count order.  */
     552                    mult_index_list_sort (&accu);
     553  
     554                    /* Iterate over this sorted list, and maximize the
     555                       fuzzy_search_goal_function() result.
     556                       If HEURISTIC is true, take only the first few messages.
     557                       If HEURISTIC is false, consider all messages - to match
     558                       the behaviour of message_list_search_fuzzy -, but process
     559                       them in the order of the sorted list.  This increases
     560                       the chances that the later calls to fstrcmp_bounded() (via
     561                       fuzzy_search_goal_function()) terminate quickly, thanks
     562                       to the best_weight which will be quite high already after
     563                       the first few messages.  */
     564                    {
     565                      size_t count;
     566                      struct mult_index *ptr;
     567                      message_ty *best_mp;
     568                      double best_weight;
     569  
     570                      count = accu.nitems;
     571                      if (heuristic)
     572                        {
     573                          if (count > findex->firstfew)
     574                            count = findex->firstfew;
     575                        }
     576  
     577                      best_weight = lower_bound;
     578                      best_mp = NULL;
     579                      for (ptr = accu.item; count > 0; ptr++, count--)
     580                        {
     581                          message_ty *mp = findex->messages[ptr->index];
     582                          double weight =
     583                            fuzzy_search_goal_function (mp, msgctxt, msgid,
     584                                                        best_weight);
     585  
     586                          if (weight > best_weight)
     587                            {
     588                              best_weight = weight;
     589                              best_mp = mp;
     590                            }
     591                        }
     592  
     593                      mult_index_list_free (&accu);
     594  
     595                      return best_mp;
     596                    }
     597                  }
     598              }
     599          }
     600      }
     601  
     602    /* The string had less than 4 characters.  */
     603    {
     604      size_t l = strlen (str);
     605      size_t lmin, lmax;
     606      message_ty *best_mp;
     607      double best_weight;
     608  
     609      if (!(l <= SHORT_STRING_MAX_BYTES))
     610        abort ();
     611  
     612      /* Walk through those short messages which have an appropriate length.
     613         See the comment before SHORT_MSG_MAX.  */
     614      lmin = (int) ceil (l / (2 / FUZZY_THRESHOLD - 1));
     615      lmax = (int) (l * (2 / FUZZY_THRESHOLD - 1));
     616      if (!(lmax <= SHORT_MSG_MAX))
     617        abort ();
     618  
     619      best_weight = lower_bound;
     620      best_mp = NULL;
     621      for (l = lmin; l <= lmax; l++)
     622        {
     623          message_list_ty *mlp = findex->short_messages[l];
     624          size_t j;
     625  
     626          for (j = 0; j < mlp->nitems; j++)
     627            {
     628              message_ty *mp = mlp->item[j];
     629              double weight =
     630                fuzzy_search_goal_function (mp, msgctxt, msgid, best_weight);
     631  
     632              if (weight > best_weight)
     633                {
     634                  best_weight = weight;
     635                  best_mp = mp;
     636                }
     637            }
     638        }
     639  
     640      return best_mp;
     641    }
     642  }
     643  
     644  /* Free a fuzzy index.  */
     645  void
     646  message_fuzzy_index_free (message_fuzzy_index_ty *findex)
     647  {
     648    size_t l;
     649    void *iter;
     650    const void *key;
     651    size_t keylen;
     652    void *data;
     653  
     654    /* Free the short lists.  */
     655    for (l = 0; l <= SHORT_MSG_MAX; l++)
     656      message_list_free (findex->short_messages[l], 1);
     657    free (findex->short_messages);
     658  
     659    /* Free the index lists occurring as values in the hash tables.  */
     660    iter = NULL;
     661    while (hash_iterate (&findex->gram4, &iter, &key, &keylen, &data) == 0)
     662      free ((index_list_ty *) data);
     663    /* Free the hash table itself.  */
     664    hash_destroy (&findex->gram4);
     665  
     666    free (findex);
     667  }