(root)/
diffutils-3.10/
lib/
propername.c
       1  /* Localization of proper names.
       2     Copyright (C) 2006-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2006.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
      20  #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
      21  # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
      22  #endif
      23  
      24  #include <config.h>
      25  
      26  /* Specification.  */
      27  #include "propername.h"
      28  
      29  #include <ctype.h>
      30  #include <stdio.h>
      31  #include <stdlib.h>
      32  #include <string.h>
      33  #if HAVE_ICONV
      34  # include <iconv.h>
      35  #endif
      36  
      37  #include "trim.h"
      38  #include "mbchar.h"
      39  #include "mbuiter.h"
      40  #include "localcharset.h"
      41  #include "c-strcase.h"
      42  #include "xstriconv.h"
      43  #include "xalloc.h"
      44  #include "gettext.h"
      45  
      46  
      47  /* Tests whether STRING contains trim (SUB), starting and ending at word
      48     boundaries.
      49     Here, instead of implementing Unicode Standard Annex #29 for determining
      50     word boundaries, we assume that trim (SUB) starts and ends with words and
      51     only test whether the part before it ends with a non-word and the part
      52     after it starts with a non-word.  */
      53  static bool
      54  mbsstr_trimmed_wordbounded (const char *string, const char *sub)
      55  {
      56    char *tsub = trim (sub);
      57    bool found = false;
      58  
      59    for (; *string != '\0';)
      60      {
      61        const char *tsub_in_string = mbsstr (string, tsub);
      62        if (tsub_in_string == NULL)
      63          break;
      64        else
      65          {
      66            if (MB_CUR_MAX > 1)
      67              {
      68                mbui_iterator_t string_iter;
      69                bool word_boundary_before;
      70                bool word_boundary_after;
      71  
      72                mbui_init (string_iter, string);
      73                word_boundary_before = true;
      74                if (mbui_cur_ptr (string_iter) < tsub_in_string)
      75                  {
      76                    mbchar_t last_char_before_tsub;
      77                    do
      78                      {
      79                        if (!mbui_avail (string_iter))
      80                          abort ();
      81                        last_char_before_tsub = mbui_cur (string_iter);
      82                        mbui_advance (string_iter);
      83                      }
      84                    while (mbui_cur_ptr (string_iter) < tsub_in_string);
      85                    if (mb_isalnum (last_char_before_tsub))
      86                      word_boundary_before = false;
      87                  }
      88  
      89                mbui_init (string_iter, tsub_in_string);
      90                {
      91                  mbui_iterator_t tsub_iter;
      92  
      93                  for (mbui_init (tsub_iter, tsub);
      94                       mbui_avail (tsub_iter);
      95                       mbui_advance (tsub_iter))
      96                    {
      97                      if (!mbui_avail (string_iter))
      98                        abort ();
      99                      mbui_advance (string_iter);
     100                    }
     101                }
     102                word_boundary_after = true;
     103                if (mbui_avail (string_iter))
     104                  {
     105                    mbchar_t first_char_after_tsub = mbui_cur (string_iter);
     106                    if (mb_isalnum (first_char_after_tsub))
     107                      word_boundary_after = false;
     108                  }
     109  
     110                if (word_boundary_before && word_boundary_after)
     111                  {
     112                    found = true;
     113                    break;
     114                  }
     115  
     116                mbui_init (string_iter, tsub_in_string);
     117                if (!mbui_avail (string_iter))
     118                  break;
     119                string = tsub_in_string + mb_len (mbui_cur (string_iter));
     120              }
     121            else
     122              {
     123                bool word_boundary_before;
     124                const char *p;
     125                bool word_boundary_after;
     126  
     127                word_boundary_before = true;
     128                if (string < tsub_in_string)
     129                  if (isalnum ((unsigned char) tsub_in_string[-1]))
     130                    word_boundary_before = false;
     131  
     132                p = tsub_in_string + strlen (tsub);
     133                word_boundary_after = true;
     134                if (*p != '\0')
     135                  if (isalnum ((unsigned char) *p))
     136                    word_boundary_after = false;
     137  
     138                if (word_boundary_before && word_boundary_after)
     139                  {
     140                    found = true;
     141                    break;
     142                  }
     143  
     144                if (*tsub_in_string == '\0')
     145                  break;
     146                string = tsub_in_string + 1;
     147              }
     148          }
     149      }
     150    free (tsub);
     151    return found;
     152  }
     153  
     154  /* Return the localization of NAME.  NAME is written in ASCII.  */
     155  
     156  const char *
     157  proper_name (const char *name)
     158  {
     159    /* See whether there is a translation.   */
     160    const char *translation = gettext (name);
     161  
     162    if (translation != name)
     163      {
     164        /* See whether the translation contains the original name.  */
     165        if (mbsstr_trimmed_wordbounded (translation, name))
     166          return translation;
     167        else
     168          {
     169            /* Return "TRANSLATION (NAME)".  */
     170            char *result =
     171              XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
     172  
     173            sprintf (result, "%s (%s)", translation, name);
     174            return result;
     175          }
     176      }
     177    else
     178      return name;
     179  }
     180  
     181  /* Return the localization of a name whose original writing is not ASCII.
     182     NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
     183     escape sequences.  NAME_ASCII is a fallback written only with ASCII
     184     characters.  */
     185  
     186  const char *
     187  proper_name_utf8 (const char *name_ascii, const char *name_utf8)
     188  {
     189    /* See whether there is a translation.   */
     190    const char *translation = gettext (name_ascii);
     191  
     192    /* Try to convert NAME_UTF8 to the locale encoding.  */
     193    const char *locale_code = locale_charset ();
     194    char *alloc_name_converted = NULL;
     195    char *alloc_name_converted_translit = NULL;
     196    const char *name_converted = NULL;
     197    const char *name_converted_translit = NULL;
     198    const char *name;
     199  
     200    if (c_strcasecmp (locale_code, "UTF-8") != 0)
     201      {
     202  #if HAVE_ICONV
     203        name_converted = alloc_name_converted =
     204          xstr_iconv (name_utf8, "UTF-8", locale_code);
     205  
     206  # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
     207        && !defined __UCLIBC__) \
     208       || _LIBICONV_VERSION >= 0x0105
     209        {
     210          char *converted_translit;
     211  
     212          size_t len = strlen (locale_code);
     213          char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
     214          memcpy (locale_code_translit, locale_code, len);
     215          memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
     216  
     217          converted_translit =
     218            xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
     219  
     220          free (locale_code_translit);
     221  
     222          if (converted_translit != NULL)
     223            {
     224  #  if !_LIBICONV_VERSION
     225              /* Don't use the transliteration if it added question marks.
     226                 glibc's transliteration falls back to question marks; libiconv's
     227                 transliteration does not.
     228                 mbschr is equivalent to strchr in this case.  */
     229              if (strchr (converted_translit, '?') != NULL)
     230                free (converted_translit);
     231              else
     232  #  endif
     233                name_converted_translit = alloc_name_converted_translit =
     234                  converted_translit;
     235            }
     236        }
     237  # endif
     238  #endif
     239      }
     240    else
     241      {
     242        name_converted = name_utf8;
     243        name_converted_translit = name_utf8;
     244      }
     245  
     246    /* The name in locale encoding.  */
     247    name = (name_converted != NULL ? name_converted :
     248            name_converted_translit != NULL ? name_converted_translit :
     249            name_ascii);
     250  
     251    /* See whether we have a translation.  Some translators have not understood
     252       that they should use the UTF-8 form of the name, if possible.  So if the
     253       translator provided a no-op translation, we ignore it.  */
     254    if (strcmp (translation, name_ascii) != 0)
     255      {
     256        /* See whether the translation contains the original name.  */
     257        if (mbsstr_trimmed_wordbounded (translation, name_ascii)
     258            || (name_converted != NULL
     259                && mbsstr_trimmed_wordbounded (translation, name_converted))
     260            || (name_converted_translit != NULL
     261                && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
     262          {
     263            if (alloc_name_converted != NULL)
     264              free (alloc_name_converted);
     265            if (alloc_name_converted_translit != NULL)
     266              free (alloc_name_converted_translit);
     267            return translation;
     268          }
     269        else
     270          {
     271            /* Return "TRANSLATION (NAME)".  */
     272            char *result =
     273              XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
     274  
     275            sprintf (result, "%s (%s)", translation, name);
     276  
     277            if (alloc_name_converted != NULL)
     278              free (alloc_name_converted);
     279            if (alloc_name_converted_translit != NULL)
     280              free (alloc_name_converted_translit);
     281            return result;
     282          }
     283      }
     284    else
     285      {
     286        if (alloc_name_converted != NULL && alloc_name_converted != name)
     287          free (alloc_name_converted);
     288        if (alloc_name_converted_translit != NULL
     289            && alloc_name_converted_translit != name)
     290          free (alloc_name_converted_translit);
     291        return name;
     292      }
     293  }
     294  
     295  #ifdef TEST1
     296  # include <locale.h>
     297  int
     298  main (int argc, char *argv[])
     299  {
     300    setlocale (LC_ALL, "");
     301    if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
     302      printf("found\n");
     303    return 0;
     304  }
     305  #endif
     306  
     307  #ifdef TEST2
     308  # include <locale.h>
     309  # include <stdio.h>
     310  int
     311  main (int argc, char *argv[])
     312  {
     313    setlocale (LC_ALL, "");
     314    printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard"));
     315    return 0;
     316  }
     317  #endif