1  /* Localization of proper names.
       2     Copyright (C) 2006-2021 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2006.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
      20  #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
      21  # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
      22  #endif
      23  
      24  #include <config.h>
      25  
      26  /* Specification.  */
      27  #include "propername.h"
      28  
      29  #include <ctype.h>
      30  #include <stdbool.h>
      31  #include <stdio.h>
      32  #include <stdlib.h>
      33  #include <string.h>
      34  #if HAVE_ICONV
      35  # include <iconv.h>
      36  #endif
      37  
      38  #include "trim.h"
      39  #include "mbchar.h"
      40  #include "mbuiter.h"
      41  #include "localcharset.h"
      42  #include "c-strcase.h"
      43  #include "xstriconv.h"
      44  #include "xalloc.h"
      45  #include "gettext.h"
      46  
      47  
      48  /* Tests whether STRING contains trim (SUB), starting and ending at word
      49     boundaries.
      50     Here, instead of implementing Unicode Standard Annex #29 for determining
      51     word boundaries, we assume that trim (SUB) starts and ends with words and
      52     only test whether the part before it ends with a non-word and the part
      53     after it starts with a non-word.  */
      54  static bool
      55  mbsstr_trimmed_wordbounded (const char *string, const char *sub)
      56  {
      57    char *tsub = trim (sub);
      58    bool found = false;
      59  
      60    for (; *string != '\0';)
      61      {
      62        const char *tsub_in_string = mbsstr (string, tsub);
      63        if (tsub_in_string == NULL)
      64          break;
      65        else
      66          {
      67            if (MB_CUR_MAX > 1)
      68              {
      69                mbui_iterator_t string_iter;
      70                bool word_boundary_before;
      71                bool word_boundary_after;
      72  
      73                mbui_init (string_iter, string);
      74                word_boundary_before = true;
      75                if (mbui_cur_ptr (string_iter) < tsub_in_string)
      76                  {
      77                    mbchar_t last_char_before_tsub;
      78                    do
      79                      {
      80                        if (!mbui_avail (string_iter))
      81                          abort ();
      82                        last_char_before_tsub = mbui_cur (string_iter);
      83                        mbui_advance (string_iter);
      84                      }
      85                    while (mbui_cur_ptr (string_iter) < tsub_in_string);
      86                    if (mb_isalnum (last_char_before_tsub))
      87                      word_boundary_before = false;
      88                  }
      89  
      90                mbui_init (string_iter, tsub_in_string);
      91                {
      92                  mbui_iterator_t tsub_iter;
      93  
      94                  for (mbui_init (tsub_iter, tsub);
      95                       mbui_avail (tsub_iter);
      96                       mbui_advance (tsub_iter))
      97                    {
      98                      if (!mbui_avail (string_iter))
      99                        abort ();
     100                      mbui_advance (string_iter);
     101                    }
     102                }
     103                word_boundary_after = true;
     104                if (mbui_avail (string_iter))
     105                  {
     106                    mbchar_t first_char_after_tsub = mbui_cur (string_iter);
     107                    if (mb_isalnum (first_char_after_tsub))
     108                      word_boundary_after = false;
     109                  }
     110  
     111                if (word_boundary_before && word_boundary_after)
     112                  {
     113                    found = true;
     114                    break;
     115                  }
     116  
     117                mbui_init (string_iter, tsub_in_string);
     118                if (!mbui_avail (string_iter))
     119                  break;
     120                string = tsub_in_string + mb_len (mbui_cur (string_iter));
     121              }
     122            else
     123              {
     124                bool word_boundary_before;
     125                const char *p;
     126                bool word_boundary_after;
     127  
     128                word_boundary_before = true;
     129                if (string < tsub_in_string)
     130                  if (isalnum ((unsigned char) tsub_in_string[-1]))
     131                    word_boundary_before = false;
     132  
     133                p = tsub_in_string + strlen (tsub);
     134                word_boundary_after = true;
     135                if (*p != '\0')
     136                  if (isalnum ((unsigned char) *p))
     137                    word_boundary_after = false;
     138  
     139                if (word_boundary_before && word_boundary_after)
     140                  {
     141                    found = true;
     142                    break;
     143                  }
     144  
     145                if (*tsub_in_string == '\0')
     146                  break;
     147                string = tsub_in_string + 1;
     148              }
     149          }
     150      }
     151    free (tsub);
     152    return found;
     153  }
     154  
     155  /* Return the localization of NAME.  NAME is written in ASCII.  */
     156  
     157  const char *
     158  proper_name (const char *name)
     159  {
     160    /* See whether there is a translation.   */
     161    const char *translation = gettext (name);
     162  
     163    if (translation != name)
     164      {
     165        /* See whether the translation contains the original name.  */
     166        if (mbsstr_trimmed_wordbounded (translation, name))
     167          return translation;
     168        else
     169          {
     170            /* Return "TRANSLATION (NAME)".  */
     171            char *result =
     172              XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
     173  
     174            sprintf (result, "%s (%s)", translation, name);
     175            return result;
     176          }
     177      }
     178    else
     179      return name;
     180  }
     181  
     182  /* Return the localization of a name whose original writing is not ASCII.
     183     NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
     184     escape sequences.  NAME_ASCII is a fallback written only with ASCII
     185     characters.  */
     186  
     187  const char *
     188  proper_name_utf8 (const char *name_ascii, const char *name_utf8)
     189  {
     190    /* See whether there is a translation.   */
     191    const char *translation = gettext (name_ascii);
     192  
     193    /* Try to convert NAME_UTF8 to the locale encoding.  */
     194    const char *locale_code = locale_charset ();
     195    char *alloc_name_converted = NULL;
     196    char *alloc_name_converted_translit = NULL;
     197    const char *name_converted = NULL;
     198    const char *name_converted_translit = NULL;
     199    const char *name;
     200  
     201    if (c_strcasecmp (locale_code, "UTF-8") != 0)
     202      {
     203  #if HAVE_ICONV
     204        name_converted = alloc_name_converted =
     205          xstr_iconv (name_utf8, "UTF-8", locale_code);
     206  
     207  # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
     208        && !defined __UCLIBC__) \
     209       || _LIBICONV_VERSION >= 0x0105
     210        {
     211          char *converted_translit;
     212  
     213          size_t len = strlen (locale_code);
     214          char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
     215          memcpy (locale_code_translit, locale_code, len);
     216          memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
     217  
     218          converted_translit =
     219            xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
     220  
     221          free (locale_code_translit);
     222  
     223          if (converted_translit != NULL)
     224            {
     225  #  if !_LIBICONV_VERSION
     226              /* Don't use the transliteration if it added question marks.
     227                 glibc's transliteration falls back to question marks; libiconv's
     228                 transliteration does not.
     229                 mbschr is equivalent to strchr in this case.  */
     230              if (strchr (converted_translit, '?') != NULL)
     231                free (converted_translit);
     232              else
     233  #  endif
     234                name_converted_translit = alloc_name_converted_translit =
     235                  converted_translit;
     236            }
     237        }
     238  # endif
     239  #endif
     240      }
     241    else
     242      {
     243        name_converted = name_utf8;
     244        name_converted_translit = name_utf8;
     245      }
     246  
     247    /* The name in locale encoding.  */
     248    name = (name_converted != NULL ? name_converted :
     249            name_converted_translit != NULL ? name_converted_translit :
     250            name_ascii);
     251  
     252    /* See whether we have a translation.  Some translators have not understood
     253       that they should use the UTF-8 form of the name, if possible.  So if the
     254       translator provided a no-op translation, we ignore it.  */
     255    if (strcmp (translation, name_ascii) != 0)
     256      {
     257        /* See whether the translation contains the original name.  */
     258        if (mbsstr_trimmed_wordbounded (translation, name_ascii)
     259            || (name_converted != NULL
     260                && mbsstr_trimmed_wordbounded (translation, name_converted))
     261            || (name_converted_translit != NULL
     262                && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
     263          {
     264            if (alloc_name_converted != NULL)
     265              free (alloc_name_converted);
     266            if (alloc_name_converted_translit != NULL)
     267              free (alloc_name_converted_translit);
     268            return translation;
     269          }
     270        else
     271          {
     272            /* Return "TRANSLATION (NAME)".  */
     273            char *result =
     274              XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
     275  
     276            sprintf (result, "%s (%s)", translation, name);
     277  
     278            if (alloc_name_converted != NULL)
     279              free (alloc_name_converted);
     280            if (alloc_name_converted_translit != NULL)
     281              free (alloc_name_converted_translit);
     282            return result;
     283          }
     284      }
     285    else
     286      {
     287        if (alloc_name_converted != NULL && alloc_name_converted != name)
     288          free (alloc_name_converted);
     289        if (alloc_name_converted_translit != NULL
     290            && alloc_name_converted_translit != name)
     291          free (alloc_name_converted_translit);
     292        return name;
     293      }
     294  }
     295  
     296  #ifdef TEST1
     297  # include <locale.h>
     298  int
     299  main (int argc, char *argv[])
     300  {
     301    setlocale (LC_ALL, "");
     302    if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
     303      printf("found\n");
     304    return 0;
     305  }
     306  #endif
     307  
     308  #ifdef TEST2
     309  # include <locale.h>
     310  # include <stdio.h>
     311  int
     312  main (int argc, char *argv[])
     313  {
     314    setlocale (LC_ALL, "");
     315    printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard"));
     316    return 0;
     317  }
     318  #endif