1  /* Localization of proper names.
       2     Copyright (C) 2006-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2006.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation, either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
      20  #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
      21  # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
      22  #endif
      23  
      24  #include <config.h>
      25  
      26  /* Specification.  */
      27  #include "propername.h"
      28  
      29  #include <ctype.h>
      30  #include <stdio.h>
      31  #include <stdlib.h>
      32  #include <string.h>
      33  #if HAVE_ICONV
      34  # include <iconv.h>
      35  #endif
      36  
      37  #include "trim.h"
      38  #if GNULIB_MCEL_PREFER
      39  # include "mcel.h"
      40  #else
      41  # include "mbchar.h"
      42  # include "mbuiter.h"
      43  #endif
      44  #include "localcharset.h"
      45  #include "c-strcase.h"
      46  #include "xstriconv.h"
      47  #include "xalloc.h"
      48  #include "gettext.h"
      49  
      50  
      51  /* Tests whether STRING contains trim (SUB), starting and ending at word
      52     boundaries.
      53     Here, instead of implementing Unicode Standard Annex #29 for determining
      54     word boundaries, we assume that trim (SUB) starts and ends with words and
      55     only test whether the part before it ends with a non-word and the part
      56     after it starts with a non-word.  */
      57  static bool
      58  mbsstr_trimmed_wordbounded (const char *string, const char *sub)
      59  {
      60    char *tsub = trim (sub);
      61    bool found = false;
      62    bool multibyte_locale = MB_CUR_MAX > 1;
      63    size_t tsublen;
      64    if (! multibyte_locale)
      65      tsublen = strlen (tsub);
      66  
      67    while (*string != '\0')
      68      {
      69        const char *tsub_in_string = mbsstr (string, tsub);
      70        if (tsub_in_string == NULL)
      71          break;
      72        else
      73          {
      74            if (multibyte_locale)
      75              {
      76  #if GNULIB_MCEL_PREFER
      77                char const *string_iter = string;
      78  
      79                char32_t last_char_before_tsub = 0;
      80                while (string_iter < tsub_in_string)
      81                  {
      82                    mcel_t g = mcel_scanz (string_iter);
      83                    last_char_before_tsub = g.ch;
      84                    string_iter += g.len;
      85                  }
      86  
      87                string_iter = tsub_in_string;
      88                for (char const *tsub_iter = tsub; *tsub_iter;
      89                     tsub_iter += mcel_scanz (tsub_iter).len)
      90                  string_iter += mcel_scanz (string_iter).len;
      91  
      92                if (!c32isalnum (last_char_before_tsub)
      93                    && !c32isalnum (mcel_scanz (string_iter).ch))
      94                  {
      95                    found = true;
      96                    break;
      97                  }
      98  
      99                if (!*tsub_in_string)
     100                  break;
     101                string = tsub_in_string + mcel_scanz (tsub_in_string).len;
     102  #else
     103                mbui_iterator_t string_iter;
     104                bool word_boundary_before;
     105                bool word_boundary_after;
     106  
     107                mbui_init (string_iter, string);
     108                word_boundary_before = true;
     109                if (mbui_cur_ptr (string_iter) < tsub_in_string)
     110                  {
     111                    mbchar_t last_char_before_tsub;
     112                    do
     113                      {
     114                        if (!mbui_avail (string_iter))
     115                          abort ();
     116                        last_char_before_tsub = mbui_cur (string_iter);
     117                        mbui_advance (string_iter);
     118                      }
     119                    while (mbui_cur_ptr (string_iter) < tsub_in_string);
     120                    if (mb_isalnum (last_char_before_tsub))
     121                      word_boundary_before = false;
     122                  }
     123  
     124                mbui_init (string_iter, tsub_in_string);
     125                {
     126                  mbui_iterator_t tsub_iter;
     127  
     128                  for (mbui_init (tsub_iter, tsub);
     129                       mbui_avail (tsub_iter);
     130                       mbui_advance (tsub_iter))
     131                    {
     132                      if (!mbui_avail (string_iter))
     133                        abort ();
     134                      mbui_advance (string_iter);
     135                    }
     136                }
     137                word_boundary_after = true;
     138                if (mbui_avail (string_iter))
     139                  {
     140                    mbchar_t first_char_after_tsub = mbui_cur (string_iter);
     141                    if (mb_isalnum (first_char_after_tsub))
     142                      word_boundary_after = false;
     143                  }
     144  
     145                if (word_boundary_before && word_boundary_after)
     146                  {
     147                    found = true;
     148                    break;
     149                  }
     150  
     151                mbui_init (string_iter, tsub_in_string);
     152                if (!mbui_avail (string_iter))
     153                  break;
     154                string = tsub_in_string + mb_len (mbui_cur (string_iter));
     155  #endif
     156              }
     157            else
     158              {
     159                if ((string == tsub_in_string
     160                     || !isalnum ((unsigned char) tsub_in_string[-1]))
     161                    && !isalnum ((unsigned char) tsub_in_string[tsublen]))
     162                  {
     163                    found = true;
     164                    break;
     165                  }
     166  
     167                if (*tsub_in_string == '\0')
     168                  break;
     169                string = tsub_in_string + 1;
     170              }
     171          }
     172      }
     173    free (tsub);
     174    return found;
     175  }
     176  
     177  /* Return the localization of NAME.  NAME is written in ASCII.  */
     178  
     179  const char *
     180  proper_name (const char *name)
     181  {
     182    /* See whether there is a translation.   */
     183    const char *translation = gettext (name);
     184  
     185    if (translation != name)
     186      {
     187        /* See whether the translation contains the original name.  */
     188        if (mbsstr_trimmed_wordbounded (translation, name))
     189          return translation;
     190        else
     191          {
     192            /* Return "TRANSLATION (NAME)".  */
     193            char *result =
     194              XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
     195  
     196            sprintf (result, "%s (%s)", translation, name);
     197            return result;
     198          }
     199      }
     200    else
     201      return name;
     202  }
     203  
     204  /* Return the localization of a name whose original writing is not ASCII.
     205     NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
     206     escape sequences.  NAME_ASCII is a fallback written only with ASCII
     207     characters.  */
     208  
     209  const char *
     210  proper_name_utf8 (const char *name_ascii, const char *name_utf8)
     211  {
     212    /* See whether there is a translation.   */
     213    const char *translation = gettext (name_ascii);
     214  
     215    /* Try to convert NAME_UTF8 to the locale encoding.  */
     216    const char *locale_code = locale_charset ();
     217    char *alloc_name_converted = NULL;
     218    char *alloc_name_converted_translit = NULL;
     219    const char *name_converted = NULL;
     220    const char *name_converted_translit = NULL;
     221    const char *name;
     222  
     223    if (c_strcasecmp (locale_code, "UTF-8") != 0)
     224      {
     225  #if HAVE_ICONV
     226        name_converted = alloc_name_converted =
     227          xstr_iconv (name_utf8, "UTF-8", locale_code);
     228  
     229  # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
     230        && !defined __UCLIBC__) \
     231       || _LIBICONV_VERSION >= 0x0105
     232        {
     233          char *converted_translit;
     234  
     235          size_t len = strlen (locale_code);
     236          char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
     237          memcpy (locale_code_translit, locale_code, len);
     238          memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
     239  
     240          converted_translit =
     241            xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
     242  
     243          free (locale_code_translit);
     244  
     245          if (converted_translit != NULL)
     246            {
     247  #  if !_LIBICONV_VERSION
     248              /* Don't use the transliteration if it added question marks.
     249                 glibc's transliteration falls back to question marks; libiconv's
     250                 transliteration does not.
     251                 mbschr is equivalent to strchr in this case.  */
     252              if (strchr (converted_translit, '?') != NULL)
     253                free (converted_translit);
     254              else
     255  #  endif
     256                name_converted_translit = alloc_name_converted_translit =
     257                  converted_translit;
     258            }
     259        }
     260  # endif
     261  #endif
     262      }
     263    else
     264      {
     265        name_converted = name_utf8;
     266        name_converted_translit = name_utf8;
     267      }
     268  
     269    /* The name in locale encoding.  */
     270    name = (name_converted != NULL ? name_converted :
     271            name_converted_translit != NULL ? name_converted_translit :
     272            name_ascii);
     273  
     274    /* See whether we have a translation.  Some translators have not understood
     275       that they should use the UTF-8 form of the name, if possible.  So if the
     276       translator provided a no-op translation, we ignore it.  */
     277    if (strcmp (translation, name_ascii) != 0)
     278      {
     279        /* See whether the translation contains the original name.  */
     280        if (mbsstr_trimmed_wordbounded (translation, name_ascii)
     281            || (name_converted != NULL
     282                && mbsstr_trimmed_wordbounded (translation, name_converted))
     283            || (name_converted_translit != NULL
     284                && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
     285          {
     286            if (alloc_name_converted != NULL)
     287              free (alloc_name_converted);
     288            if (alloc_name_converted_translit != NULL)
     289              free (alloc_name_converted_translit);
     290            return translation;
     291          }
     292        else
     293          {
     294            /* Return "TRANSLATION (NAME)".  */
     295            char *result =
     296              XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
     297  
     298            sprintf (result, "%s (%s)", translation, name);
     299  
     300            if (alloc_name_converted != NULL)
     301              free (alloc_name_converted);
     302            if (alloc_name_converted_translit != NULL)
     303              free (alloc_name_converted_translit);
     304            return result;
     305          }
     306      }
     307    else
     308      {
     309        if (alloc_name_converted != NULL && alloc_name_converted != name)
     310          free (alloc_name_converted);
     311        if (alloc_name_converted_translit != NULL
     312            && alloc_name_converted_translit != name)
     313          free (alloc_name_converted_translit);
     314        return name;
     315      }
     316  }
     317  
     318  #ifdef TEST1
     319  # include <locale.h>
     320  int
     321  main (int argc, char *argv[])
     322  {
     323    setlocale (LC_ALL, "");
     324    if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
     325      printf("found\n");
     326    return 0;
     327  }
     328  #endif
     329  
     330  #ifdef TEST2
     331  # include <locale.h>
     332  # include <stdio.h>
     333  int
     334  main (int argc, char *argv[])
     335  {
     336    setlocale (LC_ALL, "");
     337    printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard"));
     338    return 0;
     339  }
     340  #endif