1  /* Recode Serbian text from Cyrillic to Latin script.
       2     Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
       3     Written by Danilo Šegan <danilo@gnome.org>, 2006,
       4     and Bruno Haible <bruno@clisp.org>, 2006.
       5  
       6     This program is free software: you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3 of the License, or
       9     (at your option) any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      18  
      19  #ifdef HAVE_CONFIG_H
      20  # include <config.h>
      21  #endif
      22  
      23  /* Specification.  */
      24  #include "filters.h"
      25  
      26  #include <stdlib.h>
      27  
      28  #include "xalloc.h"
      29  
      30  
/* Table for Serbian Cyrillic to Latin transcription.
   The table covers the Unicode code points in the range 0x0400..0x04ef;
   the entry for the code point UC is table[UC - 0x0400], so the position of
   every row matters.
   Each entry is the replacement text as a NUL-terminated sequence of UTF-8
   bytes.  An empty string means that there is no replacement for this code
   point.
   The longest table entry is three bytes long.  */
static const char table[240][3 + 1] =
{
  /* U+0400 */ "\xC3\x88", /* "È" */
  /* U+0401 */ "",
  /* U+0402 */ "\xC4\x90", /* "Đ" */
  /* U+0403 */ "",
  /* U+0404 */ "",
  /* U+0405 */ "",
  /* U+0406 */ "",
  /* U+0407 */ "",
  /* U+0408 */ "J",
  /* U+0409 */ "Lj",
  /* U+040A */ "Nj",
  /* U+040B */ "\xC4\x86", /* "Ć" */
  /* U+040C */ "",
  /* U+040D */ "\xC3\x8C", /* "Ì" */
  /* U+040E */ "",
  /* U+040F */ "D\xC5\xBE", /* "Dž" */
  /* U+0410 */ "A",
  /* U+0411 */ "B",
  /* U+0412 */ "V",
  /* U+0413 */ "G",
  /* U+0414 */ "D",
  /* U+0415 */ "E",
  /* U+0416 */ "\xC5\xBD", /* "Ž" */
  /* U+0417 */ "Z",
  /* U+0418 */ "I",
  /* U+0419 */ "",
  /* U+041A */ "K",
  /* U+041B */ "L",
  /* U+041C */ "M",
  /* U+041D */ "N",
  /* U+041E */ "O",
  /* U+041F */ "P",
  /* U+0420 */ "R",
  /* U+0421 */ "S",
  /* U+0422 */ "T",
  /* U+0423 */ "U",
  /* U+0424 */ "F",
  /* U+0425 */ "H",
  /* U+0426 */ "C",
  /* U+0427 */ "\xC4\x8C", /* "Č" */
  /* U+0428 */ "\xC5\xA0", /* "Š" */
  /* U+0429 */ "",
  /* U+042A */ "",
  /* U+042B */ "",
  /* U+042C */ "",
  /* U+042D */ "",
  /* U+042E */ "",
  /* U+042F */ "",
  /* U+0430 */ "a",
  /* U+0431 */ "b",
  /* U+0432 */ "v",
  /* U+0433 */ "g",
  /* U+0434 */ "d",
  /* U+0435 */ "e",
  /* U+0436 */ "\xC5\xBE", /* "ž" */
  /* U+0437 */ "z",
  /* U+0438 */ "i",
  /* U+0439 */ "",
  /* U+043A */ "k",
  /* U+043B */ "l",
  /* U+043C */ "m",
  /* U+043D */ "n",
  /* U+043E */ "o",
  /* U+043F */ "p",
  /* U+0440 */ "r",
  /* U+0441 */ "s",
  /* U+0442 */ "t",
  /* U+0443 */ "u",
  /* U+0444 */ "f",
  /* U+0445 */ "h",
  /* U+0446 */ "c",
  /* U+0447 */ "\xC4\x8D", /* "č" */
  /* U+0448 */ "\xC5\xA1", /* "š" */
  /* U+0449 */ "",
  /* U+044A */ "",
  /* U+044B */ "",
  /* U+044C */ "",
  /* U+044D */ "",
  /* U+044E */ "",
  /* U+044F */ "",
  /* U+0450 */ "\xC3\xA8", /* "è" */
  /* U+0451 */ "",
  /* U+0452 */ "\xC4\x91", /* "đ" */
  /* U+0453 */ "",
  /* U+0454 */ "",
  /* U+0455 */ "",
  /* U+0456 */ "",
  /* U+0457 */ "",
  /* U+0458 */ "j",
  /* U+0459 */ "lj",
  /* U+045A */ "nj",
  /* U+045B */ "\xC4\x87", /* "ć" */
  /* U+045C */ "",
  /* U+045D */ "\xC3\xAC", /* "ì" */
  /* U+045E */ "",
  /* U+045F */ "d\xC5\xBE", /* "dž" */
  /* U+0460 */ "",
  /* U+0461 */ "",
  /* U+0462 */ "",
  /* U+0463 */ "",
  /* U+0464 */ "",
  /* U+0465 */ "",
  /* U+0466 */ "",
  /* U+0467 */ "",
  /* U+0468 */ "",
  /* U+0469 */ "",
  /* U+046A */ "",
  /* U+046B */ "",
  /* U+046C */ "",
  /* U+046D */ "",
  /* U+046E */ "",
  /* U+046F */ "",
  /* U+0470 */ "",
  /* U+0471 */ "",
  /* U+0472 */ "",
  /* U+0473 */ "",
  /* U+0474 */ "",
  /* U+0475 */ "",
  /* U+0476 */ "",
  /* U+0477 */ "",
  /* U+0478 */ "",
  /* U+0479 */ "",
  /* U+047A */ "",
  /* U+047B */ "",
  /* U+047C */ "",
  /* U+047D */ "",
  /* U+047E */ "",
  /* U+047F */ "",
  /* U+0480 */ "",
  /* U+0481 */ "",
  /* U+0482 */ "",
  /* U+0483 */ "",
  /* U+0484 */ "",
  /* U+0485 */ "",
  /* U+0486 */ "",
  /* U+0487 */ "",
  /* U+0488 */ "",
  /* U+0489 */ "",
  /* U+048A */ "",
  /* U+048B */ "",
  /* U+048C */ "",
  /* U+048D */ "",
  /* U+048E */ "",
  /* U+048F */ "",
  /* U+0490 */ "",
  /* U+0491 */ "",
  /* U+0492 */ "",
  /* U+0493 */ "",
  /* U+0494 */ "",
  /* U+0495 */ "",
  /* U+0496 */ "",
  /* U+0497 */ "",
  /* U+0498 */ "",
  /* U+0499 */ "",
  /* U+049A */ "",
  /* U+049B */ "",
  /* U+049C */ "",
  /* U+049D */ "",
  /* U+049E */ "",
  /* U+049F */ "",
  /* U+04A0 */ "",
  /* U+04A1 */ "",
  /* U+04A2 */ "",
  /* U+04A3 */ "",
  /* U+04A4 */ "",
  /* U+04A5 */ "",
  /* U+04A6 */ "",
  /* U+04A7 */ "",
  /* U+04A8 */ "",
  /* U+04A9 */ "",
  /* U+04AA */ "",
  /* U+04AB */ "",
  /* U+04AC */ "",
  /* U+04AD */ "",
  /* U+04AE */ "",
  /* U+04AF */ "",
  /* U+04B0 */ "",
  /* U+04B1 */ "",
  /* U+04B2 */ "",
  /* U+04B3 */ "",
  /* U+04B4 */ "",
  /* U+04B5 */ "",
  /* U+04B6 */ "",
  /* U+04B7 */ "",
  /* U+04B8 */ "",
  /* U+04B9 */ "",
  /* U+04BA */ "",
  /* U+04BB */ "",
  /* U+04BC */ "",
  /* U+04BD */ "",
  /* U+04BE */ "",
  /* U+04BF */ "",
  /* U+04C0 */ "",
  /* U+04C1 */ "",
  /* U+04C2 */ "",
  /* U+04C3 */ "",
  /* U+04C4 */ "",
  /* U+04C5 */ "",
  /* U+04C6 */ "",
  /* U+04C7 */ "",
  /* U+04C8 */ "",
  /* U+04C9 */ "",
  /* U+04CA */ "",
  /* U+04CB */ "",
  /* U+04CC */ "",
  /* U+04CD */ "",
  /* U+04CE */ "",
  /* U+04CF */ "",
  /* U+04D0 */ "",
  /* U+04D1 */ "",
  /* U+04D2 */ "",
  /* U+04D3 */ "",
  /* U+04D4 */ "",
  /* U+04D5 */ "",
  /* U+04D6 */ "",
  /* U+04D7 */ "",
  /* U+04D8 */ "",
  /* U+04D9 */ "",
  /* U+04DA */ "",
  /* U+04DB */ "",
  /* U+04DC */ "",
  /* U+04DD */ "",
  /* U+04DE */ "",
  /* U+04DF */ "",
  /* U+04E0 */ "",
  /* U+04E1 */ "",
  /* U+04E2 */ "\xC4\xAA", /* "Ī" */
  /* U+04E3 */ "\xC4\xAB", /* "ī" */
  /* U+04E4 */ "",
  /* U+04E5 */ "",
  /* U+04E6 */ "",
  /* U+04E7 */ "",
  /* U+04E8 */ "",
  /* U+04E9 */ "",
  /* U+04EA */ "",
  /* U+04EB */ "",
  /* U+04EC */ "",
  /* U+04ED */ "",
  /* U+04EE */ "\xC5\xAA", /* "Ū" */
  /* U+04EF */ "\xC5\xAB" /* "ū" */
};
     277  
     278  /* Quick test for an uppercase character in the range U+0041..U+005A.
     279     The argument must be a byte in the range 0..UCHAR_MAX.  */
     280  #define IS_UPPERCASE_LATIN(byte) \
     281    ((unsigned char) ((byte) - 'A') <= 'Z' - 'A')
     282  
     283  /* Quick test for an uppercase character in the range U+0400..U+042F,
     284     or exactly U+04E2 or U+04EE.
     285     The arguments must be bytes in the range 0..UCHAR_MAX.  */
     286  #define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
     287    (((byte1) == 0xd0 && (unsigned char) ((byte2) - 0x80) < 0x30) \
     288     || ((byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae)))
     289  
     290  void
     291  serbian_to_latin (const char *input, size_t input_len,
     292                    char **output_p, size_t *output_len_p)
     293  {
     294    /* Loop through the input string, producing a replacement for each character.
     295       Only characters in the range U+0400..U+04EF (\xD0\x80..\xD3\xAF) need to
     296       be handled, and more precisely only those for which a replacement exists
     297       in the table.  Other characters are copied without modification.
     298       The characters U+0409, U+040A, U+040F are transliterated to uppercase or
     299       mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending
     300       on the case of the surrounding characters.
     301       Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the
     302       beginning of a character; the second and further bytes of a character are
     303       all in the range \x80..\xBF.  */
     304  
     305    /* Since sequences of 2 bytes are mapped to sequences of at most 3 bytes,
     306       the size of the output will be at most 1.5 * input_len.  */
     307    size_t allocated = input_len + (input_len >> 1);
     308    char *output = XNMALLOC (allocated, char);
     309  
     310    const char *input_end = input + input_len;
     311    const char *ip;
     312    char *op;
     313  
     314    for (ip = input, op = output; ip < input_end; )
     315      {
     316        unsigned char byte = (unsigned char) *ip;
     317  
     318        /* Test for the first byte of a Cyrillic character.  */
     319        if ((byte >= 0xd0 && byte <= 0xd3) && (ip + 1 < input_end))
     320          {
     321            unsigned char second_byte = (unsigned char) ip[1];
     322  
     323            /* Verify the second byte is valid.  */
     324            if (second_byte >= 0x80 && second_byte < 0xc0)
     325              {
     326                unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
     327  
     328                if (uc >= 0x0400 && uc <= 0x04ef)
     329                  {
     330                    /* Look up replacement from the table.  */
     331                    const char *repl = table[uc - 0x0400];
     332  
     333                    if (repl[0] != '\0')
     334                      {
     335                        /* Found a replacement.
     336                           Now handle the special cases.  */
     337                        if (uc == 0x0409 || uc == 0x040a || uc == 0x040f)
     338                          if ((ip + 2 < input_end
     339                               && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
     340                              || (ip + 3 < input_end
     341                                  && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
     342                                                            (unsigned char) ip[3]))
     343                              || (ip >= input + 1
     344                                  && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
     345                              || (ip >= input + 2
     346                                  && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
     347                                                            (unsigned char) ip[-1])))
     348                            {
     349                              /* Use the upper-case replacement instead of
     350                                 the mixed-case replacement.  */
     351                              switch (uc)
     352                                {
     353                                case 0x0409:
     354                                  repl = "LJ"; break;
     355                                case 0x040a:
     356                                  repl = "NJ"; break;
     357                                case 0x040f:
     358                                  repl = "D\xC5\xBD"/* "DŽ" */; break;
     359                                default:
     360                                  abort ();
     361                                }
     362                            }
     363  
     364                        /* Use the replacement.  */
     365                        *op++ = *repl++;
     366                        if (*repl != '\0')
     367                          {
     368                            *op++ = *repl++;
     369                            if (*repl != '\0')
     370                              {
     371                                *op++ = *repl++;
     372                                /* All replacements have at most 3 bytes.  */
     373                                if (*repl != '\0')
     374                                  abort ();
     375                              }
     376                          }
     377                        ip += 2;
     378                        continue;
     379                      }
     380                  }
     381              }
     382          }
     383        *op++ = *ip++;
     384      }
     385  
     386    {
     387      size_t output_len = op - output;
     388  
     389      /* Verify that the allocated size was not exceeded.  */
     390      if (output_len > allocated)
     391        abort ();
     392      /* Shrink the result.  */
     393      if (output_len < allocated)
     394        output = (char *) xrealloc (output, output_len);
     395  
     396      /* Done.  */
     397      *output_p = output;
     398      *output_len_p = output_len;
     399    }
     400  }