(root)/
libredwg-0.13/
src/
codepages.c
       1  /*****************************************************************************/
       2  /*  LibreDWG - free implementation of the DWG file format                    */
       3  /*                                                                           */
       4  /*  Copyright (C) 2023 Free Software Foundation, Inc.                        */
       5  /*                                                                           */
       6  /*  This library is free software, licensed under the terms of the GNU       */
       7  /*  General Public License as published by the Free Software Foundation,     */
       8  /*  either version 3 of the License, or (at your option) any later version.  */
       9  /*  You should have received a copy of the GNU General Public License        */
      10  /*  along with this program.  If not, see <http://www.gnu.org/licenses/>.    */
      11  /*****************************************************************************/
      12  
      13  /*
      14   * codepages.c: preR2007 codepages support via iconv
      15   * written by Reini Urban
      16   *
      17   * See also the src mappings from https://www.unicode.org/Public/MAPPINGS/
      18   * or the libdxfrw/src/intern/drw_textcodec.cpp mappings.
      19   */
      20  
      21  #include "config.h"
      22  #include <string.h>
      23  #include <stdint.h>
      24  #include <stdlib.h>
      25  #include <errno.h>
      26  #include <ctype.h>
      27  #include <assert.h>
      28  #if defined HAVE_ICONV && defined HAVE_ICONV_H
      29  #  include <iconv.h>
      30  #endif
      31  // #define CODEPAGES_C
      32  #include "common.h"
      33  #include "codepages.h"
      34  
      35  #include "codepages/ISO-8859-2.h"
      36  #include "codepages/ISO-8859-3.h"
      37  #include "codepages/ISO-8859-4.h"
      38  #include "codepages/ISO-8859-5.h"
      39  #include "codepages/ISO-8859-6.h"
      40  #include "codepages/ISO-8859-7.h"
      41  #include "codepages/ISO-8859-8.h"
      42  #include "codepages/ISO-8859-9.h"
      43  #include "codepages/CP437.h"
      44  #include "codepages/CP850.h"
      45  #include "codepages/CP852.h"
      46  #include "codepages/CP855.h"
      47  #include "codepages/CP857.h"
      48  #include "codepages/CP860.h"
      49  #include "codepages/CP861.h"
      50  #include "codepages/CP863.h"
      51  #include "codepages/CP864.h"
      52  #include "codepages/CP865.h"
      53  #include "codepages/CP869.h"
      54  #include "codepages/CP932.h"
      55  #include "codepages/MACINTOSH.h"
      56  #include "codepages/BIG5.h"
      57  #include "codepages/CP949.h"
      58  #include "codepages/JOHAB.h"
      59  #include "codepages/CP866.h"
      60  #include "codepages/WINDOWS-1250.h"
      61  #include "codepages/WINDOWS-1251.h"
      62  #include "codepages/WINDOWS-1252.h"
      63  #include "codepages/GB2312.h"
      64  #include "codepages/WINDOWS-1253.h"
      65  #include "codepages/WINDOWS-1254.h"
      66  #include "codepages/WINDOWS-1255.h"
      67  #include "codepages/WINDOWS-1256.h"
      68  #include "codepages/WINDOWS-1257.h"
      69  #include "codepages/WINDOWS-874.h"
      70  #include "codepages/WINDOWS-932.h"
      71  #include "codepages/WINDOWS-936.h"
      72  #include "codepages/WINDOWS-949.h"
      73  #include "codepages/WINDOWS-950.h"
      74  #include "codepages/WINDOWS-1361.h"
      75  #include "codepages/WINDOWS-1258.h"
      76  
      77  static const uint16_t *cp_fntbl[] = { NULL, // UTF8
      78                                        NULL, // US-ASCII
      79                                        NULL, // ISO-8859-1
      80                                        cptbl_iso_8859_2,
      81                                        cptbl_iso_8859_3,
      82                                        cptbl_iso_8859_4,
      83                                        cptbl_iso_8859_5,
      84                                        cptbl_iso_8859_6,
      85                                        cptbl_iso_8859_7,
      86                                        cptbl_iso_8859_8,
      87                                        cptbl_iso_8859_9,
      88                                        cptbl_cp437,
      89                                        cptbl_cp850,
      90                                        cptbl_cp852,
      91                                        cptbl_cp855,
      92                                        cptbl_cp857,
      93                                        cptbl_cp860,
      94                                        cptbl_cp861,
      95                                        cptbl_cp863,
      96                                        cptbl_cp864,
      97                                        cptbl_cp865,
      98                                        cptbl_cp869,
      99                                        cptbl_cp932, /* original shiftjis */
     100                                        cptbl_macintosh,
     101                                        cptbl_big5,
     102                                        cptbl_cp949, /* 25 */
     103                                        cptbl_johab, /* 26 */
     104                                        cptbl_cp866,
     105                                        cptbl_windows_1250,
     106                                        cptbl_windows_1251, /* 29 */
     107                                        cptbl_windows_1252, /* 30 */
     108                                        cptbl_gb2312,
     109                                        cptbl_windows_1253,
     110                                        cptbl_windows_1254,
     111                                        cptbl_windows_1255,
     112                                        cptbl_windows_1256,
     113                                        cptbl_windows_1257,
     114                                        cptbl_windows_874,
     115                                        cptbl_windows_932, /* windows-31j */
     116                                        cptbl_windows_936,
     117                                        cptbl_windows_949,
     118                                        cptbl_windows_950,
     119                                        cptbl_windows_1361, /* 42 */
     120                                        NULL,               /* 43 UTF16 */
     121                                        cptbl_windows_1258,
     122                                        NULL };
     123  
     124  // synced with typedef enum _dwg_codepage in codepages.h
     125  #ifdef HAVE_ICONV
     126  
     127  const char *
     128  dwg_codepage_iconvstr (Dwg_Codepage cp)
     129  {
     130    // for iconv
     131    const char *_codepage_iconvstr[] = { "UTF8",         "US-ASCII",
     132                                         "ISO-8859-1",   "ISO-8859-2",
     133                                         "ISO-8859-3",   "ISO-8859-4",
     134                                         "ISO-8859-5",   "ISO-8859-6",
     135                                         "ISO-8859-7",   "ISO-8859-8",
     136                                         "ISO-8859-9",   "CP437",
     137                                         "CP850",        "CP852",
     138                                         "CP855",        "CP857",
     139                                         "CP860",        "CP861",
     140                                         "CP863",        "CP864",
     141                                         "CP865",        "CP869",
     142                                         "CP932",        "MACINTOSH",
     143                                         "BIG5",         "CP949", /* 25 */
     144                                         "JOHAB",        "CP866",
     145                                         "WINDOWS-1250", "WINDOWS-1251", /* 29 */
     146                                         "WINDOWS-1252",                 /* 30 */
     147                                         "GB2312",       "WINDOWS-1253",
     148                                         "WINDOWS-1254", "WINDOWS-1255",
     149                                         "WINDOWS-1256", "WINDOWS-1257",
     150                                         "WINDOWS-874",  "WINDOWS-932",
     151                                         "WINDOWS-936",  "WINDOWS-949",
     152                                         "WINDOWS-950",  "WINDOWS-1361",
     153                                         "UTF16", /* 43 */
     154                                         "WINDOWS-1258", NULL };
     155    if (cp <= CP_ANSI_1258)
     156      return _codepage_iconvstr[cp];
     157    else
     158      return NULL;
     159  }
     160  #endif
     161  
     162  const char *_codepage_dxfstr[]
     163      = { "UTF8",       "US_ASCII",   "ISO-8859-1", "ISO-8859-2", "ISO-8859-3",
     164          "ISO-8859-4", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
     165          "ISO-8859-9", "CP437",      "CP850",      "CP852",      "CP855",
     166          "CP857",      "CP860",      "CP861",      "CP863",      "CP864",
     167          "CP865",      "CP869",      "CP932",      "MACINTOSH",  "BIG5",
     168          "CP949",                                               /* 25 */
     169          "JOHAB",      "CP866",      "ANSI_1250",  "ANSI_1251", /* 29 */
     170          "ANSI_1252", /* 30 WesternEurope Windows */
     171          "GB2312",     "ANSI_1253",  "ANSI_1254",  "ANSI_1255",  "ANSI_1256",
     172          "ANSI_1257",  "ANSI_874",   "ANSI_932",   "ANSI_936",   "ANSI_949",
     173          "ANSI_950",   "ANSI_1361",  "UTF16", /* 43 */
     174          "ANSI_1258",  NULL };
     175  
     176  const char *
     177  dwg_codepage_dxfstr (Dwg_Codepage cp)
     178  {
     179    if (cp <= CP_ANSI_1258)
     180      return _codepage_dxfstr[cp];
     181    else if (cp == CP_UNDEFINED)
     182      return "undefined";
     183    else
     184      return NULL;
     185  }
     186  
     187  Dwg_Codepage
     188  dwg_codepage_int (const char *s)
     189  {
     190    for (int i = 0; i <= (int)CP_ANSI_1258; i++)
     191      {
     192        if (strEQ (s, _codepage_dxfstr[i]))
     193          return (Dwg_Codepage)i;
     194        if (islower (*s) && 0 == strcasecmp (s, _codepage_dxfstr[i]))
     195          return (Dwg_Codepage)i;
     196      }
     197    return CP_UNDEFINED;
     198  }
     199  
     200  /* helper to check if a codepoint exists in the codepage,
     201     and convert it to/from unicode.
     202     dir = 1: from unicode wc to charset
     203     asian = 1: 2-byte CJK charset, else 1-byte (0-255)
     204  */
     205  static wchar_t
     206  codepage_helper (const Dwg_Codepage codepage, const wchar_t wc, const int dir,
     207                   const int asian)
     208  {
     209    const uint16_t *fntbl;
     210    uint16_t maxc;
     211    assert (codepage != CP_UTF8 && codepage != CP_UTF16
     212            && codepage != CP_US_ASCII && codepage != CP_ISO_8859_1);
     213    fntbl = cp_fntbl[codepage];
     214    maxc = fntbl[0];
     215    assert (maxc);
     216    if (dir) // from unicode to charset
     217      {      // reverse lookup
     218        for (uint16_t i = 0x80; i < maxc; i++)
     219          {
     220            if (wc == fntbl[i])
     221              return i;
     222          }
     223        return 0;
     224      }
     225    else
     226      {
     227        if (wc < maxc)
     228          return fntbl[wc];
     229        else
     230          return 0;
     231      }
     232  }
     233  
     234  // returns the matching unicode codepoint,
     235  // or 0 if the codepage does not contain the character
     236  wchar_t
     237  dwg_codepage_uc (Dwg_Codepage cp, unsigned char c)
     238  {
     239    if (c < 128)
     240      return (wchar_t)c;
     241    else if (cp == CP_US_ASCII)
     242      return 0;
     243    if (cp == CP_ISO_8859_1 || cp == CP_UTF8 || cp == CP_UTF16)
     244      return (wchar_t)c;
     245    return codepage_helper (cp, (wchar_t)c, 0, 0);
     246  }
     247  // for wide asian chars
     248  wchar_t
     249  dwg_codepage_uwc (Dwg_Codepage cp, uint16_t c)
     250  {
     251    if (cp == CP_CP864 && c == 0x25)
     252      return 0x066a;
     253    else if (cp == CP_CP932 && c == 0x5c)
     254      return 0x00A5;
     255    else if (cp == CP_CP932 && c == 0x7e)
     256      return 0x203E;
     257    else if (cp == CP_JOHAB && c == 0x5c)
     258      return 0x20A9;
     259    else if (c < 128 || cp == CP_UTF8 || cp == CP_UTF16)
     260      return (wchar_t)c;
     261    return codepage_helper (cp, (wchar_t)c, 0, 1);
     262  }
     263  // returns the matching codepoint,
     264  // or 0 if the codepage does not contain the wide character
     265  unsigned char
     266  dwg_codepage_c (Dwg_Codepage cp, wchar_t wc)
     267  {
     268    if (wc < 128)
     269      {
     270        if (cp == CP_US_ASCII || cp == CP_UTF8 || cp == CP_UTF16)
     271          return wc & 0xff;
     272      }
     273    else if (cp == CP_US_ASCII)
     274      return 0;
     275    if (cp == CP_ISO_8859_1 || cp == CP_UTF8)
     276      return wc < 256 ? wc : 0;
     277    return (unsigned char)codepage_helper (cp, wc, 1, 0);
     278  }
     279  // for wide asian chars
     280  uint16_t
     281  dwg_codepage_wc (Dwg_Codepage cp, wchar_t wc)
     282  {
     283    if (wc < 128 || cp == CP_UTF8 || cp == CP_UTF16)
     284      return wc & 0xffff;
     285    return (uint16_t)codepage_helper (cp, wc, 1, 1);
     286  }
     287  
     288  /* for possible wide asian chars:
     289     932 is single-byte for most chars, but 0x8*, 0x9*, 0xE* and 0xF* lead bytes
     290     CP949, JOHAB, ANSI_949, 936, 950 for all > 0x8* lead bytes
     291     1361 for all but 0x8[0123], 0xD[4567F], 0xF[A-F] lead bytes
     292     BIG5, GB2312 are two-byte only.
     293  
     294     none have valid 0x00 bytes, so strlen works as before in the TV case.
     295  */
     296  bool
     297  dwg_codepage_isasian (const Dwg_Codepage cp)
     298  {
     299    if (cp >= CP_BIG5 && cp <= CP_JOHAB)
     300      return true;
     301    else if (cp >= CP_ANSI_932 && cp <= CP_ANSI_1258)
     302      return true;
     303    else if (cp == CP_GB2312)
     304      return true;
     305    else
     306      return false;
     307  }
     308  
     309  bool
     310  dwg_codepage_is_twobyte (const Dwg_Codepage cp, const unsigned char c)
     311  {
     312    if (cp == CP_CP932 || cp == CP_ANSI_932)
     313      return (c >= 0x80 && c <= 0x9F) || (c >= 0xE0);
     314    else if (cp == CP_CP949 || cp == CP_ANSI_949 || cp == CP_ANSI_936
     315             || cp == CP_ANSI_950)
     316      return c & 0x80;
     317    else if (cp == CP_ANSI_1361)
     318      return (c >= 0x80 && c <= 0x83) || (c >= 0xD4 && c <= 0xD7) || (c == 0xDF)
     319             || (c >= 0xFA);
     320    else if (cp == CP_GB2312 || cp == CP_BIG5)
     321      return true;
     322    else
     323      return false;
     324  }