1 /*****************************************************************************/
2 /* LibreDWG - free implementation of the DWG file format */
3 /* */
4 /* Copyright (C) 2023 Free Software Foundation, Inc. */
5 /* */
6 /* This library is free software, licensed under the terms of the GNU */
7 /* General Public License as published by the Free Software Foundation, */
8 /* either version 3 of the License, or (at your option) any later version. */
9 /* You should have received a copy of the GNU General Public License */
10 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
11 /*****************************************************************************/
12
13 /*
14 * codepages.c: preR2007 codepages support via iconv
15 * written by Reini Urban
16 *
17 * See also the src mappings from https://www.unicode.org/Public/MAPPINGS/
18 * or the libdxfrw/src/intern/drw_textcodec.cpp mappings.
19 */
20
21 #include "config.h"
22 #include <string.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <errno.h>
26 #include <ctype.h>
27 #include <assert.h>
28 #if defined HAVE_ICONV && defined HAVE_ICONV_H
29 # include <iconv.h>
30 #endif
31 // #define CODEPAGES_C
32 #include "common.h"
33 #include "codepages.h"
34
35 #include "codepages/ISO-8859-2.h"
36 #include "codepages/ISO-8859-3.h"
37 #include "codepages/ISO-8859-4.h"
38 #include "codepages/ISO-8859-5.h"
39 #include "codepages/ISO-8859-6.h"
40 #include "codepages/ISO-8859-7.h"
41 #include "codepages/ISO-8859-8.h"
42 #include "codepages/ISO-8859-9.h"
43 #include "codepages/CP437.h"
44 #include "codepages/CP850.h"
45 #include "codepages/CP852.h"
46 #include "codepages/CP855.h"
47 #include "codepages/CP857.h"
48 #include "codepages/CP860.h"
49 #include "codepages/CP861.h"
50 #include "codepages/CP863.h"
51 #include "codepages/CP864.h"
52 #include "codepages/CP865.h"
53 #include "codepages/CP869.h"
54 #include "codepages/CP932.h"
55 #include "codepages/MACINTOSH.h"
56 #include "codepages/BIG5.h"
57 #include "codepages/CP949.h"
58 #include "codepages/JOHAB.h"
59 #include "codepages/CP866.h"
60 #include "codepages/WINDOWS-1250.h"
61 #include "codepages/WINDOWS-1251.h"
62 #include "codepages/WINDOWS-1252.h"
63 #include "codepages/GB2312.h"
64 #include "codepages/WINDOWS-1253.h"
65 #include "codepages/WINDOWS-1254.h"
66 #include "codepages/WINDOWS-1255.h"
67 #include "codepages/WINDOWS-1256.h"
68 #include "codepages/WINDOWS-1257.h"
69 #include "codepages/WINDOWS-874.h"
70 #include "codepages/WINDOWS-932.h"
71 #include "codepages/WINDOWS-936.h"
72 #include "codepages/WINDOWS-949.h"
73 #include "codepages/WINDOWS-950.h"
74 #include "codepages/WINDOWS-1361.h"
75 #include "codepages/WINDOWS-1258.h"
76
77 static const uint16_t *cp_fntbl[] = { NULL, // UTF8
78 NULL, // US-ASCII
79 NULL, // ISO-8859-1
80 cptbl_iso_8859_2,
81 cptbl_iso_8859_3,
82 cptbl_iso_8859_4,
83 cptbl_iso_8859_5,
84 cptbl_iso_8859_6,
85 cptbl_iso_8859_7,
86 cptbl_iso_8859_8,
87 cptbl_iso_8859_9,
88 cptbl_cp437,
89 cptbl_cp850,
90 cptbl_cp852,
91 cptbl_cp855,
92 cptbl_cp857,
93 cptbl_cp860,
94 cptbl_cp861,
95 cptbl_cp863,
96 cptbl_cp864,
97 cptbl_cp865,
98 cptbl_cp869,
99 cptbl_cp932, /* original shiftjis */
100 cptbl_macintosh,
101 cptbl_big5,
102 cptbl_cp949, /* 25 */
103 cptbl_johab, /* 26 */
104 cptbl_cp866,
105 cptbl_windows_1250,
106 cptbl_windows_1251, /* 29 */
107 cptbl_windows_1252, /* 30 */
108 cptbl_gb2312,
109 cptbl_windows_1253,
110 cptbl_windows_1254,
111 cptbl_windows_1255,
112 cptbl_windows_1256,
113 cptbl_windows_1257,
114 cptbl_windows_874,
115 cptbl_windows_932, /* windows-31j */
116 cptbl_windows_936,
117 cptbl_windows_949,
118 cptbl_windows_950,
119 cptbl_windows_1361, /* 42 */
120 NULL, /* 43 UTF16 */
121 cptbl_windows_1258,
122 NULL };
123
124 // synced with typedef enum _dwg_codepage in codepages.h
125 #ifdef HAVE_ICONV
126
127 const char *
128 dwg_codepage_iconvstr (Dwg_Codepage cp)
129 {
130 // for iconv
131 const char *_codepage_iconvstr[] = { "UTF8", "US-ASCII",
132 "ISO-8859-1", "ISO-8859-2",
133 "ISO-8859-3", "ISO-8859-4",
134 "ISO-8859-5", "ISO-8859-6",
135 "ISO-8859-7", "ISO-8859-8",
136 "ISO-8859-9", "CP437",
137 "CP850", "CP852",
138 "CP855", "CP857",
139 "CP860", "CP861",
140 "CP863", "CP864",
141 "CP865", "CP869",
142 "CP932", "MACINTOSH",
143 "BIG5", "CP949", /* 25 */
144 "JOHAB", "CP866",
145 "WINDOWS-1250", "WINDOWS-1251", /* 29 */
146 "WINDOWS-1252", /* 30 */
147 "GB2312", "WINDOWS-1253",
148 "WINDOWS-1254", "WINDOWS-1255",
149 "WINDOWS-1256", "WINDOWS-1257",
150 "WINDOWS-874", "WINDOWS-932",
151 "WINDOWS-936", "WINDOWS-949",
152 "WINDOWS-950", "WINDOWS-1361",
153 "UTF16", /* 43 */
154 "WINDOWS-1258", NULL };
155 if (cp <= CP_ANSI_1258)
156 return _codepage_iconvstr[cp];
157 else
158 return NULL;
159 }
160 #endif
161
/* Codepage names as written in DXF, in codepage-number order and closed
   by a NULL sentinel. Used by dwg_codepage_dxfstr and dwg_codepage_int. */
const char *_codepage_dxfstr[] = {
  /*  0 */ "UTF8",
  /*  1 */ "US_ASCII",
  /*  2 */ "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4",
  /*  6 */ "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
  /* 10 */ "ISO-8859-9",
  /* 11 */ "CP437", "CP850", "CP852", "CP855", "CP857", "CP860",
  /* 17 */ "CP861", "CP863", "CP864", "CP865", "CP869", "CP932",
  /* 23 */ "MACINTOSH",
  /* 24 */ "BIG5",
  /* 25 */ "CP949",
  /* 26 */ "JOHAB",
  /* 27 */ "CP866",
  /* 28 */ "ANSI_1250", "ANSI_1251",
  /* 30 */ "ANSI_1252", /* WesternEurope Windows */
  /* 31 */ "GB2312",
  /* 32 */ "ANSI_1253", "ANSI_1254", "ANSI_1255", "ANSI_1256", "ANSI_1257",
  /* 37 */ "ANSI_874", "ANSI_932", "ANSI_936", "ANSI_949", "ANSI_950",
  /* 42 */ "ANSI_1361",
  /* 43 */ "UTF16",
  /* 44 */ "ANSI_1258",
  NULL
};
175
176 const char *
177 dwg_codepage_dxfstr (Dwg_Codepage cp)
178 {
179 if (cp <= CP_ANSI_1258)
180 return _codepage_dxfstr[cp];
181 else if (cp == CP_UNDEFINED)
182 return "undefined";
183 else
184 return NULL;
185 }
186
187 Dwg_Codepage
188 dwg_codepage_int (const char *s)
189 {
190 for (int i = 0; i <= (int)CP_ANSI_1258; i++)
191 {
192 if (strEQ (s, _codepage_dxfstr[i]))
193 return (Dwg_Codepage)i;
194 if (islower (*s) && 0 == strcasecmp (s, _codepage_dxfstr[i]))
195 return (Dwg_Codepage)i;
196 }
197 return CP_UNDEFINED;
198 }
199
200 /* helper to check if a codepoint exists in the codepage,
201 and convert it to/from unicode.
202 dir = 1: from unicode wc to charset
203 asian = 1: 2-byte CJK charset, else 1-byte (0-255)
204 */
205 static wchar_t
206 codepage_helper (const Dwg_Codepage codepage, const wchar_t wc, const int dir,
207 const int asian)
208 {
209 const uint16_t *fntbl;
210 uint16_t maxc;
211 assert (codepage != CP_UTF8 && codepage != CP_UTF16
212 && codepage != CP_US_ASCII && codepage != CP_ISO_8859_1);
213 fntbl = cp_fntbl[codepage];
214 maxc = fntbl[0];
215 assert (maxc);
216 if (dir) // from unicode to charset
217 { // reverse lookup
218 for (uint16_t i = 0x80; i < maxc; i++)
219 {
220 if (wc == fntbl[i])
221 return i;
222 }
223 return 0;
224 }
225 else
226 {
227 if (wc < maxc)
228 return fntbl[wc];
229 else
230 return 0;
231 }
232 }
233
234 // returns the matching unicode codepoint,
235 // or 0 if the codepage does not contain the character
236 wchar_t
237 dwg_codepage_uc (Dwg_Codepage cp, unsigned char c)
238 {
239 if (c < 128)
240 return (wchar_t)c;
241 else if (cp == CP_US_ASCII)
242 return 0;
243 if (cp == CP_ISO_8859_1 || cp == CP_UTF8 || cp == CP_UTF16)
244 return (wchar_t)c;
245 return codepage_helper (cp, (wchar_t)c, 0, 0);
246 }
247 // for wide asian chars
248 wchar_t
249 dwg_codepage_uwc (Dwg_Codepage cp, uint16_t c)
250 {
251 if (cp == CP_CP864 && c == 0x25)
252 return 0x066a;
253 else if (cp == CP_CP932 && c == 0x5c)
254 return 0x00A5;
255 else if (cp == CP_CP932 && c == 0x7e)
256 return 0x203E;
257 else if (cp == CP_JOHAB && c == 0x5c)
258 return 0x20A9;
259 else if (c < 128 || cp == CP_UTF8 || cp == CP_UTF16)
260 return (wchar_t)c;
261 return codepage_helper (cp, (wchar_t)c, 0, 1);
262 }
263 // returns the matching codepoint,
264 // or 0 if the codepage does not contain the wide character
265 unsigned char
266 dwg_codepage_c (Dwg_Codepage cp, wchar_t wc)
267 {
268 if (wc < 128)
269 {
270 if (cp == CP_US_ASCII || cp == CP_UTF8 || cp == CP_UTF16)
271 return wc & 0xff;
272 }
273 else if (cp == CP_US_ASCII)
274 return 0;
275 if (cp == CP_ISO_8859_1 || cp == CP_UTF8)
276 return wc < 256 ? wc : 0;
277 return (unsigned char)codepage_helper (cp, wc, 1, 0);
278 }
279 // for wide asian chars
280 uint16_t
281 dwg_codepage_wc (Dwg_Codepage cp, wchar_t wc)
282 {
283 if (wc < 128 || cp == CP_UTF8 || cp == CP_UTF16)
284 return wc & 0xffff;
285 return (uint16_t)codepage_helper (cp, wc, 1, 1);
286 }
287
288 /* for possible wide asian chars:
289 932 is single-byte for most chars, but 0x8*, 0x9*, 0xE* and 0xF* lead bytes
290 CP949, JOHAB, ANSI_949, 936, 950 for all > 0x8* lead bytes
291 1361 for all but 0x8[0123], 0xD[4567F], 0xF[A-F] lead bytes
292 BIG5, GB2312 are two-byte only.
293
294 none have valid 0x00 bytes, so strlen works as before in the TV case.
295 */
296 bool
297 dwg_codepage_isasian (const Dwg_Codepage cp)
298 {
299 if (cp >= CP_BIG5 && cp <= CP_JOHAB)
300 return true;
301 else if (cp >= CP_ANSI_932 && cp <= CP_ANSI_1258)
302 return true;
303 else if (cp == CP_GB2312)
304 return true;
305 else
306 return false;
307 }
308
309 bool
310 dwg_codepage_is_twobyte (const Dwg_Codepage cp, const unsigned char c)
311 {
312 if (cp == CP_CP932 || cp == CP_ANSI_932)
313 return (c >= 0x80 && c <= 0x9F) || (c >= 0xE0);
314 else if (cp == CP_CP949 || cp == CP_ANSI_949 || cp == CP_ANSI_936
315 || cp == CP_ANSI_950)
316 return c & 0x80;
317 else if (cp == CP_ANSI_1361)
318 return (c >= 0x80 && c <= 0x83) || (c >= 0xD4 && c <= 0xD7) || (c == 0xDF)
319 || (c >= 0xFA);
320 else if (cp == CP_GB2312 || cp == CP_BIG5)
321 return true;
322 else
323 return false;
324 }