1 /* Association between Unicode characters and their names.
2 Copyright (C) 2000-2002, 2005-2007, 2009-2023 Free Software Foundation, Inc.
3
4 This file is free software.
5 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
6 You can redistribute it and/or modify it under either
7 - the terms of the GNU Lesser General Public License as published
8 by the Free Software Foundation, either version 3, or (at your
9 option) any later version, or
10 - the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 2, or (at your option)
12 any later version, or
13 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
14
15 This file is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License and the GNU General Public License
19 for more details.
20
21 You should have received a copy of the GNU Lesser General Public
22 License and of the GNU General Public License along with this
23 program. If not, see <https://www.gnu.org/licenses/>. */
24
25 #include <config.h>
26
27 /* Specification. */
28 #include "uniname.h"
29
30 #include <assert.h>
31 #include <stdint.h>
32 #include <stdio.h>
33 #include <string.h>
34
35 #include "attribute.h"
36
/* Number of elements of the array A.  A must be a true array object, not a
   pointer.  The argument is fully parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
38
39
40 /* Table of Unicode character names, derived from UnicodeData.txt.
41 This table is generated in a way to minimize the memory footprint:
42 1. its compiled size is small (less than 350 KB),
43 2. it resides entirely in the text or read-only data segment of the
44 executable or shared library: the table contains only immediate
45 integers, no pointers, and the functions don't do heap allocation.
46 */
47 #include "uninames.h"
48 /* It contains:
49 static const char unicode_name_words[36303] = ...;
50 #define UNICODE_CHARNAME_NUM_WORDS 6260
51 static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[26] = ...;
52 #define UNICODE_CHARNAME_WORD_HANGUL 3902
53 #define UNICODE_CHARNAME_WORD_SYLLABLE 4978
54 #define UNICODE_CHARNAME_WORD_CJK 417
55 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107
56 static const uint16_t unicode_names[68940] = ...;
57 static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...;
58 static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;
59 #define UNICODE_CHARNAME_MAX_LENGTH 83
60 #define UNICODE_CHARNAME_MAX_WORDS 13
61 static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;
62 */
63
64 /* Returns the word with a given index. */
65 static const char *
66 unicode_name_word (unsigned int index, unsigned int *lengthp)
67 {
68 unsigned int i1;
69 unsigned int i2;
70
71 assert (index < UNICODE_CHARNAME_NUM_WORDS);
72
73 /* Binary search for i with
74 unicode_name_by_length[i].ind_offset <= index
75 and
76 index < unicode_name_by_length[i+1].ind_offset
77 */
78
79 i1 = 0;
80 i2 = SIZEOF (unicode_name_by_length) - 1;
81 while (i2 - i1 > 1)
82 {
83 unsigned int i = (i1 + i2) >> 1;
84 if (unicode_name_by_length[i].ind_offset <= index)
85 i1 = i;
86 else
87 i2 = i;
88 }
89 unsigned int i = i1;
90 assert (unicode_name_by_length[i].ind_offset <= index
91 && index < unicode_name_by_length[i+1].ind_offset);
92 *lengthp = i;
93 return &unicode_name_words[unicode_name_by_length[i].extra_offset
94 + (index-unicode_name_by_length[i].ind_offset)*i];
95 }
96
97 /* Looks up the index of a word. */
98 static int
99 unicode_name_word_lookup (const char *word, size_t length)
100 {
101 if (length > 0 && length < SIZEOF (unicode_name_by_length) - 1)
102 {
103 /* Binary search among the words of given length. */
104 unsigned int extra_offset = unicode_name_by_length[length].extra_offset;
105 unsigned int i0 = unicode_name_by_length[length].ind_offset;
106 unsigned int i1 = i0;
107 unsigned int i2 = unicode_name_by_length[length+1].ind_offset;
108 while (i2 - i1 > 0)
109 {
110 unsigned int i = (i1 + i2) >> 1;
111 const char *p = &unicode_name_words[extra_offset + (i-i0)*length];
112 const char *w = word;
113 unsigned int n = length;
114 for (;;)
115 {
116 if (*p < *w)
117 {
118 if (i1 == i)
119 return -1;
120 /* Note here: i1 < i < i2. */
121 i1 = i;
122 break;
123 }
124 if (*p > *w)
125 {
126 /* Note here: i1 <= i < i2. */
127 i2 = i;
128 break;
129 }
130 p++; w++; n--;
131 if (n == 0)
132 return i;
133 }
134 }
135 }
136 return -1;
137 }
138
139 #define UNINAME_INVALID_INDEX UINT16_MAX
140
141 /* Looks up the internal index of a Unicode character. */
142 static uint16_t
143 unicode_code_to_index (ucs4_t c)
144 {
145 /* Binary search in unicode_ranges. */
146 unsigned int i1 = 0;
147 unsigned int i2 = SIZEOF (unicode_ranges);
148
149 for (;;)
150 {
151 unsigned int i = (i1 + i2) >> 1;
152 ucs4_t start_code =
153 unicode_ranges[i].index + unicode_ranges[i].gap;
154 ucs4_t end_code =
155 start_code + unicode_ranges[i].length - 1;
156
157 if (start_code <= c && c <= end_code)
158 return c - unicode_ranges[i].gap;
159
160 if (end_code < c)
161 {
162 if (i1 == i)
163 break;
164 /* Note here: i1 < i < i2. */
165 i1 = i;
166 }
167 else if (c < start_code)
168 {
169 if (i2 == i)
170 break;
171 /* Note here: i1 <= i < i2. */
172 i2 = i;
173 }
174 }
175 return UNINAME_INVALID_INDEX;
176 }
177
178 /* Looks up the codepoint of a Unicode character, from the given
179 internal index. */
180 static ucs4_t
181 unicode_index_to_code (uint16_t index)
182 {
183 /* Binary search in unicode_ranges. */
184 unsigned int i1 = 0;
185 unsigned int i2 = SIZEOF (unicode_ranges);
186
187 for (;;)
188 {
189 unsigned int i = (i1 + i2) >> 1;
190 uint16_t start_index = unicode_ranges[i].index;
191 uint16_t end_index = start_index + unicode_ranges[i].length - 1;
192
193 if (start_index <= index && index <= end_index)
194 return index + unicode_ranges[i].gap;
195
196 if (end_index < index)
197 {
198 if (i1 == i)
199 break;
200 /* Note here: i1 < i < i2. */
201 i1 = i;
202 }
203 else if (index < start_index)
204 {
205 if (i2 == i)
206 break;
207 /* Note here: i1 <= i < i2. */
208 i2 = i;
209 }
210 }
211 return UNINAME_INVALID;
212 }
213
214
/* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
   sections 3.11 and 4.4.  */

/* Short names of the 19 initial jamo, in the order used by the Hangul
   syllable decomposition.  Entry 11 is empty: it contributes nothing to the
   syllable name.  */
static const char jamo_initial_short_name[19][3] =
{
  /*  0 ..  4 */ "G",  "GG", "N",  "D",  "DD",
  /*  5 ..  9 */ "R",  "M",  "B",  "BB", "S",
  /* 10 .. 14 */ "SS", "",   "J",  "JJ", "C",
  /* 15 .. 18 */ "K",  "T",  "P",  "H"
};
/* Short names of the 21 medial jamo (vowels), in the order used by the
   Hangul syllable decomposition.  Each name has 1 to 3 characters.  */
static const char jamo_medial_short_name[21][4] =
{
  /*  0 ..  4 */ "A",   "AE",  "YA",  "YAE", "EO",
  /*  5 ..  9 */ "E",   "YEO", "YE",  "O",   "WA",
  /* 10 .. 14 */ "WAE", "OE",  "YO",  "U",   "WEO",
  /* 15 .. 19 */ "WE",  "WI",  "YU",  "EU",  "YI",
  /* 20       */ "I"
};
/* Short names of the 28 final jamo, in the order used by the Hangul syllable
   decomposition.  Entry 0 is empty: it stands for "no final consonant".
   Entry 5 is "NJ", the short name that the Unicode Character Database
   (Jamo.txt) assigns to U+11AC; for example U+AC05 is named
   "HANGUL SYLLABLE GANJ".  */
static const char jamo_final_short_name[28][3] =
{
  /*  0 ..  4 */ "",   "G",  "GG", "GS", "N",
  /*  5 ..  9 */ "NJ", "NH", "D",  "L",  "LG",
  /* 10 .. 14 */ "LM", "LB", "LS", "LT", "LP",
  /* 15 .. 19 */ "LH", "M",  "B",  "BS", "S",
  /* 20 .. 24 */ "SS", "NG", "J",  "C",  "K",
  /* 25 .. 27 */ "T",  "P",  "H"
};
232
233 /* Looks up the name of a Unicode character, in uppercase ASCII.
234 Returns the filled buf, or NULL if the character does not have a name. */
235 char *
236 unicode_character_name (ucs4_t c, char *buf)
237 {
238 if (c >= 0xAC00 && c <= 0xD7A3)
239 {
240 /* Special case for Hangul syllables. Keeps the tables small. */
241 char *ptr;
242 unsigned int tmp;
243 unsigned int index1;
244 unsigned int index2;
245 unsigned int index3;
246 const char *q;
247
248 /* buf needs to have at least 16 + 7 + 1 bytes here. */
249 memcpy (buf, "HANGUL SYLLABLE ", 16);
250 ptr = buf + 16;
251
252 tmp = c - 0xAC00;
253 index3 = tmp % 28; tmp = tmp / 28;
254 index2 = tmp % 21; tmp = tmp / 21;
255 index1 = tmp;
256
257 q = jamo_initial_short_name[index1];
258 while (*q != '\0')
259 *ptr++ = *q++;
260 q = jamo_medial_short_name[index2];
261 while (*q != '\0')
262 *ptr++ = *q++;
263 q = jamo_final_short_name[index3];
264 while (*q != '\0')
265 *ptr++ = *q++;
266 *ptr = '\0';
267 return buf;
268 }
269 else if ((c >= 0xF900 && c <= 0xFA2D) || (c >= 0xFA30 && c <= 0xFA6A)
270 || (c >= 0xFA70 && c <= 0xFAD9) || (c >= 0x2F800 && c <= 0x2FA1D))
271 {
272 /* Special case for CJK compatibility ideographs. Keeps the tables
273 small. */
274 char *ptr;
275 int i;
276
277 /* buf needs to have at least 28 + 5 + 1 bytes here. */
278 memcpy (buf, "CJK COMPATIBILITY IDEOGRAPH-", 28);
279 ptr = buf + 28;
280
281 for (i = (c < 0x10000 ? 12 : 16); i >= 0; i -= 4)
282 {
283 unsigned int x = (c >> i) & 0xf;
284 *ptr++ = (x < 10 ? '0' : 'A' - 10) + x;
285 }
286 *ptr = '\0';
287 return buf;
288 }
289 else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF))
290 {
291 /* Special case for variation selectors. Keeps the tables
292 small. */
293
294 /* buf needs to have at least 19 + 3 + 1 bytes here. */
295 sprintf (buf, "VARIATION SELECTOR-%u",
296 c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17);
297 return buf;
298 }
299 else
300 {
301 uint16_t index = unicode_code_to_index (c);
302 const uint16_t *words = NULL;
303
304 if (index != UNINAME_INVALID_INDEX)
305 {
306 /* Binary search in unicode_code_to_name. */
307 unsigned int i1 = 0;
308 unsigned int i2 = SIZEOF (unicode_index_to_name);
309 for (;;)
310 {
311 unsigned int i = (i1 + i2) >> 1;
312 if (unicode_index_to_name[i].index == index)
313 {
314 words = &unicode_names[unicode_index_to_name[i].name];
315 break;
316 }
317 else if (unicode_index_to_name[i].index < index)
318 {
319 if (i1 == i)
320 {
321 words = NULL;
322 break;
323 }
324 /* Note here: i1 < i < i2. */
325 i1 = i;
326 }
327 else if (unicode_index_to_name[i].index > index)
328 {
329 if (i2 == i)
330 {
331 words = NULL;
332 break;
333 }
334 /* Note here: i1 <= i < i2. */
335 i2 = i;
336 }
337 }
338 }
339 if (words != NULL)
340 {
341 /* Found it in unicode_index_to_name. Now concatenate the words. */
342 /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH + 1
343 bytes. */
344 char *ptr = buf;
345 for (;;)
346 {
347 unsigned int wordlen;
348 const char *word = unicode_name_word (*words>>1, &wordlen);
349 do
350 *ptr++ = *word++;
351 while (--wordlen > 0);
352 if ((*words & 1) == 0)
353 break;
354 *ptr++ = ' ';
355 words++;
356 }
357 *ptr = '\0';
358 return buf;
359 }
360 return NULL;
361 }
362 }
363
364 /* Looks up the Unicode character with a given name, in upper- or lowercase
365 ASCII. Returns the character if found, or UNINAME_INVALID if not found. */
366 ucs4_t
367 unicode_name_character (const char *name)
368 {
369 size_t len = strlen (name);
370 if (len > 1 && len <= UNICODE_CHARNAME_MAX_LENGTH)
371 {
372 /* Test for "word1 word2 ..." syntax. */
373 char buf[UNICODE_CHARNAME_MAX_LENGTH];
374 char *ptr = buf;
375 for (;;)
376 {
377 char c = *name++;
378 if (!(c >= ' ' && c <= '~'))
379 break;
380 *ptr++ = (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
381 if (--len == 0)
382 goto filled_buf;
383 }
384 if (false)
385 filled_buf:
386 {
387 {
388 /* Special case for variation selector aliases. Keeps the
389 tables small. */
390 const char *p1 = buf;
391 if (ptr >= buf + 3 && *p1++ == 'V')
392 {
393 if (*p1++ == 'S')
394 {
395 if (*p1 != '0')
396 {
397 unsigned int c = 0;
398 for (;;)
399 {
400 if (*p1 >= '0' && *p1 <= '9')
401 c += (*p1 - '0');
402 p1++;
403 if (p1 == ptr)
404 {
405 if (c >= 1 && c <= 16)
406 return c - 1 + 0xFE00;
407 else if (c >= 17 && c <= 256)
408 return c - 17 + 0xE0100;
409 else
410 break;
411 }
412 c = c * 10;
413 }
414 }
415 }
416 }
417 }
418 {
419 /* Convert the constituents to uint16_t words. */
420 uint16_t words[UNICODE_CHARNAME_MAX_WORDS];
421 uint16_t *wordptr = words;
422 {
423 const char *p1 = buf;
424 for (;;)
425 {
426 {
427 int word;
428 const char *p2 = p1;
429 while (p2 < ptr && *p2 != ' ')
430 p2++;
431 word = unicode_name_word_lookup (p1, p2 - p1);
432 if (word < 0)
433 break;
434 if (wordptr == &words[UNICODE_CHARNAME_MAX_WORDS])
435 break;
436 *wordptr++ = word;
437 if (p2 == ptr)
438 goto filled_words;
439 p1 = p2 + 1;
440 }
441 /* Special case for Hangul syllables. Keeps the tables small. */
442 if (wordptr == &words[2]
443 && words[0] == UNICODE_CHARNAME_WORD_HANGUL
444 && words[1] == UNICODE_CHARNAME_WORD_SYLLABLE)
445 {
446 /* Split the last word [p1..ptr) into three parts:
447 1) [BCDGHJKMNPRST]
448 2) [AEIOUWY]
449 3) [BCDGHIJKLMNPST]
450 */
451 const char *p2;
452 const char *p3;
453 const char *p4;
454
455 p2 = p1;
456 while (p2 < ptr
457 && (*p2 == 'B' || *p2 == 'C' || *p2 == 'D'
458 || *p2 == 'G' || *p2 == 'H' || *p2 == 'J'
459 || *p2 == 'K' || *p2 == 'M' || *p2 == 'N'
460 || *p2 == 'P' || *p2 == 'R' || *p2 == 'S'
461 || *p2 == 'T'))
462 p2++;
463 p3 = p2;
464 while (p3 < ptr
465 && (*p3 == 'A' || *p3 == 'E' || *p3 == 'I'
466 || *p3 == 'O' || *p3 == 'U' || *p3 == 'W'
467 || *p3 == 'Y'))
468 p3++;
469 p4 = p3;
470 while (p4 < ptr
471 && (*p4 == 'B' || *p4 == 'C' || *p4 == 'D'
472 || *p4 == 'G' || *p4 == 'H' || *p4 == 'I'
473 || *p4 == 'J' || *p4 == 'K' || *p4 == 'L'
474 || *p4 == 'M' || *p4 == 'N' || *p4 == 'P'
475 || *p4 == 'S' || *p4 == 'T'))
476 p4++;
477 if (p4 == ptr)
478 {
479 size_t n1 = p2 - p1;
480 size_t n2 = p3 - p2;
481 size_t n3 = p4 - p3;
482
483 if (n1 <= 2 && (n2 >= 1 && n2 <= 3) && n3 <= 2)
484 {
485 unsigned int index1;
486
487 for (index1 = 0; index1 < 19; index1++)
488 if (memcmp (jamo_initial_short_name[index1], p1, n1) == 0
489 && jamo_initial_short_name[index1][n1] == '\0')
490 {
491 unsigned int index2;
492
493 for (index2 = 0; index2 < 21; index2++)
494 if (memcmp (jamo_medial_short_name[index2], p2, n2) == 0
495 && jamo_medial_short_name[index2][n2] == '\0')
496 {
497 unsigned int index3;
498
499 for (index3 = 0; index3 < 28; index3++)
500 if (memcmp (jamo_final_short_name[index3], p3, n3) == 0
501 && jamo_final_short_name[index3][n3] == '\0')
502 {
503 return 0xAC00 + (index1 * 21 + index2) * 28 + index3;
504 }
505 break;
506 }
507 break;
508 }
509 }
510 }
511 }
512 /* Special case for CJK compatibility ideographs. Keeps the
513 tables small. */
514 if (wordptr == &words[2]
515 && words[0] == UNICODE_CHARNAME_WORD_CJK
516 && words[1] == UNICODE_CHARNAME_WORD_COMPATIBILITY
517 && p1 + 14 <= ptr
518 && p1 + 15 >= ptr
519 && memcmp (p1, "IDEOGRAPH-", 10) == 0)
520 {
521 const char *p2 = p1 + 10;
522
523 if (*p2 != '0')
524 {
525 unsigned int c = 0;
526
527 for (;;)
528 {
529 if (*p2 >= '0' && *p2 <= '9')
530 c += (*p2 - '0');
531 else if (*p2 >= 'A' && *p2 <= 'F')
532 c += (*p2 - 'A' + 10);
533 else
534 break;
535 p2++;
536 if (p2 == ptr)
537 {
538 if ((c >= 0xF900 && c <= 0xFA2D)
539 || (c >= 0xFA30 && c <= 0xFA6A)
540 || (c >= 0xFA70 && c <= 0xFAD9)
541 || (c >= 0x2F800 && c <= 0x2FA1D))
542 return c;
543 else
544 break;
545 }
546 c = c << 4;
547 }
548 }
549 }
550 /* Special case for variation selectors. Keeps the
551 tables small. */
552 if (wordptr == &words[1]
553 && words[0] == UNICODE_CHARNAME_WORD_VARIATION
554 && p1 + 10 <= ptr
555 && p1 + 12 >= ptr
556 && memcmp (p1, "SELECTOR-", 9) == 0)
557 {
558 const char *p2 = p1 + 9;
559
560 if (*p2 != '0')
561 {
562 unsigned int c = 0;
563
564 for (;;)
565 {
566 if (*p2 >= '0' && *p2 <= '9')
567 c += (*p2 - '0');
568 p2++;
569 if (p2 == ptr)
570 {
571 if (c >= 1 && c <= 16)
572 return c - 1 + 0xFE00;
573 else if (c >= 17 && c <= 256)
574 return c - 17 + 0xE0100;
575 else
576 break;
577 }
578 c = c * 10;
579 }
580 }
581 }
582 }
583 }
584 if (false)
585 filled_words:
586 {
587 /* Multiply by 2, to simplify later comparisons. */
588 size_t words_length = wordptr - words;
589 {
590 size_t i = words_length - 1;
591 words[i] = 2 * words[i];
592 for (; i > 0; )
593 {
594 --i;
595 words[i] = 2 * words[i] + 1;
596 }
597 }
598 /* Binary search in unicode_name_to_index. */
599 {
600 unsigned int i1 = 0;
601 unsigned int i2 = SIZEOF (unicode_name_to_index);
602 for (;;)
603 {
604 unsigned int i = (i1 + i2) >> 1;
605 const uint16_t *w = words;
606 const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];
607 size_t n = words_length;
608 for (;;)
609 {
610 if (*p < *w)
611 {
612 if (i1 == i)
613 goto name_not_found;
614 /* Note here: i1 < i < i2. */
615 i1 = i;
616 break;
617 }
618 else if (*p > *w)
619 {
620 if (i2 == i)
621 goto name_not_found;
622 /* Note here: i1 <= i < i2. */
623 i2 = i;
624 break;
625 }
626 p++; w++; n--;
627 if (n == 0)
628 return unicode_index_to_code (unicode_name_to_index[i].index);
629 }
630 }
631 }
632 name_not_found: ;
633 }
634 }
635 }
636 }
637 return UNINAME_INVALID;
638 }