1 /* Line breaking auxiliary tables.
2 Copyright (C) 2001-2003, 2006-2023 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation, either version 3, or (at your
10 option) any later version, or
11 - the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option)
13 any later version, or
14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License and the GNU General Public License
20 for more details.
21
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
25
26 #include "unitypes.h"
27
28 /* Line breaking classification. */
29
enum
{
  /* Values < 33, listed in increasing numeric order.  There are 33 of
     them (0..32), which matches the dimensions of unilbrk_table.  */
  LBP_WJ  =  0, /* word joiner */
  LBP_GL  =  1, /* non-breaking (glue) */
  LBP_B2  =  2, /* break opportunity before and after */
  LBP_BA  =  3, /* break opportunity after */
  LBP_BB  =  4, /* break opportunity before */
  LBP_HY  =  5, /* hyphen */
  LBP_CL  =  6, /* closing punctuation */
  LBP_CP1 =  7, /* closing parenthesis, non-EastAsian character */
  LBP_CP2 =  8, /* closing parenthesis, EastAsian character */
  LBP_EX  =  9, /* exclamation/interrogation */
  LBP_IN  = 10, /* inseparable */
  LBP_NS  = 11, /* non starter */
  LBP_OP1 = 12, /* opening punctuation, non-EastAsian character */
  LBP_OP2 = 13, /* opening punctuation, EastAsian character */
  LBP_QU  = 14, /* ambiguous quotation */
  LBP_IS  = 15, /* infix separator (numeric) */
  LBP_NU  = 16, /* numeric */
  LBP_PO  = 17, /* postfix (numeric) */
  LBP_PR  = 18, /* prefix (numeric) */
  LBP_SY  = 19, /* symbols allowing breaks */
  LBP_AL  = 20, /* ordinary alphabetic and symbol characters */
  LBP_H2  = 21, /* Hangul LV syllable */
  LBP_H3  = 22, /* Hangul LVT syllable */
  LBP_ID1 = 23, /* ideographic */
  LBP_ID2 = 24, /* ideographic and potential future emoji */
  LBP_JL  = 25, /* Hangul L Jamo */
  LBP_JV  = 26, /* Hangul V Jamo */
  LBP_JT  = 27, /* Hangul T Jamo */
  LBP_HL  = 28, /* Hebrew letter */
  LBP_RI  = 29, /* regional indicator */
  LBP_ZWJ = 30, /* zero width joiner */
  LBP_EB  = 31, /* emoji base */
  LBP_EM  = 32, /* emoji modifier */

  /* Values >= 33 are resolved at run time.  */
  LBP_BK  = 33, /* mandatory break */
  LBP_CR  = 34, /* carriage return */
  LBP_LF  = 35, /* line feed */
  LBP_CM  = 36, /* attached characters and combining marks */
  LBP_ZW  = 37, /* zero width space */
  LBP_SP  = 38, /* space */
  LBP_CB  = 39, /* contingent break opportunity */
  LBP_AI  = 40, /* ambiguous (alphabetic or ideograph) */
  LBP_SA  = 41, /* complex context (South East Asian) */
  LBP_XX  = 42, /* unknown */

  /* Classes that deliberately have no constant here:
       LBP_NL (next line)  - not used because it's equivalent to LBP_BK,
       LBP_SG (surrogates) - not used because they are not characters,
       LBP_CJ (conditional Japanese starter) - resolved to NS.  */

  /* Artificial values that exist only at runtime, not in the tables.  */
  LBP_HL_BA = 100
};
82
83 #include "lbrkprop1.h"
84
85 static inline unsigned char
86 unilbrkprop_lookup (ucs4_t uc)
87 {
88 unsigned int index1 = uc >> lbrkprop_header_0;
89 if (index1 < lbrkprop_header_1)
90 {
91 int lookup1 = unilbrkprop.level1[index1];
92 if (lookup1 >= 0)
93 {
94 unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
95 int lookup2 = unilbrkprop.level2[lookup1 + index2];
96 if (lookup2 >= 0)
97 {
98 unsigned int index3 = uc & lbrkprop_header_4;
99 return unilbrkprop.level3[lookup2 + index3];
100 }
101 }
102 }
103 return LBP_XX;
104 }
105
/* Table indexed by two line breaking classifications.
   The table itself is only declared here; its definition lives elsewhere.
   The three codes below name the kinds of break opportunity from table 7.3
   of UTR #14 (presumably these are the values stored in the table entries -
   confirm against the table's definition).  */
#define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */

/* Both dimensions are 33, so only the LBP_* values 0..32 are valid indices;
   the values >= 33 must not be used to index this table.  */
extern const unsigned char unilbrk_table[33][33];
112
113 /* We don't support line breaking of complex-context dependent characters
114 (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */