1 /* Line breaking of UTF-8 strings.
2 Copyright (C) 2001-2003, 2006-2023 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2001.
4
5 This file is free software.
6 It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
7 You can redistribute it and/or modify it under either
8 - the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation, either version 3, or (at your
10 option) any later version, or
11 - the terms of the GNU General Public License as published by the
12 Free Software Foundation; either version 2, or (at your option)
13 any later version, or
14 - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
15
16 This file is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 Lesser General Public License and the GNU General Public License
20 for more details.
21
22 You should have received a copy of the GNU Lesser General Public
23 License and of the GNU General Public License along with this
24 program. If not, see <https://www.gnu.org/licenses/>. */
25
26 #include <config.h>
27
28 /* Specification. */
29 #include "unilbrk.h"
30 #include "unilbrk/internal.h"
31
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "unilbrk/lbrktables.h"
36 #include "uniwidth/cjk.h"
37 #include "unistr.h"
38
39 /* This file implements
40 Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>. */
41
42 void
43 u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
44 int cr, char *p)
45 {
46 if (n > 0)
47 {
48 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
49 const uint8_t *s_end = s + n;
50 int prev_prop = LBP_BK; /* line break property of last character */
51 int last_prop = LBP_BK; /* line break property of last non-space character */
52 char *seen_space = NULL; /* Was a space seen after the last non-space character? */
53
54 /* Don't break inside multibyte characters. */
55 memset (p, UC_BREAK_PROHIBITED, n);
56
57 /* Number of consecutive regional indicator (RI) characters seen
58 immediately before the current point. */
59 size_t ri_count = 0;
60
61 do
62 {
63 ucs4_t uc;
64 int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
65 int prop = unilbrkprop_lookup (uc);
66
67 if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
68 {
69 /* (LB4,LB5,LB6) Mandatory break. */
70 *p = UC_BREAK_MANDATORY;
71 /* cr is either LBP_CR or -1. In the first case, recognize
72 a CR-LF sequence. */
73 if (prev_prop == cr && prop == LBP_LF)
74 p[-1] = UC_BREAK_CR_BEFORE_LF;
75 prev_prop = prop;
76 last_prop = LBP_BK;
77 seen_space = NULL;
78 }
79 else
80 {
81 /* Resolve property values whose behaviour is not fixed. */
82 switch (prop)
83 {
84 case LBP_AI:
85 /* Resolve ambiguous. */
86 prop = LBP_AI_REPLACEMENT;
87 break;
88 case LBP_CB:
89 /* This is arbitrary. */
90 prop = LBP_ID1;
91 break;
92 case LBP_SA:
93 /* We don't handle complex scripts yet.
94 Treat LBP_SA like LBP_XX. */
95 case LBP_XX:
96 /* This is arbitrary. */
97 prop = LBP_AL;
98 break;
99 }
100
101 /* Deal with spaces and combining characters. */
102 if (prop == LBP_SP)
103 {
104 /* (LB7) Don't break just before a space. */
105 *p = UC_BREAK_PROHIBITED;
106 seen_space = p;
107 }
108 else if (prop == LBP_ZW)
109 {
110 /* (LB7) Don't break just before a zero-width space. */
111 *p = UC_BREAK_PROHIBITED;
112 last_prop = LBP_ZW;
113 seen_space = NULL;
114 }
115 else if (prop == LBP_CM || prop == LBP_ZWJ)
116 {
117 /* (LB9) Don't break just before a combining character or
118 zero-width joiner, except immediately after a mandatory
119 break character, space, or zero-width space. */
120 if (last_prop == LBP_BK)
121 {
122 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
123 *p = UC_BREAK_PROHIBITED;
124 /* (LB10) Treat CM or ZWJ as AL. */
125 last_prop = LBP_AL;
126 seen_space = NULL;
127 }
128 else if (last_prop == LBP_ZW || seen_space != NULL)
129 {
130 /* (LB8) Break after zero-width space. */
131 /* (LB18) Break after spaces.
132 We do *not* implement the "legacy support for space
133 character as base for combining marks" because now the
134 NBSP CM sequence is recommended instead of SP CM. */
135 *p = UC_BREAK_POSSIBLE;
136 /* (LB10) Treat CM or ZWJ as AL. */
137 last_prop = LBP_AL;
138 seen_space = NULL;
139 }
140 else
141 {
142 /* Treat X CM as if it were X. */
143 *p = UC_BREAK_PROHIBITED;
144 }
145 }
146 else
147 {
148 /* prop must be usable as an index for table 7.3 of UTR #14. */
149 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
150 abort ();
151
152 if (last_prop == LBP_BK)
153 {
154 /* (LB4,LB5,LB6) Don't break at the beginning of a line. */
155 *p = UC_BREAK_PROHIBITED;
156 }
157 else if (last_prop == LBP_ZW)
158 {
159 /* (LB8) Break after zero-width space. */
160 *p = UC_BREAK_POSSIBLE;
161 }
162 else if (prev_prop == LBP_ZWJ)
163 {
164 /* (LB8a) Don't break right after a zero-width joiner. */
165 *p = UC_BREAK_PROHIBITED;
166 }
167 else if (last_prop == LBP_RI && prop == LBP_RI)
168 {
169 /* (LB30a) Break between two regional indicator symbols
170 if and only if there are an even number of regional
171 indicators preceding the position of the break. */
172 *p = (seen_space != NULL || (ri_count % 2) == 0
173 ? UC_BREAK_POSSIBLE
174 : UC_BREAK_PROHIBITED);
175 }
176 else if (prev_prop == LBP_HL_BA)
177 {
178 /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */
179 *p = UC_BREAK_PROHIBITED;
180 }
181 else
182 {
183 switch (unilbrk_table [last_prop] [prop])
184 {
185 case D:
186 *p = UC_BREAK_POSSIBLE;
187 break;
188 case I:
189 *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
190 break;
191 case P:
192 *p = UC_BREAK_PROHIBITED;
193 break;
194 default:
195 abort ();
196 }
197 }
198 last_prop = prop;
199 seen_space = NULL;
200 }
201
202 prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
203 ? LBP_HL_BA
204 : prop);
205 }
206
207 if (prop == LBP_RI)
208 ri_count++;
209 else
210 ri_count = 0;
211
212 s += count;
213 p += count;
214 }
215 while (s < s_end);
216 }
217 }
218
219 #if defined IN_LIBUNISTRING
220 /* For backward compatibility with older versions of libunistring. */
221
222 # undef u8_possible_linebreaks
223
/* Older entry point: computes the line break opportunities of the UTF-8
   string S[0..N-1] into P[0..N-1] by delegating to
   u8_possible_linebreaks_loop, without the special marking of CR-LF
   sequences.  */
void
u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
                        char *p)
{
  /* The loop documents its CR argument as "either LBP_CR or -1"; -1 is the
     value that leaves CR-LF recognition switched off.  */
  const int cr = -1;

  u8_possible_linebreaks_loop (s, n, encoding, cr, p);
}
230
231 #endif
232
233 void
234 u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
235 char *p)
236 {
237 u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
238 }
239
240
241 #ifdef TEST
242
243 #include <stdio.h>
244 #include <string.h>
245
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.

   STREAM is read from its current position up to end of file.  The result
   is allocated with realloc (); the caller owns it and may free () it.
   If reading fails or memory runs out, a message is written to stderr and
   the process exits with status 1, so the function never returns NULL.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;   /* bytes allocated for BUF */
  size_t size = 0;    /* bytes of BUF filled so far */
  char *shrunk;

  for (;;)
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  */
      if (alloc - size < BUFSIZE)
        {
          /* Grow by half, but at least by what the next chunk needs.  */
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          if (new_alloc < size + BUFSIZE)
            new_alloc = size + BUFSIZE;
          /* Go through a temporary so that BUF stays valid until the
             reallocation is known to have succeeded.  */
          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;
      if (count < BUFSIZE)
        {
          /* fread only returns a short count at end of file or on a read
             error; tell the two apart right away.  */
          if (ferror (stream))
            {
              perror ("fread");
              exit (1);
            }
          break;
        }
    }

  /* Give back the unused tail.  At this point alloc >= size + BUFSIZE, so
     the existing buffer already has room for the terminator and can be
     kept if shrinking fails.  */
  shrunk = realloc (buf, size + 1);
  if (shrunk != NULL)
    buf = shrunk;
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
293
294 int
295 main (int argc, char * argv[])
296 {
297 if (argc == 1)
298 {
299 /* Display all the break opportunities in the input string. */
300 char *input = read_file (stdin);
301 int length = strlen (input);
302 char *breaks = malloc (length);
303 int i;
304
305 u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
306
307 for (i = 0; i < length; i++)
308 {
309 switch (breaks[i])
310 {
311 case UC_BREAK_POSSIBLE:
312 /* U+2027 in UTF-8 encoding */
313 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
314 break;
315 case UC_BREAK_MANDATORY:
316 /* U+21B2 (or U+21B5) in UTF-8 encoding */
317 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
318 break;
319 case UC_BREAK_CR_BEFORE_LF:
320 /* U+21E4 in UTF-8 encoding */
321 putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
322 break;
323 case UC_BREAK_PROHIBITED:
324 break;
325 default:
326 abort ();
327 }
328 putc (input[i], stdout);
329 }
330
331 free (breaks);
332
333 return 0;
334 }
335 else
336 return 1;
337 }
338
339 #endif /* TEST */