1 /* Align/Truncate a string in a given screen width
2 Copyright (C) 2009-2010 Free Software Foundation, Inc.
3
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU Lesser General Public License as published by
6 the Free Software Foundation, either version 2.1 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
17 /* Written by Pádraig Brady. */
18
19 #include <stdlib.h>
20 #include <string.h>
21 #include <stdio.h>
22 #include <stdbool.h>
23 #include <limits.h>
24 #include <ctype.h>
25
26 #include "c.h"
27 #include "mbsalign.h"
28 #include "strutils.h"
29 #include "widechar.h"
30
/*
 * Counts the number of screen cells occupied by the multibyte string @buf.
 * At most @bufsz bytes are examined and scanning also stops at the first
 * NUL byte, so @buf does not have to be NUL-terminated.
 *
 * Control characters and non-printable characters contribute no cells.
 * A control character directly followed by "[" ... "m" (for example the
 * colour sequence "\e[x;ym") is skipped as a whole. Note that this check
 * is made after any control character, not only after ESC.
 *
 * Returns: number of cells; 0 if @buf is NULL or @bufsz is 0.
 */
size_t mbs_nwidth(const char *buf, size_t bufsz)
{
	const char *p = buf;
	const char *end;		/* one past the last readable byte */
	size_t width = 0;

#ifdef HAVE_WIDECHAR
	mbstate_t st;
	memset(&st, 0, sizeof(st));
#endif
	if (!buf || !bufsz)
		return 0;
	end = buf + bufsz;

	/* test the bound first, only then dereference */
	while (p < end && *p) {
		if (iscntrl((unsigned char) *p)) {
			p++;

			/* try detect "\e[x;ym" and skip on success */
			if (p < end && *p == '[') {
				const char *e = p;

				/* never step past the last byte of the buffer */
				while (e + 1 < end && *e && *e != 'm')
					e++;
				if (*e == 'm')
					p = e + 1;
			}
			continue;
		}
#ifdef HAVE_WIDECHAR
		wchar_t wc;
		size_t left = (size_t) (end - p);
		size_t len = mbrtowc(&wc, p,
				left < MB_CUR_MAX ? left : MB_CUR_MAX, &st);

		if (len == 0)
			break;

		if (len == (size_t) -1 || len == (size_t) -2) {
			/*
			 * Invalid or incomplete sequence: wc has not been
			 * stored, so it must not be inspected. Skip one byte
			 * and restart from the initial conversion state.
			 */
			memset(&st, 0, sizeof(st));
			len = 1;
		} else if (iswprint(wc)) {
			int x = wcwidth(wc);

			if (x > 0)
				width += x;
		}
		p += len;
#else
		if (isprint((unsigned char) *p))
			width++;
		p++;
#endif
	}

	return width;
}
85
/*
 * Counts number of cells in the NUL-terminated multibyte string @s.
 * Thin wrapper around mbs_nwidth() that uses the whole string.
 *
 * Returns: number of cells, 0 for a NULL or empty string.
 */
size_t mbs_width(const char *s)
{
	return (s && *s) ? mbs_nwidth(s, strlen(s)) : 0;
}
92
/*
 * Counts the number of cells needed for the multibyte string @buf once it
 * has been encoded the way mbs_safe_encode() does it. At most @bufsz bytes
 * are examined and scanning stops at the first NUL byte.
 *
 * Every control or non-printable byte, and the backslash of a literal "\x"
 * pair, is counted as a four character \x?? hex sequence. Printable
 * characters count with their real cell width.
 *
 * Returns: number of cells. If @sz is not NULL it receives the number of
 * bytes of the encoded string (without the terminating NUL). Both are 0
 * if @buf is NULL or @bufsz is 0.
 */
size_t mbs_safe_nwidth(const char *buf, size_t bufsz, size_t *sz)
{
	const char *p = buf;
	/* one past the last readable byte; equals buf for an empty input */
	const char *end = (buf && bufsz) ? buf + bufsz : buf;
	size_t width = 0, bytes = 0;

#ifdef HAVE_WIDECHAR
	mbstate_t st;
	memset(&st, 0, sizeof(st));
#endif

	/* test the bound first, only then dereference */
	while (p && p < end && *p) {
		if ((p + 1 < end && *p == '\\' && *(p + 1) == 'x')
		    || iscntrl((unsigned char) *p)) {
			width += 4, bytes += 4;		/* *p encoded to \x?? */
			p++;
			continue;
		}
#ifdef HAVE_WIDECHAR
		wchar_t wc;
		size_t left = (size_t) (end - p);
		size_t len = mbrtowc(&wc, p,
				left < MB_CUR_MAX ? left : MB_CUR_MAX, &st);

		if (len == 0)
			break;

		if (len == (size_t) -1 || len == (size_t) -2) {
			/* broken sequence: handle one byte, restart the state */
			memset(&st, 0, sizeof(st));
			len = 1;
			if (isprint((unsigned char) *p))
				width += 1, bytes += 1;
			else
				width += 4, bytes += 4;

		} else if (!iswprint(wc)) {
			width += len * 4;	/* hex encode whole sequence */
			bytes += len * 4;
		} else {
			int x = wcwidth(wc);	/* number of cells */

			/* a negative result must not wrap the unsigned sum */
			if (x > 0)
				width += x;
			bytes += len;		/* number of bytes */
		}
		p += len;
#else
		if (!isprint((unsigned char) *p))
			width += 4, bytes += 4;		/* *p encoded to \x?? */
		else
			width++, bytes++;
		p++;
#endif
	}

	if (sz)
		*sz = bytes;
	return width;
}
157
158 size_t mbs_safe_width(const char *s)
159 {
160 if (!s || !*s)
161 return 0;
162 return mbs_safe_nwidth(s, strlen(s), NULL);
163 }
164
/*
 * Copy @s to @buf and replace control and non-printable chars with
 * \x?? hex sequence. The backslash of a literal "\x" pair is hex encoded
 * as well, so the output stays unambiguous.
 *
 * Bytes listed in @safechars (may be NULL) are copied verbatim and do not
 * add to the reported width.
 *
 * If @width is not NULL it receives the number of cells of the result.
 *
 * The @buf has to be big enough to store mbs_safe_encode_size(strlen(s)))
 * bytes.
 *
 * Returns: @buf, or NULL if @s is NULL or empty or @buf is NULL (in which
 * case @width is left untouched).
 */
char *mbs_safe_encode_to_buffer(const char *s, size_t *width, char *buf, const char *safechars)
{
	const char *p = s;
	char *r = buf;
	size_t cells = 0;

#ifdef HAVE_WIDECHAR
	mbstate_t st;
	memset(&st, 0, sizeof(st));
#endif
	if (!s || !*s || !buf)
		return NULL;

	while (*p) {
		if (safechars && strchr(safechars, *p)) {
			*r++ = *p++;
			continue;
		}

		if ((*p == '\\' && *(p + 1) == 'x')
		    || iscntrl((unsigned char) *p)) {
			/* four characters plus the NUL written by snprintf */
			snprintf(r, 5, "\\x%02x", (unsigned char) *p);
			r += 4;
			cells += 4;
			p++;
			continue;
		}
#ifdef HAVE_WIDECHAR
		wchar_t wc;
		size_t len = mbrtowc(&wc, p, MB_CUR_MAX, &st);

		if (len == 0)
			break;		/* end of string */

		if (len == (size_t) -1 || len == (size_t) -2) {
			/* restart from the initial state after a failure */
			memset(&st, 0, sizeof(st));
			len = 1;
			/*
			 * Not valid multibyte sequence -- maybe it's
			 * printable char according to the current locales.
			 */
			if (!isprint((unsigned char) *p)) {
				snprintf(r, 5, "\\x%02x", (unsigned char) *p);
				r += 4;
				cells += 4;
			} else {
				cells++;
				*r++ = *p;
			}
		} else if (!iswprint(wc)) {
			size_t i;

			/* hex encode every byte of the sequence */
			for (i = 0; i < len; i++) {
				snprintf(r, 5, "\\x%02x", (unsigned char) p[i]);
				r += 4;
				cells += 4;
			}
		} else {
			int x = wcwidth(wc);

			memcpy(r, p, len);
			r += len;
			/* a negative result must not wrap the unsigned sum */
			if (x > 0)
				cells += x;
		}
		p += len;
#else
		if (!isprint((unsigned char) *p)) {
			snprintf(r, 5, "\\x%02x", (unsigned char) *p);
			p++;
			r += 4;
			cells += 4;
		} else {
			*r++ = *p++;
			cells++;
		}
#endif
	}

	*r = '\0';
	if (width)
		*width = cells;
	return buf;
}
254
/*
 * Copy @s to @buf and replace broken multibyte sequences with \x?? hex
 * sequence. The backslash of a literal "\x" pair is hex encoded as well.
 * Valid characters, including control characters, are copied unchanged.
 *
 * If @width is not NULL it receives the number of cells of the result;
 * characters without a positive cell width add nothing.
 *
 * The @buf has to be big enough to store mbs_safe_encode_size(strlen(s)))
 * bytes.
 *
 * Returns: @buf, or NULL if @s is NULL or empty or @buf is NULL (in which
 * case @width is left untouched).
 */
char *mbs_invalid_encode_to_buffer(const char *s, size_t *width, char *buf)
{
	const char *p = s;
	char *r = buf;
	size_t cells = 0;

#ifdef HAVE_WIDECHAR
	mbstate_t st;
	memset(&st, 0, sizeof(st));
#endif
	if (!s || !*s || !buf)
		return NULL;

	while (*p) {
#ifdef HAVE_WIDECHAR
		wchar_t wc;
		size_t len = mbrtowc(&wc, p, MB_CUR_MAX, &st);
#else
		size_t len = 1;
#endif

		if (len == 0)
			break;		/* end of string */

		if (len == (size_t) -1 || len == (size_t) -2) {
#ifdef HAVE_WIDECHAR
			/* restart from the initial state after a failure */
			memset(&st, 0, sizeof(st));
#endif
			len = 1;
			/*
			 * Not valid multibyte sequence -- maybe it's
			 * printable char according to the current locales.
			 */
			if (!isprint((unsigned char) *p)) {
				/* four characters plus the NUL of snprintf */
				snprintf(r, 5, "\\x%02x", (unsigned char) *p);
				r += 4;
				cells += 4;
			} else {
				cells++;
				*r++ = *p;
			}
		} else if (*p == '\\' && *(p + 1) == 'x') {
			snprintf(r, 5, "\\x%02x", (unsigned char) *p);
			r += 4;
			cells += 4;
		} else {
			memcpy(r, p, len);
			r += len;
#ifdef HAVE_WIDECHAR
			{
				/*
				 * Control chars are copied as they are and
				 * wcwidth() reports -1 for them; adding that
				 * to an unsigned counter would wrap it.
				 */
				int x = wcwidth(wc);

				if (x > 0)
					cells += x;
			}
#else
			cells++;	/* one byte, one cell */
#endif
		}
		p += len;
	}

	*r = '\0';
	if (width)
		*width = cells;
	return buf;
}
317
/*
 * Returns the buffer size sufficient to hold the encoded form of a string
 * of @bytes bytes: four output bytes (one \x?? sequence) per input byte
 * plus one byte for the terminating NUL.
 */
size_t mbs_safe_encode_size(size_t bytes)
{
	const size_t per_byte = 4;	/* length of "\x??" */

	return per_byte * bytes + 1;
}
322
/*
 * Returns a newly allocated copy of @s where all control and non-printable
 * chars are replaced with \x?? hex sequence; @width is handed over to
 * mbs_safe_encode_to_buffer().
 *
 * Returns NULL if @s is NULL or empty, or if the allocation or the
 * encoding fails. The caller has to free() the result.
 */
char *mbs_safe_encode(const char *s, size_t *width)
{
	char *out, *enc = NULL;

	if (!s || !*s)
		return NULL;

	out = malloc(mbs_safe_encode_size(strlen(s)));
	if (out)
		enc = mbs_safe_encode_to_buffer(s, width, out, NULL);

	/* nothing encoded: do not leak the buffer */
	if (!enc)
		free(out);
	return enc;
}
341
/*
 * Returns a newly allocated copy of @s where all broken multibyte
 * sequences are replaced with \x?? hex sequence; @width is handed over to
 * mbs_invalid_encode_to_buffer().
 *
 * Returns NULL if @s is NULL or empty, or if the allocation or the
 * encoding fails. The caller has to free() the result.
 */
char *mbs_invalid_encode(const char *s, size_t *width)
{
	char *out, *enc = NULL;

	if (!s || !*s)
		return NULL;

	out = malloc(mbs_safe_encode_size(strlen(s)));
	if (out)
		enc = mbs_invalid_encode_to_buffer(s, width, out);

	/* nothing encoded: do not leak the buffer */
	if (!enc)
		free(out);
	return enc;
}
360
361 #ifdef HAVE_WIDECHAR
362
/* Replace, in place, every wide character of the NUL-terminated string
 * WCHARS that iswprint() rejects with U+FFFD (replacement character).
 * Returns true if at least one character has been replaced. */

static bool
wc_ensure_printable (wchar_t *wchars)
{
  bool replaced = false;

  for (wchar_t *cur = wchars; *cur != L'\0'; cur++)
    {
      if (iswprint ((wint_t) *cur))
        continue;

      *cur = 0xFFFD; /* L'\uFFFD' (replacement char) */
      replaced = true;
    }

  return replaced;
}
379
/* Cut the wide string WC, in place, so that it occupies at most WIDTH
 * screen cells. A character for which wcwidth() reports -1 is replaced
 * by U+FFFD and counted as one cell.
 * Returns the number of cells the shortened string uses. */

static size_t
wc_truncate (wchar_t *wc, size_t width)
{
  size_t used = 0;

  for (; *wc; wc++)
    {
      int w = wcwidth (*wc);

      if (w == -1) /* non printable */
        {
          *wc = 0xFFFD; /* L'\uFFFD' (replacement char) */
          w = 1;
        }

      /* stop in front of the first character that no longer fits */
      if (used + w > width)
        break;

      used += w;
    }

  *wc = L'\0';
  return used;
}
406
/* Sum up the cell widths of at most N wide characters of S, stopping at
 * the terminating NUL.
 * Returns the total, or -1 if wcwidth() reports -1 for a character or
 * the total would exceed INT_MAX. */

static int
rpl_wcswidth (const wchar_t *s, size_t n)
{
  int total = 0;

  for (size_t i = 0; i < n && s[i] != L'\0'; i++)
    {
      int w = wcwidth (s[i]);

      if (w == -1) /* non printable */
        return -1;
      if (total > INT_MAX - w) /* overflow */
        return -1;

      total += w;
    }

  return total;
}
424 #endif /* HAVE_WIDECHAR */
425
/* Truncate the multi-byte string @str, in place, to at most *@width cells.
 *
 * Returns the number of bytes of the new string and stores the number of
 * cells it occupies in @width. Without wide-char support every byte is
 * treated as one cell.
 *
 * With wide-char support:
 *  - if @str contains an invalid multi-byte sequence, or memory cannot be
 *    allocated, the string and @width are left untouched and the byte
 *    length of the unmodified string is returned;
 *  - if the truncated string cannot be converted back to multi-byte form,
 *    (size_t) -1 is returned.
 */
size_t
mbs_truncate(char *str, size_t *width)
{
	size_t bytes = strlen(str);
#ifdef HAVE_WIDECHAR
	size_t nchars = mbstowcs(NULL, str, 0);
	wchar_t *wcs;

	if (nchars == (size_t) -1)
		return bytes;		/* invalid sequence, keep as is */

	/* zeroed, so the wide string is always terminated */
	wcs = calloc(nchars + 1, sizeof(*wcs));
	if (!wcs)
		return bytes;

	if (mbstowcs(wcs, str, nchars) == (size_t) -1) {
		free(wcs);
		return bytes;
	}

	*width = wc_truncate(wcs, *width);

	/* never write more than the original string occupied */
	bytes = wcstombs(str, wcs, bytes);
	free(wcs);

	if (bytes == (size_t) -1)
		return bytes;		/* not convertible back */
#else
	if (*width < bytes)
		bytes = *width;
	*width = bytes;			/* one byte, one cell */
#endif
	str[bytes] = '\0';
	return bytes;
}
459
/* Fill DEST with up to N_SPACES copies of PADCHAR, stopping early so
   that nothing is stored beyond DEST_END. The output is always
   NUL-terminated (the NUL may land on DEST_END itself).
   Returns a pointer to that terminating NUL. */

static char*
mbs_align_pad (char *dest, const char* dest_end, size_t n_spaces, int padchar)
{
  while (n_spaces > 0 && dest < dest_end)
    {
      *dest++ = padchar;
      n_spaces--;
    }

  *dest = '\0';
  return dest;
}
473
474 size_t
475 mbsalign (const char *src, char *dest, size_t dest_size,
476 size_t *width, mbs_align_t align, int flags)
477 {
478 return mbsalign_with_padding(src, dest, dest_size, width, align, flags, ' ');
479 }
480
481 /* Align a string, SRC, in a field of *WIDTH columns, handling multi-byte
482 characters; write the result into the DEST_SIZE-byte buffer, DEST.
483 ALIGNMENT specifies whether to left- or right-justify or to center.
484 If SRC requires more than *WIDTH columns, truncate it to fit.
485 When centering, the number of trailing spaces may be one less than the
486 number of leading spaces. The FLAGS parameter is unused at present.
487 Return the length in bytes required for the final result, not counting
488 the trailing NUL. A return value of DEST_SIZE or larger means there
489 wasn't enough space. DEST will be NUL terminated in any case.
490 Return (size_t) -1 upon error (invalid multi-byte sequence in SRC,
491 or malloc failure), unless MBA_UNIBYTE_FALLBACK is specified.
492 Update *WIDTH to indicate how many columns were used before padding. */
493
494 size_t
495 mbsalign_with_padding (const char *src, char *dest, size_t dest_size,
496 size_t *width, mbs_align_t align,
497 #ifdef HAVE_WIDECHAR
498 int flags,
499 #else
500 int flags __attribute__((__unused__)),
501 #endif
502 int padchar)
503 {
504 size_t ret = -1;
505 size_t src_size = strlen (src) + 1;
506 char *newstr = NULL;
507 wchar_t *str_wc = NULL;
508 const char *str_to_print = src;
509 size_t n_cols = src_size - 1;
510 size_t n_used_bytes = n_cols; /* Not including NUL */
511 size_t n_spaces = 0, space_left;
512
513 #ifdef HAVE_WIDECHAR
514 bool conversion = false;
515 bool wc_enabled = false;
516
517 /* In multi-byte locales convert to wide characters
518 to allow easy truncation. Also determine number
519 of screen columns used. */
520 if (MB_CUR_MAX > 1)
521 {
522 size_t src_chars = mbstowcs (NULL, src, 0);
523 if (src_chars == (size_t) -1)
524 {
525 if (flags & MBA_UNIBYTE_FALLBACK)
526 goto mbsalign_unibyte;
527 else
528 goto mbsalign_cleanup;
529 }
530 src_chars += 1; /* make space for NUL */
531 str_wc = malloc (src_chars * sizeof (wchar_t));
532 if (str_wc == NULL)
533 {
534 if (flags & MBA_UNIBYTE_FALLBACK)
535 goto mbsalign_unibyte;
536 else
537 goto mbsalign_cleanup;
538 }
539 if (mbstowcs (str_wc, src, src_chars) != 0)
540 {
541 str_wc[src_chars - 1] = L'\0';
542 wc_enabled = true;
543 conversion = wc_ensure_printable (str_wc);
544 n_cols = rpl_wcswidth (str_wc, src_chars);
545 }
546 }
547
548 /* If we transformed or need to truncate the source string
549 then create a modified copy of it. */
550 if (wc_enabled && (conversion || (n_cols > *width)))
551 {
552 if (conversion)
553 {
554 /* May have increased the size by converting
555 \t to \uFFFD for example. */
556 src_size = wcstombs(NULL, str_wc, 0) + 1;
557 }
558 newstr = malloc (src_size);
559 if (newstr == NULL)
560 {
561 if (flags & MBA_UNIBYTE_FALLBACK)
562 goto mbsalign_unibyte;
563 else
564 goto mbsalign_cleanup;
565 }
566 str_to_print = newstr;
567 n_cols = wc_truncate (str_wc, *width);
568 n_used_bytes = wcstombs (newstr, str_wc, src_size);
569 }
570
571 mbsalign_unibyte:
572 #endif
573
574 if (n_cols > *width) /* Unibyte truncation required. */
575 {
576 n_cols = *width;
577 n_used_bytes = n_cols;
578 }
579
580 if (*width > n_cols) /* Padding required. */
581 n_spaces = *width - n_cols;
582
583 /* indicate to caller how many cells needed (not including padding). */
584 *width = n_cols;
585
586 /* indicate to caller how many bytes needed (not including NUL). */
587 ret = n_used_bytes + (n_spaces * 1);
588
589 /* Write as much NUL terminated output to DEST as possible. */
590 if (dest_size != 0)
591 {
592 char *dest_end = dest + dest_size - 1;
593 size_t start_spaces;
594 size_t end_spaces;
595
596 switch (align)
597 {
598 case MBS_ALIGN_CENTER:
599 start_spaces = n_spaces / 2 + n_spaces % 2;
600 end_spaces = n_spaces / 2;
601 break;
602 case MBS_ALIGN_LEFT:
603 start_spaces = 0;
604 end_spaces = n_spaces;
605 break;
606 case MBS_ALIGN_RIGHT:
607 start_spaces = n_spaces;
608 end_spaces = 0;
609 break;
610 default:
611 abort();
612 }
613
614 dest = mbs_align_pad (dest, dest_end, start_spaces, padchar);
615 space_left = dest_end - dest;
616 dest = mempcpy (dest, str_to_print, min (n_used_bytes, space_left));
617 mbs_align_pad (dest, dest_end, end_spaces, padchar);
618 }
619 #ifdef HAVE_WIDECHAR
620 mbsalign_cleanup:
621 #endif
622 free (str_wc);
623 free (newstr);
624
625 return ret;
626 }