1 /* Copyright 2010-2023 Free Software Foundation, Inc.
2
3 This program is free software: you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation, either version 3 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program. If not, see <http://www.gnu.org/licenses/>. */
15
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22 #include <locale.h>
23 #ifndef _WIN32
24 #include <langinfo.h>
25 #else /* _WIN32 */
26 /* Workaround for problems caused in mingw.org's MinGW build by
27 Gnulib's wchar.h overriding the wint_t type definition, which
28 causes compilation errors when perl.h is included below, because
29 perl.h includes ctype.h. */
30 #include <ctype.h>
31 #endif
32 #include <wchar.h>
33 #include <wctype.h>
34
35 /* See "How do I use all this in extensions" in 'man perlguts'. */
36 #define PERL_NO_GET_CONTEXT
37
38 #include "EXTERN.h"
39 #include "perl.h"
40 #if defined _WIN32 && !defined __CYGWIN__
41 # undef free
42 #endif
43 #include "XSUB.h"
44
45 #include "ppport.h"
46
47 #include "xspara.h"
48
49 #include "text.h"
50
/* When non-zero, the functions in this file print diagnostic traces
   to stderr. */
int debug = 0;
52
53 typedef struct {
54 TEXT space; /* Pending space, to be output before the pending word. */
55 TEXT word; /* Pending word. If outputting this would have led to
56 the line to be too long, the line should have been cut before
57 saving it. */
58
59 /* When word.end == 0, this indicates a word of length 0. */
60 int invisible_pending_word;
61
62 /* Length of space in multibyte characters. */
63 int space_counter;
64
65 /* Characters added so far in current word. */
66 int word_counter;
67
68 /* -2 means we are not at the end of a sentence (undefined in Perl),
69 1 means we are at the end of a sentence and French spacing is off,
70 -1 means we are at the end of a sentence and French spacing is on.
71 0 means it is "inhibited". */
72 int end_sentence;
73
74 int max; /* Maximum length of line. */
75 int indent_length; /* Columns to indent this line. */
76 int indent_length_next; /* Columns to indent the rest of the lines. */
77 int counter; /* Columns so far on this line. */
78
79 int lines_counter; /* Lines so far added in paragraph. */
80 int end_line_count; /* Number of newlines so far in an output unit, i.e.
81 with add_text or add_next. */
82
83 wint_t last_letter; /* Last letter in word, used to decide if we're
84 at the end of a sentence. */
85
86 /* Options set with set_space_protection. */
87 int no_break; /* Line break forbidden, as in @w. */
88 int ignore_columns; /* Don't cut line at right margin. Used by
89 @flushleft and @flushright. */
90 int keep_end_lines; /* A newline in the input ends a line in the output.
91 Used by @flushleft and @flushright. */
92 int french_spacing; /* Only one space, not two, after a full stop. */
93 int double_width_no_break; /* No line break between double width chars. */
94
95 /* No wrapping of lines and spaces are kept as-is. */
96 int unfilled;
97
98 /* Do not terminate with a final newline. */
99 int no_final_newline;
100
101 /* Terminate with any trailing space. */
102 int add_final_space;
103
104 int in_use;
105 } PARAGRAPH;
106
107 static PARAGRAPH state;
108
109 #ifdef _WIN32
110
111 #define WIN32_LEAN_AND_MEAN
112 #include <windows.h>
113 #include <errno.h>
114
115 /* If Gnulib overrides wint_t with a wider type, we cannot use
116 iswspace etc. names, whose prototypes were seen with the original
117 wint_t in effect. */
118 #ifdef GNULIB_defined_wint_t
119 # undef iswspace
120 # define iswspace(w) w32_iswspace(w)
121 # undef iswupper
122 # define iswupper(w) w32_iswupper(w)
123 #endif
124
/* Replacement for setlocale on native Windows.

   A null VALUE asks for the name of the current locale and is passed
   straight to the real setlocale.  The only locale that can be selected
   is "en_US.UTF-8" (compared case-insensitively); any other VALUE makes
   the function return NULL.  */
char *
w32_setlocale (int category, const char *value)
{
  /* The query form must not reach _stricmp, which requires a string.  */
  if (value == NULL)
    return setlocale (category, NULL);

  if (_stricmp (value, "en_us.utf-8") != 0)
    return NULL;

  /* Switch to the Windows U.S. English locale with its default
     codeset.  We will handle the non-ASCII text ourselves, so the
     codeset is unimportant, and Windows doesn't support UTF-8 as the
     codeset anyway.  */
  return setlocale (category, "ENU");
}
137 #define setlocale(c,v) w32_setlocale(c,v)
138
/* Return the length in bytes that the lead byte at MBS announces for a
   UTF-8 sequence (1 to 4).  Only the first byte is examined; N is not
   consulted and the continuation bytes are not validated.

   Returns 0 if MBS is null.  Conversion states are not supported: if PS
   is not null, errno is set to ENOSYS and (size_t) -1 is returned.  */
size_t
mbrlen (const char * __restrict__ mbs, size_t n, mbstate_t * __restrict__ ps)
{
  unsigned char lead;

  if (ps != NULL)
    {
      errno = ENOSYS;
      return (size_t) -1;
    }

  /* Nothing to look at; do not dereference a null pointer.  */
  if (mbs == NULL)
    return 0;

  lead = (unsigned char) *mbs;

  if ((lead & 0x80) == 0)
    return 1;
  if ((lead & 0x20) == 0)
    return 2;
  if ((lead & 0x10) == 0)
    return 3;
  return 4;
}
154
155 /* Convert a UTF-8 encoded multibyte string to a wide character. */
156 size_t
157 mbrtowc (wchar_t * __restrict__ pwc, const char * __restrict__ mbs, size_t n,
158 mbstate_t * __restrict__ ps)
159 {
160 int len = mbrlen (mbs, n, ps);
161
162 if (mbs == NULL)
163 return 0;
164 else
165 {
166 wchar_t wc[2];
167 size_t n_utf16 = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS,
168 mbs, len, wc, 2);
169 if (n_utf16 == 0)
170 {
171 errno = EILSEQ;
172 return (size_t)-1;
173 }
174 if (ps != NULL)
175 {
176 errno = ENOSYS;
177 return (size_t)-1;
178 }
179 /* We don't support UTF-16 surrogates, because the calling code
180 doesn't, and because character classification functions on
181 Windows don't support anything beyond the BMP anyway. So we
182 return the first character of the surrogate pair and set
183 errno. */
184 if (n_utf16 > 1)
185 errno = ENOSYS;
186 if (pwc != NULL)
187 *pwc = wc[0];
188
189 return len;
190 }
191 }
192
/* NOTE - not used at present */
/* Return 1 if WC is one of the white space code points listed below
   (see Unicode's Proplist.txt), 0 otherwise.  */
int
iswspace (wint_t wc)
{
  /* The two contiguous ranges.  */
  if (wc >= 0x09 && wc <= 0x0D)
    return 1;
  if (wc >= 0x2000 && wc <= 0x200A)
    return 1;

  /* The isolated code points.  */
  switch (wc)
    {
    case 0x20:
    case 0x85:
    case 0xA0:
    case 0x1680:
    case 0x2028:
    case 0x2029:
    case 0x202F:
    case 0x205F:
    case 0x3000:
      return 1;
    default:
      return 0;
    }
}
213
214 int
215 iswupper (wint_t wi)
216 {
217 WORD char_type;
218 wchar_t wc = wi;
219 BOOL status = GetStringTypeW (CT_CTYPE1, &wc, 1, &char_type);
220
221 if (!status || (char_type & C1_UPPER) == 0)
222 return 0;
223
224 return 1;
225 }
226
227 /* Avoid warnings due to redefinition of popen/pclose in Perl headers. */
228 #ifdef popen
229 # undef popen
230 # define popen(c,m) _popen(c,m)
231 #endif
232 #ifdef pclose
233 # undef pclose
234 # define pclose(f) _pclose(f)
235 #endif
236
237 #endif
238
239 /* for debug */
240 char *
241 xspara__print_escaped_spaces (char *string)
242 {
243 static TEXT t;
244 char *p = string;
245 text_reset (&t);
246 while (*p)
247 {
248 if (*p == ' ')
249 text_append_n (&t, p, 1);
250 else if (*p == '\n')
251 text_append_n (&t, "\\n", 2);
252 else if (*p == '\f')
253 text_append_n (&t, "\\f", 2);
254 else if (isspace(*p))
255 {
256 char protected_string[7];
257 sprintf (protected_string, "\\x%04x", *p);
258 text_append (&t, protected_string);
259 }
260 p++;
261 }
262 return t.text;
263 }
264
265 int
266 xspara_init (int unused, char *unused2)
267 {
268 char *utf8_locale = 0;
269 int len;
270 char *cur;
271 char *dot;
272
273 dTHX;
274
275 #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
276 /* needed due to thread-safe locale handling in newer perls */
277 switch_to_global_locale();
278 #endif
279
280 if (setlocale (LC_CTYPE, "en_US.UTF-8")
281 || setlocale (LC_CTYPE, "en_US.utf8"))
282 goto success;
283
284 cur = setlocale (LC_CTYPE, 0); /* Name of current locale. */
285 if (!cur)
286 goto failure;
287 len = strlen (cur);
288 if ((len >= 6 && !memcmp (".UTF-8", cur + len - 6, 6))
289 || (len >= 5 && !memcmp (".utf8", cur + len - 5, 5))
290 || (len >= 6 && !memcmp (".utf-8", cur + len - 6, 6))
291 || (len >= 5 && !memcmp (".UTF8", cur + len - 5, 5)))
292 {
293 setlocale (LC_CTYPE, ""); /* Use the locale from the environment. */
294 goto success;
295 }
296
297 /* Otherwise try altering the current locale name. */
298 dot = strchr (cur, '.');
299 if (!dot)
300 dot = cur + len;
301 utf8_locale = malloc (len + 6 + 1); /* enough to add ".UTF-8" to end */
302 memcpy (utf8_locale, cur, dot - cur);
303 dot = utf8_locale + (dot - cur);
304 memcpy (dot, ".UTF-8", 7);
305 if (setlocale (LC_CTYPE, utf8_locale))
306 goto success;
307
308 memcpy (dot, ".utf8", 6);
309 if (setlocale (LC_CTYPE, utf8_locale))
310 goto success;
311
312 /* Otherwise, look for any UTF-8 locale in the output of "locale -a". */
313 {
314 FILE *p;
315 char *line = 0;
316 size_t n = 0;
317 ssize_t ret;
318 p = popen ("locale -a", "r");
319 if (!p)
320 goto failure;
321 while (1)
322 {
323 ret = getline (&line, &n, p);
324 if (ret == (ssize_t) -1)
325 {
326 free (line);
327 pclose (p);
328 goto failure;
329 }
330 if (strstr (line, "UTF-8") || strstr (line, "utf8"))
331 {
332 line[ret - 1] = '\0'; /* Remove trailing newline. */
333 if (setlocale (LC_CTYPE, line))
334 {
335 free (line);
336 pclose (p);
337 goto success;
338 }
339 }
340 }
341 }
342
343 if (1)
344 {
345 failure:
346 return 0; /* failure */
347 }
348 else
349 {
350 success: ;
351 free (utf8_locale);
352 #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
353 /* needed due to thread-safe locale handling in newer perls */
354 sync_locale();
355 #endif
356 /*
357 fprintf (stderr, "tried to set LC_CTYPE to UTF-8.\n");
358 fprintf (stderr, "character encoding is: %s\n",
359 nl_langinfo (CODESET));
360 */
361 return 1; /* success */
362 }
363 }
364
365 /* Array for storing paragraph states which aren't in use. */
366 static PARAGRAPH *state_array;
367 static int state_array_size;
368
369 /* The slot in state_array for saving the current state. */
370 static int current_state;
371
372 static void
373 xspara__switch_state (int id)
374 {
375 if (current_state == id)
376 return;
377 if (current_state != -1)
378 memcpy (&state_array[current_state], &state, sizeof (PARAGRAPH));
379
380 memcpy (&state, &state_array[id], sizeof (PARAGRAPH));
381 current_state = id;
382 }
383
384 int
385 xspara_new (HV *conf)
386 {
387 int i;
388
389 dTHX; /* Perl boiler plate */
390
391 TEXT saved_space, saved_word;
392
393 /* Find an unused slot in state_array */
394 for (i = 0; i < state_array_size; i++)
395 {
396 if (!state_array[i].in_use)
397 break;
398 }
399 if (i == state_array_size)
400 {
401 state_array = realloc (state_array,
402 (state_array_size += 10) * sizeof (PARAGRAPH));
403 memset (state_array + i, 0, 10 * sizeof (PARAGRAPH));
404 }
405
406 state_array[i].in_use = 1;
407 xspara__switch_state (i);
408
409 /* Zero formatter, reusing storage. */
410 saved_space = state.space;
411 saved_word = state.word;
412 memset (&state, 0, sizeof (state));
413 state.space = saved_space;
414 state.word = saved_word;
415 state.space.end = state.word.end = 0;
416 state.in_use = 1;
417
418 /* Default values. */
419 state.max = 72;
420 state.indent_length_next = -1; /* Special value meaning undefined. */
421 state.end_sentence = -2; /* Special value meaning undefined. */
422 state.last_letter = L'\0';
423
424 if (conf)
425 xspara_init_state (conf);
426
427 /* The paragraph ID. */
428 return i;
429 }
430
431
432 /* SV is a blessed reference to an integer containing the paragraph ID. */
433 void
434 xspara_set_state (SV *sv)
435 {
436 dTHX;
437
438 xspara__switch_state (SvIV (sv));
439 }
440
441 /* Set the state internal to this C module from the Perl hash. */
442 void
443 xspara_init_state (HV *hash)
444 {
445 #define FETCH(key) hv_fetch (hash, key, strlen (key), 0)
446 #define FETCH_INT(key,where) { val = FETCH(key); \
447 if (val) { where = SvIV (*val); } }
448
449 SV **val;
450
451 dTHX; /* This is boilerplate for interacting with Perl. */
452
453 /* Fetch all these so they are set, and reset for each paragraph. */
454 FETCH_INT("end_sentence", state.end_sentence);
455 FETCH_INT("max", state.max);
456
457 FETCH_INT("indent_length", state.indent_length);
458 FETCH_INT("indent_length_next", state.indent_length_next);
459 FETCH_INT("counter", state.counter);
460
461 FETCH_INT("word_counter", state.word_counter);
462
463 FETCH_INT("lines_counter", state.lines_counter);
464 FETCH_INT("end_line_count", state.end_line_count);
465
466 FETCH_INT("no_break", state.no_break);
467 FETCH_INT("ignore_columns", state.ignore_columns);
468 FETCH_INT("keep_end_lines", state.keep_end_lines);
469 FETCH_INT("frenchspacing", state.french_spacing);
470
471 FETCH_INT("unfilled", state.unfilled);
472 FETCH_INT("no_final_newline", state.no_final_newline);
473 FETCH_INT("add_final_space", state.add_final_space);
474
475 val = FETCH("word");
476 if (val)
477 {
478 fprintf (stderr, "Bug: setting 'word' is not supported.\n");
479 abort ();
480 }
481 val = FETCH("space");
482 if (val)
483 {
484 fprintf (stderr, "Bug: setting 'space' is not supported.\n");
485 abort ();
486 }
487 return;
488
489 #undef FETCH
490 #undef FETCH_INT
491 }
492
493
494 /************************************************************************/
495
496
497 /* Append a newline character to RESULT. */
498 void
499 xspara__cut_line (TEXT *result)
500 {
501 if (!state.ignore_columns)
502 {
503 xspara__end_line ();
504
505 text_append (result, "\n");
506 }
507 }
508
509 int
510 xspara_end_line_count (void)
511 {
512 return state.end_line_count;
513 }
514
515 int
516 xspara_counter (void)
517 {
518 return state.counter;
519 }
520
521 /* End a line (throwing away a pending space, which we don't need)
522 Note _end_line in Paragraph.pm returned "\n". */
523 void
524 xspara__end_line (void)
525 {
526 state.counter = 0;
527 state.space.end = 0;
528 state.space_counter = 0;
529
530 /* This will only be true for the first line of output. */
531 if (state.indent_length_next != -1)
532 {
533 state.indent_length = state.indent_length_next;
534 state.indent_length_next = -1;
535 }
536
537 state.lines_counter++;
538 state.end_line_count++;
539 /* could be set to other values, anything that is not upper case. */
540 state.last_letter = L'\n';
541 }
542
543 char *
544 xspara_end_line (void)
545 {
546 state.end_line_count = 0;
547 xspara__end_line ();
548 return "\n";
549 }
550
551 /* Return concatenation of SPACE and WORD. */
552 char *
553 xspara_get_pending (void)
554 {
555 static TEXT t;
556 text_reset (&t);
557 text_append_n (&t, state.space.text, state.space.end);
558 text_append_n (&t, state.word.text, state.word.end);
559 return t.text;
560 }
561
562 /* Append to RESULT pending space followed by pending word, clearing them
563 afterwards. Assume we don't need to wrap a line. Only add spaces without a
564 word if ADD_SPACES. */
565 void
566 xspara__add_pending_word (TEXT *result, int add_spaces)
567 {
568 dTHX;
569
570 if (state.word.end == 0 && !state.invisible_pending_word && !add_spaces)
571 return;
572
573 if (state.indent_length > state.counter)
574 {
575 int i;
576 /* If we are not up to the left margin yet, output spaces to get there,
577 and ignore 'state.space', the pending space string. In this case
578 state.counter is probably 0. */
579
580 for (i = 0; i < state.indent_length - state.counter; i++)
581 text_append (result, " ");
582 state.counter = state.indent_length;
583
584 if (debug)
585 fprintf (stderr, "INDENT(%d+%d)\n", state.counter, state.word_counter);
586
587 /* Do not output leading spaces after the indent, unless 'unfilled'
588 is on. */
589 if (!state.unfilled)
590 {
591 state.space.end = 0;
592 state.space_counter = 0;
593 }
594 }
595
596 if (state.space.end > 0)
597 {
598 text_append_n (result, state.space.text, state.space.end);
599
600 state.counter += state.space_counter;
601
602 if (debug)
603 fprintf (stderr, "ADD_SPACES(%d+%d)\n", state.counter,
604 state.word_counter);
605
606 state.space.end = 0;
607 state.space_counter = 0;
608 }
609
610 if (state.word.end > 0 || state.invisible_pending_word)
611 {
612 text_append_n (result, state.word.text, state.word.end);
613 state.counter += state.word_counter;
614
615 if (debug)
616 fprintf (stderr, "ADD_WORD[%s]+%d (%d)\n", state.word.text,
617 state.word_counter, state.counter);
618
619 state.word.end = 0;
620 state.word_counter = 0;
621 state.invisible_pending_word = 0;
622 }
623 }
624
625 /* Function for users of this module. */
626 char *
627 xspara_add_pending_word (int add_spaces)
628 {
629 static TEXT ret;
630
631 text_reset (&ret);
632 state.end_line_count = 0;
633 xspara__add_pending_word (&ret, add_spaces);
634 if (ret.text)
635 return ret.text;
636 else
637 return "";
638 }
639
640 /* End a paragraph. */
641 char *
642 xspara_end (void)
643 {
644 static TEXT ret;
645
646 dTHX;
647
648 text_reset (&ret);
649 state.end_line_count = 0;
650
651 if (debug)
652 fprintf (stderr, "PARA END\n");
653
654 /* probably not really useful, but cleaner */
655 state.last_letter = L'\0';
656
657 xspara__add_pending_word (&ret, state.add_final_space);
658 if (!state.no_final_newline && state.counter != 0)
659 {
660 text_append (&ret, "\n");
661 state.lines_counter++;
662 state.end_line_count++;
663 }
664
665 /* Now it's time to forget about the state. */
666 state_array[current_state].in_use = 0;
667 state.in_use = 0;
668
669 /* Don't do this so we can get the closing line counts. */
670 /* current_state = -1; */
671
672 if (ret.text)
673 return ret.text;
674 else
675 return "";
676 }
677
678 /* check if a byte is in the printable ASCII range */
679 #define PRINTABLE_ASCII(c) (0x20 <= (c) && (c) <= 0x7E)
680
681 /* ignored after end sentence character to determine if
682 at the end of a sentence */
683 #define after_punctuation_characters "\"')]"
684 /* characters triggering an end of sentence */
685 #define end_sentence_characters ".?!"
686
687 /* Add WORD to paragraph in RESULT, not refilling WORD. If we go past the end
688 of the line start a new one. TRANSPARENT means that the letters in WORD
689 are ignored for the purpose of deciding whether a full stop ends a sentence
690 or not. */
691 void
692 xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
693 {
694 dTHX;
695
696 int disinhibit = 0;
697 if (!word)
698 return;
699
700 if (word_len >= 1 && word[word_len - 1] == '\b')
701 {
702 word[--word_len] = '\0';
703 disinhibit = 1;
704 }
705
706 text_append_n (&state.word, word, word_len);
707 if (word_len == 0 && word)
708 state.invisible_pending_word = 1;
709
710 if (!transparent)
711 {
712 if (disinhibit)
713 state.last_letter = L'a'; /* a lower-case letter */
714 else
715 {
716 /* Save last character in WORD */
717 char *p = word + word_len;
718
719 while (p > word)
720 {
721 int len = 0;
722 /* Back one UTF-8 code point */
723 do
724 {
725 p--;
726 len++;
727 }
728 while ((*p & 0xC0) == 0x80 && p > word);
729
730 if (!strchr (end_sentence_characters
731 after_punctuation_characters, *p))
732 {
733 if (!PRINTABLE_ASCII(*p))
734 {
735 wchar_t wc = L'\0';
736 mbrtowc (&wc, p, len, NULL);
737 state.last_letter = wc;
738 break;
739 }
740 else
741 {
742 state.last_letter = btowc (*p);
743 break;
744 }
745 }
746 }
747 }
748 }
749
750 if (strchr (word, '\n'))
751 {
752 /* If there was a newline in the word we just added, put the entire
753 pending ouput in the results string, and start a new line. */
754 xspara__add_pending_word (result, 0);
755 xspara__end_line ();
756 }
757 else
758 {
759 /* Calculate length of multibyte string in characters. */
760 int len = 0;
761 int left = word_len;
762 wchar_t w;
763 char *p = word;
764
765 while (left > 0)
766 {
767 int columns;
768 int char_len;
769
770 if (PRINTABLE_ASCII(*p))
771 {
772 len++; p++; left--;
773 continue;
774 }
775
776 char_len = mbrtowc (&w, p, left, NULL);
777 if (char_len == (size_t) -2) {
778 /* unfinished multibyte character */
779 char_len = left;
780 } else if (char_len == (size_t) -1) {
781 /* invalid character */
782 char_len = 1;
783 } else if (char_len == 0) {
784 /* not sure what this means but we must avoid an infinite loop.
785 Possibly only happens with invalid strings */
786 char_len = 1;
787 }
788 left -= char_len;
789
790 columns = wcwidth (w);
791 if (columns > 0)
792 len += columns;
793
794 p += char_len;
795 }
796
797 state.word_counter += len;
798
799 if (state.counter != 0
800 && state.counter + state.word_counter + state.space_counter
801 > state.max)
802 {
803 xspara__cut_line (result);
804 }
805 }
806 if (debug)
807 fprintf (stderr, "WORD+ %s -> %s\n", word, state.word.space == 0 ?
808 "UNDEF" : state.word.text);
809 }
810
811 /* Like _add_next but zero end_line_count at beginning. */
812 TEXT
813 xspara_add_next (char *text, int text_len, int transparent)
814 {
815 static TEXT t;
816
817 text_reset (&t);
818 state.end_line_count = 0;
819 xspara__add_next (&t, text, text_len, transparent);
820
821 return t;
822 }
823
824 void
825 xspara_remove_end_sentence (void)
826 {
827 state.end_sentence = 0;
828 }
829
830 void
831 xspara_add_end_sentence (int value)
832 {
833 state.end_sentence = value;
834 }
835
836 void
837 xspara_allow_end_sentence (void)
838 {
839 state.last_letter = L'a'; /* A lower-case letter. */
840 }
841
842 /* -1 in a parameter means leave that value as it is. */
843 void
844 xspara_set_space_protection (int no_break,
845 int ignore_columns,
846 int keep_end_lines,
847 int french_spacing,
848 int double_width_no_break)
849 {
850 if (no_break != -1)
851 state.no_break = no_break;
852 if (ignore_columns != -1)
853 state.ignore_columns = ignore_columns;
854 if (keep_end_lines != -1)
855 state.keep_end_lines = keep_end_lines;
856 if (double_width_no_break != -1)
857 state.double_width_no_break = double_width_no_break;
858 if (french_spacing != -1)
859 state.french_spacing = french_spacing;
860
861 /*fprintf (stderr, "SETTING SPACE (%d, %d, %d, %d)\n",
862 no_break,
863 ignore_columns,
864 keep_end_lines,
865 french_spacing);*/
866
867 if (no_break != -1 && state.no_break)
868 {
869 if (state.word.end == 0)
870 {
871 /* In _add_pending_word this meant that an "empty word" would
872 be output. This makes "a @w{} b" -> "a b", not "a b", and
873 "a @w{}" at end of paragraph -> "a ", not "a". */
874
875 state.invisible_pending_word = 1;
876 }
877 }
878
879 return;
880 }
881
882 /*****************************************************************/
883
884 /* Return string to be added to paragraph contents, wrapping text. This
885 function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to
886 work correctly. */
887 TEXT
888 xspara_add_text (char *text, int len)
889 {
890 char *p = text;
891 wchar_t wc;
892 size_t char_len;
893 int width;
894 static TEXT result;
895 dTHX;
896
897 text_reset (&result);
898
899 state.end_line_count = 0;
900
901 while (len > 0)
902 {
903 if (debug)
904 {
905 fprintf(stderr, "p (%d+%d) s `%s', l `%lc', w `%s'\n",
906 state.counter, state.word_counter,
907 state.space.end == 0 ? ""
908 : xspara__print_escaped_spaces (state.space.text),
909 state.last_letter,
910 state.word.end > 0 ? state.word.text : "UNDEF");
911 }
912 if (isspace ((unsigned char) *p))
913 {
914 if (debug)
915 {
916 char t[2];
917 t[0] = *p;
918 t[1] = '\0';
919 fprintf(stderr, "SPACES(%d) `%s'\n", state.counter,
920 xspara__print_escaped_spaces (t));
921 }
922
923 if (state.unfilled)
924 {
925 xspara__add_pending_word (&result, 0);
926 if (*p == '\n')
927 {
928 xspara__end_line ();
929 text_append (&result, "\n");
930 }
931 else
932 {
933 text_append_n (&state.space, p, 1);
934 state.space_counter++;
935 }
936 }
937 else if (state.no_break)
938 {
939 /* Append the spaces to the pending word. */
940 if (state.word.end == 0
941 || state.word.text[state.word.end - 1] != ' ')
942 {
943 if (state.end_sentence == 1 && !state.french_spacing)
944 {
945 text_append_n (&state.word, " ", 2);
946 state.word_counter += 2;
947 }
948 else
949 {
950 text_append_n (&state.word, " ", 1);
951 state.word_counter += 1;
952 }
953
954 if (state.counter != 0
955 && state.counter + state.word_counter
956 + state.space_counter > state.max)
957 {
958 xspara__cut_line (&result);
959 }
960 }
961 }
962 else /* no_break off */
963 {
964 int pending = state.invisible_pending_word;
965 xspara__add_pending_word (&result, 0);
966
967 if (state.counter != 0 || pending)
968 {
969 /* If we are at the end of a sentence where two spaces
970 are required. */
971 if (state.end_sentence == 1
972 && !state.french_spacing)
973 {
974 state.space.end = 0;
975 text_append_n (&state.space, " ", 2);
976 state.space_counter = 2;
977 }
978 else /* Not at end of sentence. */
979 {
980 /* Only save the first space. */
981 if (state.space_counter < 1)
982 {
983 if (*p == '\n')
984 {
985 text_append_n (&state.space, " ", 1);
986 state.space_counter++;
987 }
988 else
989 {
990 text_append_n (&state.space, p, 1);
991 state.space_counter++;
992 }
993 }
994 }
995 }
996 }
997
998 /* If not enough space in the line for the pending space, start
999 a new line. */
1000 if (state.counter + state.space_counter > state.max)
1001 {
1002 xspara__cut_line (&result);
1003 }
1004
1005 if (!state.unfilled && *p == '\n' && state.keep_end_lines)
1006 {
1007 xspara__end_line ();
1008 text_append (&result, "\n");
1009 }
1010 p++; len--;
1011 state.last_letter = ' ';
1012 continue;
1013 }
1014
1015 /************** Not a white space character. *****************/
1016 if (!PRINTABLE_ASCII(*p))
1017 {
1018 char_len = mbrtowc (&wc, p, len, NULL);
1019 }
1020 else
1021 {
1022 /* Functonally the same as mbrtowc but (tested) slightly quicker. */
1023 char_len = 1;
1024 wc = btowc (*p);
1025 }
1026
1027 if ((long) char_len == 0)
1028 break; /* Null character. Shouldn't happen. */
1029 else if ((long) char_len < 0)
1030 {
1031 p++; len--; /* Invalid. Just try to keep going. */
1032 continue;
1033 }
1034
1035 width = wcwidth (wc);
1036 /*************** Double width character. *********************/
1037 if (width == 2)
1038 {
1039 if (debug)
1040 fprintf (stderr, "FULLWIDTH\n");
1041
1042 text_append_n (&state.word, p, char_len);
1043 state.word_counter += 2;
1044
1045 /* fullwidth latin letters can be upper case, so it is important to
1046 use the actual characters here. */
1047 state.last_letter = wc;
1048
1049 /* We allow a line break in between Chinese characters even if
1050 there was no space between them, unlike single-width
1051 characters. */
1052
1053 if (state.counter != 0
1054 && state.counter + state.word_counter > state.max)
1055 {
1056 xspara__cut_line (&result);
1057 }
1058 /* Accumulate the characters so that they can be pushed
1059 onto the next line if necessary. */
1060 if (!state.no_break && !state.double_width_no_break)
1061 {
1062 xspara__add_pending_word (&result, 0);
1063 }
1064 state.end_sentence = -2;
1065 }
1066 else if (wc == L'\b')
1067 {
1068 /* Code to say that a following full stop (or question or
1069 exclamation mark) may be an end of sentence. */
1070 xspara_allow_end_sentence ();
1071 }
1072 /*************** Word character ******************************/
1073 /* Note: width == 0 includes accent characters which should not
1074 properly increase the column count. This is not what the pure
1075 Perl code does, though. */
1076 else if (width == 1 || width == 0)
1077 {
1078 static char added_word[8]; /* long enough for one UTF-8 character */
1079 memcpy (added_word, p, char_len);
1080 added_word[char_len] = '\0';
1081
1082 xspara__add_next (&result, added_word, char_len, 0);
1083
1084 /* Now check if it is considered as an end of sentence, and
1085 set state.end_sentence if it is. */
1086
1087 if (strchr (end_sentence_characters, *p) && !state.unfilled)
1088 {
1089 /* Doesn't count if preceded by an upper-case letter. */
1090 if (!iswupper (state.last_letter))
1091 {
1092 if (state.french_spacing)
1093 state.end_sentence = -1;
1094 else
1095 state.end_sentence = 1;
1096 if (debug)
1097 fprintf (stderr, "END_SENTENCE\n");
1098 }
1099 }
1100 else if (strchr (after_punctuation_characters, *p))
1101 {
1102 /* '"', '\'', ']' and ')' are ignored for the purpose
1103 of deciding whether a full stop ends a sentence. */
1104 }
1105 else
1106 {
1107 /* Otherwise reset the end of sentence marker: a full stop in
1108 a string like "aaaa.bbbb" doesn't mark an end of
1109 sentence. */
1110 state.last_letter = wc;
1111 if (debug && state.end_sentence != -2)
1112 fprintf (stderr, "delete END_SENTENCE(%d)\n",
1113 state.end_sentence);
1114 state.end_sentence = -2;
1115 }
1116 }
1117 else
1118 {
1119 /* Not printable, possibly a tab, or a combining character.
1120 Add it to the pending word without increasing the column
1121 count. */
1122 text_append_n (&state.word, p, char_len);
1123 }
1124 p += char_len; len -= char_len;
1125 }
1126
1127 return result;
1128 }
1129