1  /* Copyright 2010-2023 Free Software Foundation, Inc.
       2  
       3     This program is free software: you can redistribute it and/or modify
       4     it under the terms of the GNU General Public License as published by
       5     the Free Software Foundation, either version 3 of the License, or
       6     (at your option) any later version.
       7  
       8     This program is distributed in the hope that it will be useful,
       9     but WITHOUT ANY WARRANTY; without even the implied warranty of
      10     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      11     GNU General Public License for more details.
      12  
      13     You should have received a copy of the GNU General Public License
      14     along with this program.  If not, see <http://www.gnu.org/licenses/>. */
      15  
      16  #ifdef HAVE_CONFIG_H
      17    #include <config.h>
      18  #endif
      19  #include <stdlib.h>
      20  #include <stdio.h>
      21  #include <string.h>
      22  #include <locale.h>
      23  #ifndef _WIN32
      24  #include <langinfo.h>
      25  #else  /* _WIN32 */
      26  /* Workaround for problems caused in mingw.org's MinGW build by
      27     Gnulib's wchar.h overriding the wint_t type definition, which
      28     causes compilation errors when perl.h is included below, because
      29     perl.h includes ctype.h.  */
      30  #include <ctype.h>
      31  #endif
      32  #include <wchar.h>
      33  #include <wctype.h>
      34  
      35  /* See "How do I use all this in extensions" in 'man perlguts'. */
      36  #define PERL_NO_GET_CONTEXT
      37  
      38  #include "EXTERN.h"
      39  #include "perl.h"
      40  #if defined _WIN32 && !defined __CYGWIN__
      41  # undef free
      42  #endif
      43  #include "XSUB.h"
      44  
      45  #include "ppport.h"
      46  
      47  #include "xspara.h"
      48  
      49  #include "text.h"
      50  
/* When nonzero, the formatting functions in this file print tracing
   output to stderr. */
int debug = 0;

/* Complete state of one paragraph formatter.  One copy (the variable
   'state' below) is the paragraph being worked on; the others are parked
   in 'state_array'. */
typedef struct {
    TEXT space; /* Pending space, to be output before the pending word. */
    TEXT word; /* Pending word.  If outputting this would have led to
                  the line to be too long, the line should have been cut before
                  saving it. */

    /* When word.end == 0, this indicates a word of length 0: a nonzero
       value makes xspara__add_pending_word treat the empty word as
       something to output. */
    int invisible_pending_word;

    /* Length of space in multibyte characters.  Added to 'counter' when
       the pending space is output. */
    int space_counter;

    /* Width added so far for the current word, as counted by
       xspara__add_next: one per printable ASCII byte, and the wcwidth
       column count for other characters. */
    int word_counter; 

    /* -2 means we are not at the end of a sentence (undefined in Perl),
       1 means we are at the end of a sentence and French spacing is off,
       -1 means we are at the end of a sentence and French spacing is on.
       0 means it is "inhibited". */
    int end_sentence;

    int max; /* Maximum length of line. */
    int indent_length; /* Columns to indent this line. */
    int indent_length_next; /* Columns to indent the rest of the lines.
                               -1 means unset. */
    int counter; /* Columns so far on this line. */

    int lines_counter; /* Lines so far added in paragraph. */
    int end_line_count; /* Number of newlines so far in an output unit, i.e.
                           with add_text or add_next.  Zeroed at the start
                           of each public entry point. */

    wint_t last_letter; /* Last letter in word, used to decide if we're
                            at the end of a sentence. */

    /* Options set with set_space_protection. */
    int no_break;       /* Line break forbidden, as in @w. */
    int ignore_columns; /* Don't cut line at right margin.  Used by
                           @flushleft and @flushright. */
    int keep_end_lines; /* A newline in the input ends a line in the output.
                           Used by @flushleft and @flushright. */
    int french_spacing; /* Only one space, not two, after a full stop. */
    int double_width_no_break; /* No line break between double width chars. */

    /* No wrapping of lines and spaces are kept as-is. */
    int unfilled;

    /* Do not terminate with a final newline. */
    int no_final_newline;

    /* Terminate with any trailing space. */
    int add_final_space;

    /* Nonzero while this paragraph slot is allocated: set by xspara_new,
       cleared by xspara_end. */
    int in_use;
} PARAGRAPH;

/* The paragraph state all the functions in this file operate on. */
static PARAGRAPH state;
     108  
     109  #ifdef _WIN32
     110  
     111  #define WIN32_LEAN_AND_MEAN
     112  #include <windows.h>
     113  #include <errno.h>
     114  
     115  /* If Gnulib overrides wint_t with a wider type, we cannot use
     116     iswspace etc. names, whose prototypes were seen with the original
     117     wint_t in effect.  */
     118  #ifdef GNULIB_defined_wint_t
     119  # undef iswspace
     120  # define iswspace(w) w32_iswspace(w)
     121  # undef iswupper
     122  # define iswupper(w) w32_iswupper(w)
     123  #endif
     124  
     125  char *
     126  w32_setlocale (int category, const char *value)
     127  {
     128    if (_stricmp (value, "en_us.utf-8") != 0)
     129      return NULL;
     130  
     131    /* Switch to the Windows U.S. English locale with its default
     132       codeset.  We will handle the non-ASCII text ourselves, so the
     133       codeset is unimportant, and Windows doesn't support UTF-8 as the
     134       codeset anyway.  */
     135    return setlocale (category, "ENU");
     136  }
     137  #define setlocale(c,v)  w32_setlocale(c,v)
     138  
     139  size_t
     140  mbrlen (const char * __restrict__ mbs, size_t n, mbstate_t * __restrict__ ps)
     141  {
     142    unsigned char byte1 = *mbs;
     143  
     144    if (ps != NULL)
     145      {
     146        errno = ENOSYS;
     147        return -1;
     148      }
     149  
     150    return
     151      ((byte1 & 0x80) == 0) ? 1 : ((byte1 & 0x20) == 0) ? 2 :
     152      ((byte1 & 0x10) == 0) ? 3 : 4;
     153  }
     154  
     155  /* Convert a UTF-8 encoded multibyte string to a wide character.  */
     156  size_t
     157  mbrtowc (wchar_t * __restrict__ pwc, const char * __restrict__ mbs, size_t n,
     158  	 mbstate_t * __restrict__ ps)
     159  {
     160    int len = mbrlen (mbs, n, ps);
     161  
     162    if (mbs == NULL)
     163      return 0;
     164    else
     165      {
     166        wchar_t wc[2];
     167        size_t n_utf16 = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS,
     168  					    mbs, len, wc, 2);
     169        if (n_utf16 == 0)
     170  	{
     171  	  errno = EILSEQ;
     172  	  return (size_t)-1;
     173  	}
     174        if (ps != NULL)
     175  	{
     176  	  errno = ENOSYS;
     177  	  return (size_t)-1;
     178  	}
     179        /* We don't support UTF-16 surrogates, because the calling code
     180  	 doesn't, and because character classification functions on
     181  	 Windows don't support anything beyond the BMP anyway.  So we
     182  	 return the first character of the surrogate pair and set
     183  	 errno.  */
     184        if (n_utf16 > 1)
     185  	errno = ENOSYS;
     186        if (pwc != NULL)
     187  	*pwc = wc[0];
     188  
     189        return len;
     190      }
     191  }
     192  
     193  /* NOTE - not used at present */
     194  int
     195  iswspace (wint_t wc)
     196  {
     197    /* See Unicode's Proplist.txt.  */
     198    if ((wc >= 0x09 && wc <= 0x0D)
     199        || wc == 0x20
     200        || wc == 0x85
     201        || wc == 0xA0
     202        || wc == 0x1680
     203        || (wc >= 0x2000 && wc <= 0x200A)
     204        || wc == 0x2028
     205        || wc == 0x2029
     206        || wc == 0x202F
     207        || wc == 0x205F
     208        || wc == 0x3000)
     209      return 1;
     210  
     211    return 0;
     212  }
     213  
     214  int
     215  iswupper (wint_t wi)
     216  {
     217    WORD char_type;
     218    wchar_t wc = wi;
     219    BOOL status = GetStringTypeW (CT_CTYPE1, &wc, 1, &char_type);
     220  
     221    if (!status || (char_type & C1_UPPER) == 0)
     222      return 0;
     223  
     224    return 1;
     225  }
     226  
     227  /* Avoid warnings due to redefinition of popen/pclose in Perl headers.  */
     228  #ifdef popen
     229  # undef popen
     230  # define popen(c,m) _popen(c,m)
     231  #endif
     232  #ifdef pclose
     233  # undef pclose
     234  # define pclose(f)  _pclose(f)
     235  #endif
     236  
     237  #endif
     238  
     239  /* for debug */
     240  char *
     241  xspara__print_escaped_spaces (char *string)
     242  {
     243    static TEXT t;
     244    char *p = string;
     245    text_reset (&t);
     246    while (*p)
     247      {
     248        if (*p == ' ')
     249          text_append_n (&t, p, 1);
     250        else if (*p == '\n')
     251          text_append_n (&t, "\\n", 2);
     252        else if (*p == '\f')
     253          text_append_n (&t, "\\f", 2);
     254        else if (isspace(*p))
     255          {
     256            char protected_string[7];
     257            sprintf (protected_string, "\\x%04x", *p);
     258            text_append (&t, protected_string);
     259          }
     260        p++;
     261      }
     262    return t.text;
     263  }
     264  
     265  int
     266  xspara_init (int unused, char *unused2)
     267  {
     268    char *utf8_locale = 0;
     269    int len;
     270    char *cur;
     271    char *dot;
     272  
     273    dTHX;
     274  
     275  #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
     276    /* needed due to thread-safe locale handling in newer perls */
     277    switch_to_global_locale();
     278  #endif
     279  
     280    if (setlocale (LC_CTYPE, "en_US.UTF-8")
     281        || setlocale (LC_CTYPE, "en_US.utf8"))
     282      goto success;
     283  
     284    cur = setlocale (LC_CTYPE, 0); /* Name of current locale. */
     285    if (!cur)
     286      goto failure;
     287    len = strlen (cur);
     288    if ((len >= 6 && !memcmp (".UTF-8", cur + len - 6, 6))
     289        || (len >= 5 && !memcmp (".utf8", cur + len - 5, 5))
     290        || (len >= 6 && !memcmp (".utf-8", cur + len - 6, 6))
     291        || (len >= 5 && !memcmp (".UTF8", cur + len - 5, 5)))
     292      {
     293        setlocale (LC_CTYPE, ""); /* Use the locale from the environment. */
     294        goto success;
     295      }
     296  
     297    /* Otherwise try altering the current locale name. */
     298    dot = strchr (cur, '.');
     299    if (!dot)
     300      dot = cur + len;
     301    utf8_locale = malloc (len + 6 + 1); /* enough to add ".UTF-8" to end */
     302    memcpy (utf8_locale, cur, dot - cur);
     303    dot = utf8_locale + (dot - cur);
     304    memcpy (dot, ".UTF-8", 7);
     305    if (setlocale (LC_CTYPE, utf8_locale))
     306      goto success;
     307  
     308    memcpy (dot, ".utf8", 6);
     309    if (setlocale (LC_CTYPE, utf8_locale))
     310      goto success;
     311  
     312    /* Otherwise, look for any UTF-8 locale in the output of "locale -a". */
     313    {
     314    FILE *p;
     315    char *line = 0;
     316    size_t n = 0;
     317    ssize_t ret;
     318    p = popen ("locale -a", "r");
     319    if (!p)
     320      goto failure;
     321    while (1)
     322      {
     323        ret = getline (&line, &n, p);
     324        if (ret == (ssize_t) -1)
     325          {
     326            free (line);
     327            pclose (p);
     328            goto failure;
     329          }
     330        if (strstr (line, "UTF-8") || strstr (line, "utf8"))
     331          {
     332            line[ret - 1] = '\0';   /* Remove trailing newline. */
     333            if (setlocale (LC_CTYPE, line))
     334              {
     335                free (line);
     336                pclose (p);
     337                goto success;
     338              }
     339          }
     340      }
     341    }
     342        
     343    if (1)
     344      {
     345  failure:
     346        return 0; /* failure */
     347      }
     348    else
     349      {
     350  success: ;
     351        free (utf8_locale);
     352  #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
     353        /* needed due to thread-safe locale handling in newer perls */
     354        sync_locale();
     355  #endif
     356        /*
     357        fprintf (stderr, "tried to set LC_CTYPE to UTF-8.\n");
     358        fprintf (stderr, "character encoding is: %s\n",
     359                 nl_langinfo (CODESET));
     360         */
     361        return 1; /* success */
     362      }
     363  }
     364  
/* Array for storing paragraph states which aren't in use.  Allocated and
   grown by xspara_new; the 'in_use' member of each slot tells whether the
   slot belongs to a live paragraph. */
static PARAGRAPH *state_array;
/* Number of slots allocated in state_array. */
static int state_array_size;

/* The slot in state_array for saving the current state, i.e. the slot
   that the variable 'state' is copied back to when switching to another
   paragraph. */
static int current_state;
     371  
     372  static void
     373  xspara__switch_state (int id)
     374  {
     375    if (current_state == id)
     376      return;
     377    if (current_state != -1)
     378      memcpy (&state_array[current_state], &state, sizeof (PARAGRAPH));
     379  
     380    memcpy (&state, &state_array[id], sizeof (PARAGRAPH));
     381    current_state = id;
     382  }
     383  
     384  int
     385  xspara_new (HV *conf)
     386  {
     387    int i;
     388  
     389    dTHX; /* Perl boiler plate */
     390  
     391    TEXT saved_space, saved_word;
     392  
     393    /* Find an unused slot in state_array */
     394    for (i = 0; i < state_array_size; i++)
     395      {
     396        if (!state_array[i].in_use)
     397          break;
     398      }
     399    if (i == state_array_size)
     400      {
     401        state_array = realloc (state_array,
     402                               (state_array_size += 10) * sizeof (PARAGRAPH));
     403        memset (state_array + i, 0, 10 * sizeof (PARAGRAPH));
     404      }
     405  
     406    state_array[i].in_use = 1;
     407    xspara__switch_state (i);
     408  
     409    /* Zero formatter, reusing storage. */
     410    saved_space = state.space;
     411    saved_word = state.word;
     412    memset (&state, 0, sizeof (state));
     413    state.space = saved_space;
     414    state.word = saved_word;
     415    state.space.end = state.word.end = 0;
     416    state.in_use = 1;
     417  
     418    /* Default values. */
     419    state.max = 72;
     420    state.indent_length_next = -1; /* Special value meaning undefined. */
     421    state.end_sentence = -2; /* Special value meaning undefined. */
     422    state.last_letter = L'\0';
     423  
     424    if (conf)
     425      xspara_init_state (conf);
     426  
     427    /* The paragraph ID. */
     428    return i;
     429  }
     430  
     431  
     432  /* SV is a blessed reference to an integer containing the paragraph ID. */
     433  void
     434  xspara_set_state (SV *sv)
     435  {
     436    dTHX;
     437  
     438    xspara__switch_state (SvIV (sv));
     439  }
     440  
     441  /* Set the state internal to this C module from the Perl hash. */
     442  void
     443  xspara_init_state (HV *hash)
     444  {
     445  #define FETCH(key) hv_fetch (hash, key, strlen (key), 0)
     446  #define FETCH_INT(key,where) { val = FETCH(key); \
     447                                 if (val) { where = SvIV (*val); } }
     448  
     449    SV **val;
     450    
     451    dTHX; /* This is boilerplate for interacting with Perl. */
     452  
     453    /* Fetch all these so they are set, and reset for each paragraph. */
     454    FETCH_INT("end_sentence", state.end_sentence);
     455    FETCH_INT("max", state.max);
     456  
     457    FETCH_INT("indent_length", state.indent_length);
     458    FETCH_INT("indent_length_next", state.indent_length_next);
     459    FETCH_INT("counter", state.counter); 
     460  
     461    FETCH_INT("word_counter", state.word_counter);
     462  
     463    FETCH_INT("lines_counter", state.lines_counter);
     464    FETCH_INT("end_line_count", state.end_line_count);
     465  
     466    FETCH_INT("no_break", state.no_break);
     467    FETCH_INT("ignore_columns", state.ignore_columns);
     468    FETCH_INT("keep_end_lines", state.keep_end_lines);
     469    FETCH_INT("frenchspacing", state.french_spacing);
     470  
     471    FETCH_INT("unfilled", state.unfilled);
     472    FETCH_INT("no_final_newline", state.no_final_newline);
     473    FETCH_INT("add_final_space", state.add_final_space);
     474  
     475    val = FETCH("word");
     476    if (val)
     477      {
     478        fprintf (stderr, "Bug: setting 'word' is not supported.\n");
     479        abort ();
     480      }
     481    val = FETCH("space");
     482    if (val)
     483      {
     484        fprintf (stderr, "Bug: setting 'space' is not supported.\n");
     485        abort ();
     486      }
     487    return;
     488  
     489  #undef FETCH
     490  #undef FETCH_INT
     491  }
     492  
     493  
     494  /************************************************************************/
     495  
     496  
     497  /* Append a newline character to RESULT. */
     498  void
     499  xspara__cut_line (TEXT *result)
     500  {
     501    if (!state.ignore_columns)
     502      {
     503        xspara__end_line ();
     504  
     505        text_append (result, "\n");
     506      }
     507  }
     508  
     509  int
     510  xspara_end_line_count (void)
     511  {
     512    return state.end_line_count;
     513  }
     514  
     515  int
     516  xspara_counter (void)
     517  {
     518    return state.counter;
     519  }
     520  
     521  /* End a line (throwing away a pending space, which we don't need)
     522     Note _end_line in Paragraph.pm returned "\n". */
     523  void
     524  xspara__end_line (void)
     525  {
     526    state.counter = 0;
     527    state.space.end = 0;
     528    state.space_counter = 0;
     529  
     530    /* This will only be true for the first line of output. */
     531    if (state.indent_length_next != -1)
     532      {
     533        state.indent_length = state.indent_length_next;
     534        state.indent_length_next = -1;
     535      }
     536  
     537    state.lines_counter++;
     538    state.end_line_count++;
     539    /* could be set to other values, anything that is not upper case. */
     540    state.last_letter = L'\n';
     541  }
     542  
     543  char *
     544  xspara_end_line (void)
     545  {
     546    state.end_line_count = 0;
     547    xspara__end_line ();
     548    return "\n";
     549  }
     550  
     551  /* Return concatenation of SPACE and WORD. */
     552  char *
     553  xspara_get_pending (void)
     554  {
     555    static TEXT t;
     556    text_reset (&t);
     557    text_append_n (&t, state.space.text, state.space.end);
     558    text_append_n (&t, state.word.text, state.word.end);
     559    return t.text;
     560  }
     561  
     562  /* Append to RESULT pending space followed by pending word, clearing them 
     563     afterwards.  Assume we don't need to wrap a line.  Only add spaces without a 
     564     word if ADD_SPACES. */
     565  void
     566  xspara__add_pending_word (TEXT *result, int add_spaces)
     567  {
     568    dTHX;
     569  
     570    if (state.word.end == 0 && !state.invisible_pending_word && !add_spaces)
     571      return;
     572  
     573    if (state.indent_length > state.counter)
     574      {
     575        int i;
     576        /* If we are not up to the left margin yet, output spaces to get there, 
     577           and ignore 'state.space', the pending space string.  In this case 
     578           state.counter is probably 0.  */
     579  
     580        for (i = 0; i < state.indent_length - state.counter; i++)
     581          text_append (result, " ");
     582        state.counter = state.indent_length;
     583  
     584        if (debug)
     585          fprintf (stderr, "INDENT(%d+%d)\n", state.counter, state.word_counter);
     586  
     587        /* Do not output leading spaces after the indent, unless 'unfilled'
     588           is on.  */
     589        if (!state.unfilled)
     590          {
     591            state.space.end = 0;
     592            state.space_counter = 0;
     593          }
     594      }
     595  
     596    if (state.space.end > 0)
     597      {
     598        text_append_n (result, state.space.text, state.space.end);
     599  
     600        state.counter += state.space_counter;
     601  
     602        if (debug)
     603          fprintf (stderr, "ADD_SPACES(%d+%d)\n", state.counter,
     604                                                  state.word_counter);
     605  
     606        state.space.end = 0;
     607        state.space_counter = 0;
     608      }
     609  
     610    if (state.word.end > 0 || state.invisible_pending_word)
     611      {
     612        text_append_n (result, state.word.text, state.word.end);
     613        state.counter += state.word_counter;
     614  
     615        if (debug)
     616          fprintf (stderr, "ADD_WORD[%s]+%d (%d)\n", state.word.text,
     617                   state.word_counter, state.counter);
     618  
     619        state.word.end = 0;
     620        state.word_counter = 0;
     621        state.invisible_pending_word = 0;
     622      }
     623  }
     624  
     625  /* Function for users of this module. */
     626  char *
     627  xspara_add_pending_word (int add_spaces)
     628  {
     629    static TEXT ret;
     630  
     631    text_reset (&ret);
     632    state.end_line_count = 0;
     633    xspara__add_pending_word (&ret, add_spaces);
     634    if (ret.text)
     635      return ret.text;
     636    else
     637      return "";
     638  }
     639  
     640  /* End a paragraph. */
     641  char *
     642  xspara_end (void)
     643  {
     644    static TEXT ret;
     645  
     646    dTHX;
     647  
     648    text_reset (&ret);
     649    state.end_line_count = 0;
     650  
     651    if (debug)
     652      fprintf (stderr, "PARA END\n");
     653  
     654    /* probably not really useful, but cleaner */
     655    state.last_letter = L'\0';
     656  
     657    xspara__add_pending_word (&ret, state.add_final_space);
     658    if (!state.no_final_newline && state.counter != 0)
     659      {
     660        text_append (&ret, "\n");
     661        state.lines_counter++;
     662        state.end_line_count++;
     663      }
     664  
     665    /* Now it's time to forget about the state. */
     666    state_array[current_state].in_use = 0;
     667    state.in_use = 0;
     668  
     669    /* Don't do this so we can get the closing line counts. */
     670    /* current_state = -1; */
     671  
     672    if (ret.text)
     673      return ret.text;
     674    else
     675      return "";
     676  }
     677  
     678  /* check if a byte is in the printable ASCII range */
     679  #define PRINTABLE_ASCII(c) (0x20 <= (c) && (c) <= 0x7E)
     680  
     681  /* ignored after end sentence character to determine if
     682     at the end of a sentence */
     683  #define after_punctuation_characters "\"')]"
     684  /* characters triggering an end of sentence */
     685  #define end_sentence_characters ".?!"
     686  
     687  /* Add WORD to paragraph in RESULT, not refilling WORD.  If we go past the end 
     688     of the line start a new one.  TRANSPARENT means that the letters in WORD
     689     are ignored for the purpose of deciding whether a full stop ends a sentence
     690     or not. */
     691  void
     692  xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
     693  {
     694    dTHX;
     695  
     696    int disinhibit = 0;
     697    if (!word)
     698      return;
     699  
     700    if (word_len >= 1 && word[word_len - 1] == '\b')
     701      {
     702        word[--word_len] = '\0';
     703        disinhibit = 1;
     704      }
     705  
     706    text_append_n (&state.word, word, word_len);
     707    if (word_len == 0 && word)
     708      state.invisible_pending_word = 1;
     709  
     710    if (!transparent)
     711      {
     712        if (disinhibit)
     713          state.last_letter = L'a'; /* a lower-case letter */
     714        else
     715          {
     716            /* Save last character in WORD */
     717            char *p = word + word_len;
     718  
     719            while (p > word)
     720              {
     721                int len = 0;
     722                /* Back one UTF-8 code point */
     723                do
     724                  {
     725                    p--;
     726                    len++;
     727                  }
     728                while ((*p & 0xC0) == 0x80 && p > word);
     729  
     730                if (!strchr (end_sentence_characters
     731                             after_punctuation_characters, *p))
     732                  {
     733                    if (!PRINTABLE_ASCII(*p))
     734                      {
     735                        wchar_t wc = L'\0';
     736                        mbrtowc (&wc, p, len, NULL);
     737                        state.last_letter = wc;
     738                        break;
     739                      }
     740                    else
     741                      {
     742                        state.last_letter = btowc (*p);
     743                        break;
     744                      }
     745                  }
     746              }
     747          }
     748      }
     749  
     750    if (strchr (word, '\n'))
     751      {
     752        /* If there was a newline in the word we just added, put the entire
     753           pending ouput in the results string, and start a new line. */
     754        xspara__add_pending_word (result, 0);
     755        xspara__end_line ();
     756      }
     757    else
     758      {
     759        /* Calculate length of multibyte string in characters. */
     760        int len = 0;
     761        int left = word_len;
     762        wchar_t w;
     763        char *p = word;
     764  
     765        while (left > 0)
     766          {
     767            int columns;
     768            int char_len;
     769  
     770            if (PRINTABLE_ASCII(*p))
     771              {
     772                len++; p++; left--;
     773                continue;
     774              }
     775  
     776            char_len = mbrtowc (&w, p, left, NULL);
     777            if (char_len == (size_t) -2) {
     778              /* unfinished multibyte character */
     779              char_len = left;
     780            } else if (char_len == (size_t) -1) {
     781              /* invalid character */
     782              char_len = 1;
     783            } else if (char_len == 0) {
     784              /* not sure what this means but we must avoid an infinite loop.
     785                 Possibly only happens with invalid strings */
     786              char_len = 1;
     787            }
     788            left -= char_len;
     789  
     790            columns = wcwidth (w);
     791            if (columns > 0)
     792              len += columns;
     793  
     794            p += char_len;
     795          }
     796  
     797        state.word_counter += len;
     798  
     799        if (state.counter != 0
     800            && state.counter + state.word_counter + state.space_counter
     801                > state.max)
     802          {
     803            xspara__cut_line (result);
     804          }
     805      }
     806    if (debug)
     807      fprintf (stderr, "WORD+ %s -> %s\n", word, state.word.space == 0 ?
     808                  "UNDEF" : state.word.text);
     809  }
     810  
     811  /* Like _add_next but zero end_line_count at beginning. */
     812  TEXT
     813  xspara_add_next (char *text, int text_len, int transparent)
     814  {
     815    static TEXT t;
     816  
     817    text_reset (&t);
     818    state.end_line_count = 0;
     819    xspara__add_next (&t, text, text_len, transparent);
     820  
     821    return t;
     822  }
     823  
     824  void
     825  xspara_remove_end_sentence (void)
     826  {
     827    state.end_sentence = 0;
     828  }
     829  
     830  void
     831  xspara_add_end_sentence (int value)
     832  {
     833    state.end_sentence = value;
     834  }
     835  
     836  void
     837  xspara_allow_end_sentence (void)
     838  {
     839    state.last_letter = L'a'; /* A lower-case letter. */
     840  }
     841  
     842  /* -1 in a parameter means leave that value as it is. */
     843  void
     844  xspara_set_space_protection (int no_break,
     845                               int ignore_columns,
     846                               int keep_end_lines,
     847                               int french_spacing,
     848                               int double_width_no_break)
     849  {
     850    if (no_break != -1)
     851      state.no_break = no_break;
     852    if (ignore_columns != -1)
     853      state.ignore_columns = ignore_columns;
     854    if (keep_end_lines != -1)
     855      state.keep_end_lines = keep_end_lines;
     856    if (double_width_no_break != -1)
     857      state.double_width_no_break = double_width_no_break;
     858    if (french_spacing != -1)
     859      state.french_spacing = french_spacing;
     860  
     861    /*fprintf (stderr, "SETTING SPACE (%d, %d, %d, %d)\n",
     862                                     no_break,
     863                                     ignore_columns,
     864                                     keep_end_lines,
     865                                     french_spacing);*/
     866  
     867   if (no_break != -1 && state.no_break)
     868     {
     869       if (state.word.end == 0)
     870         {
     871           /* In _add_pending_word this meant that an "empty word" would
     872              be output.  This makes "a @w{} b" -> "a  b", not "a b", and
     873              "a @w{}" at end of paragraph -> "a ", not "a". */
     874  
     875           state.invisible_pending_word = 1;
     876         }
     877     }
     878  
     879   return;
     880  }
     881  
     882  /*****************************************************************/
     883  
     884  /* Return string to be added to paragraph contents, wrapping text. This 
     885     function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to
     886     work correctly. */
     887  TEXT
     888  xspara_add_text (char *text, int len)
     889  {
     890    char *p = text;
     891    wchar_t wc;
     892    size_t char_len;
     893    int width;
     894    static TEXT result;
     895    dTHX;
     896  
     897    text_reset (&result);
     898  
     899    state.end_line_count = 0;
     900  
     901    while (len > 0)
     902      {
     903        if (debug)
     904          {
     905            fprintf(stderr, "p (%d+%d) s `%s', l `%lc', w `%s'\n",
     906                      state.counter, state.word_counter,
     907                      state.space.end == 0 ? ""
     908                        : xspara__print_escaped_spaces (state.space.text),
     909                      state.last_letter,
     910                      state.word.end > 0 ? state.word.text : "UNDEF");
     911          }
     912        if (isspace ((unsigned char) *p))
     913          {
     914            if (debug)
     915              {
     916                char t[2];
     917                t[0] = *p;
     918                t[1] = '\0';
     919                fprintf(stderr, "SPACES(%d) `%s'\n", state.counter,
     920                        xspara__print_escaped_spaces (t));
     921              }
     922  
     923            if (state.unfilled)
     924              {
     925                xspara__add_pending_word (&result, 0);
     926                if (*p == '\n')
     927                  {
     928                     xspara__end_line ();
     929                     text_append (&result, "\n");
     930                  }
     931                else
     932                  {
     933                    text_append_n (&state.space, p, 1);
     934                    state.space_counter++;
     935                  }
     936              }
     937            else if (state.no_break)
     938              {
     939                /* Append the spaces to the pending word. */
     940                if (state.word.end == 0
     941                    || state.word.text[state.word.end - 1] != ' ')
     942                  {
     943                    if (state.end_sentence == 1 && !state.french_spacing)
     944                      {
     945                        text_append_n (&state.word, "  ", 2);
     946                        state.word_counter += 2;
     947                      }
     948                    else
     949                      {
     950                        text_append_n (&state.word, " ", 1);
     951                        state.word_counter += 1;
     952                      }
     953  
     954                    if (state.counter != 0
     955                        && state.counter + state.word_counter
     956                            + state.space_counter > state.max)
     957                      {
     958                        xspara__cut_line (&result);
     959                      }
     960                  }
     961              }
     962            else /* no_break off */
     963              {
     964                int pending = state.invisible_pending_word;
     965                xspara__add_pending_word (&result, 0);
     966  
     967                if (state.counter != 0 || pending)
     968                  {
     969                    /* If we are at the end of a sentence where two spaces
     970                       are required. */
     971                    if (state.end_sentence == 1
     972                        && !state.french_spacing)
     973                      {
     974                        state.space.end = 0;
     975                        text_append_n (&state.space, "  ", 2);
     976                        state.space_counter = 2;
     977                      }
     978                    else /* Not at end of sentence. */
     979                      {
     980                        /* Only save the first space. */
     981                        if (state.space_counter < 1)
     982                          {
     983                            if (*p == '\n')
     984                              {
     985                                text_append_n (&state.space, " ", 1);
     986                                state.space_counter++;
     987                              }
     988                            else
     989                              {
     990                                text_append_n (&state.space, p, 1);
     991                                state.space_counter++;
     992                              }
     993                          }
     994                      }
     995                  }
     996              }
     997  
     998            /* If not enough space in the line for the pending space, start
     999               a new line. */
    1000            if (state.counter + state.space_counter > state.max)
    1001              {
    1002                xspara__cut_line (&result);
    1003              }
    1004  
    1005            if (!state.unfilled && *p == '\n' && state.keep_end_lines)
    1006              {
    1007                xspara__end_line ();
    1008                text_append (&result, "\n");
    1009              }
    1010            p++; len--;
    1011            state.last_letter = ' ';
    1012            continue;
    1013          }
    1014  
    1015        /************** Not a white space character. *****************/
    1016        if (!PRINTABLE_ASCII(*p))
    1017          {
    1018            char_len = mbrtowc (&wc, p, len, NULL);
    1019          }
    1020        else
    1021          {
    1022            /* Functonally the same as mbrtowc but (tested) slightly quicker. */
    1023            char_len = 1;
    1024            wc = btowc (*p);
    1025          }
    1026  
    1027        if ((long) char_len == 0)
    1028          break; /* Null character. Shouldn't happen. */
    1029        else if ((long) char_len < 0)
    1030          {
    1031            p++; len--; /* Invalid.  Just try to keep going. */
    1032            continue;
    1033          }
    1034  
    1035        width = wcwidth (wc);
    1036        /*************** Double width character. *********************/
    1037        if (width == 2)
    1038          {
    1039            if (debug)
    1040              fprintf (stderr, "FULLWIDTH\n");
    1041  
    1042            text_append_n (&state.word, p, char_len);
    1043            state.word_counter += 2;
    1044  
    1045            /* fullwidth latin letters can be upper case, so it is important to
    1046               use the actual characters here. */
    1047            state.last_letter = wc;
    1048  
    1049            /* We allow a line break in between Chinese characters even if
    1050               there was no space between them, unlike single-width
    1051               characters. */
    1052  
    1053            if (state.counter != 0
    1054                && state.counter + state.word_counter > state.max)
    1055              {
    1056                xspara__cut_line (&result);
    1057              }
    1058            /* Accumulate the characters so that they can be pushed
    1059               onto the next line if necessary. */
    1060            if (!state.no_break && !state.double_width_no_break)
    1061              {
    1062                xspara__add_pending_word (&result, 0);
    1063              }
    1064            state.end_sentence = -2;
    1065          }
    1066        else if (wc == L'\b')
    1067          {
    1068            /* Code to say that a following full stop (or question or
    1069               exclamation mark) may be an end of sentence. */
    1070            xspara_allow_end_sentence ();
    1071          }
    1072        /*************** Word character ******************************/
    1073        /* Note: width == 0 includes accent characters which should not
    1074           properly increase the column count.  This is not what the pure
    1075           Perl code does, though. */
    1076        else if (width == 1 || width == 0)
    1077          {
    1078            static char added_word[8]; /* long enough for one UTF-8 character */
    1079            memcpy (added_word, p, char_len);
    1080            added_word[char_len] = '\0';
    1081  
    1082            xspara__add_next (&result, added_word, char_len, 0);
    1083  
    1084            /* Now check if it is considered as an end of sentence, and
    1085               set state.end_sentence if it is. */
    1086  
    1087            if (strchr (end_sentence_characters, *p) && !state.unfilled)
    1088              {
    1089                /* Doesn't count if preceded by an upper-case letter. */
    1090                if (!iswupper (state.last_letter))
    1091                  {
    1092                    if (state.french_spacing)
    1093                      state.end_sentence = -1;
    1094                    else
    1095                      state.end_sentence = 1;
    1096                    if (debug)
    1097                      fprintf (stderr, "END_SENTENCE\n");
    1098                  }
    1099              }
    1100            else if (strchr (after_punctuation_characters, *p))
    1101              {
    1102                /* '"', '\'', ']' and ')' are ignored for the purpose
    1103                 of deciding whether a full stop ends a sentence. */
    1104              }
    1105            else
    1106              {
    1107                /* Otherwise reset the end of sentence marker: a full stop in
    1108                   a string like "aaaa.bbbb" doesn't mark an end of
    1109                   sentence. */
    1110                state.last_letter = wc;
    1111                if (debug && state.end_sentence != -2)
    1112                  fprintf (stderr, "delete END_SENTENCE(%d)\n",
    1113                                    state.end_sentence);
    1114                state.end_sentence = -2;
    1115              }
    1116          }
    1117        else
    1118          {
    1119            /* Not printable, possibly a tab, or a combining character.
    1120               Add it to the pending word without increasing the column
    1121               count. */
    1122            text_append_n (&state.word, p, char_len);
    1123          }
    1124        p += char_len; len -= char_len;
    1125      }
    1126  
    1127    return result;
    1128  }
    1129