1  /* Line breaking of strings.
       2     Copyright (C) 2001-2003, 2006-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2001.
       4  
       5     This file is free software.
       6     It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
       7     You can redistribute it and/or modify it under either
       8       - the terms of the GNU Lesser General Public License as published
       9         by the Free Software Foundation, either version 3, or (at your
      10         option) any later version, or
      11       - the terms of the GNU General Public License as published by the
      12         Free Software Foundation; either version 2, or (at your option)
      13         any later version, or
      14       - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
      15  
      16     This file is distributed in the hope that it will be useful,
      17     but WITHOUT ANY WARRANTY; without even the implied warranty of
      18     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      19     Lesser General Public License and the GNU General Public License
      20     for more details.
      21  
      22     You should have received a copy of the GNU Lesser General Public
      23     License and of the GNU General Public License along with this
      24     program.  If not, see <https://www.gnu.org/licenses/>.  */
      25  
      26  #include <config.h>
      27  
      28  /* Specification.  */
      29  #include "unilbrk.h"
      30  
      31  #include <stdlib.h>
      32  #include <string.h>
      33  
      34  #include "c-ctype.h"
      35  #include "uniconv.h"
      36  #include "unilbrk/internal.h"
      37  #include "unilbrk/lbrktables.h"
      38  #include "unilbrk/ulc-common.h"
      39  
      40  /* Line breaking of a string in an arbitrary encoding.
      41  
      42     We convert the input string to Unicode.
      43  
      44     The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
      45     UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
      46     \U0000FFFF.  UTF-16 and variants support only characters up to
      47     \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
      48     UCS-4 specification leaves doubts about endianness and byte order mark.
      49     glibc currently interprets it as big endian without byte order mark,
      50     but this is not backed by an RFC.  So we use UTF-8. It supports
      51     characters up to \U7FFFFFFF and is unambiguously defined.  */
      52  
      53  static int
      54  ulc_width_linebreaks_internal (const char *s, size_t n,
      55                                 int width, int start_column, int at_end_columns,
      56                                 const char *o, const char *encoding, int cr,
      57                                 char *p)
      58  {
      59    if (n > 0)
      60      {
      61        if (is_utf8_encoding (encoding))
      62          return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
      63        else
      64          {
      65            /* Convert the string to UTF-8 and build a translation table
      66               from offsets into s to offsets into the translated string.  */
      67            size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
      68  
      69            if (offsets != NULL)
      70              {
      71                uint8_t *t;
      72                size_t m;
      73  
      74                t = u8_conv_from_encoding (encoding, iconveh_question_mark,
      75                                           s, n, offsets, NULL, &m);
      76                if (t != NULL)
      77                  {
      78                    char *memory =
      79                      (char *) (m > 0 ? malloc (m + (o != NULL ? m : 0)) : NULL);
      80  
      81                    if (m == 0 || memory != NULL)
      82                      {
      83                        char *q = (char *) memory;
      84                        char *o8 = (o != NULL ? (char *) (q + m) : NULL);
      85                        int res_column;
      86                        size_t i;
      87  
      88                        /* Translate the overrides to the UTF-8 string.  */
      89                        if (o != NULL)
      90                          {
      91                            memset (o8, UC_BREAK_UNDEFINED, m);
      92                            for (i = 0; i < n; i++)
      93                              if (offsets[i] != (size_t)(-1))
      94                                o8[offsets[i]] = o[i];
      95                          }
      96  
      97                        /* Determine the line breaks of the UTF-8 string.  */
      98                        res_column =
      99                          u8_width_linebreaks_internal (t, m, width, start_column, at_end_columns, o8, encoding, cr, q);
     100  
     101                        /* Translate the result back to the original string.  */
     102                        memset (p, UC_BREAK_PROHIBITED, n);
     103                        for (i = 0; i < n; i++)
     104                          if (offsets[i] != (size_t)(-1))
     105                            p[i] = q[offsets[i]];
     106  
     107                        free (memory);
     108                        free (t);
     109                        free (offsets);
     110                        return res_column;
     111                      }
     112                    free (t);
     113                  }
     114                free (offsets);
     115              }
     116            /* Impossible to convert.  */
     117  #if C_CTYPE_ASCII
     118            if (is_all_ascii (s, n))
     119              {
     120                /* ASCII is a subset of UTF-8.  */
     121                return u8_width_linebreaks_internal ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, cr, p);
     122              }
     123  #endif
     124            /* We have a non-ASCII string and cannot convert it.
     125               Don't produce line breaks except those already present in the
     126               input string.  All we assume here is that the encoding is
     127               minimally ASCII compatible.  */
     128            {
     129              const char *s_end = s + n;
     130              while (s < s_end)
     131                {
     132                  *p = ((o != NULL && *o == UC_BREAK_MANDATORY)
     133                        || *s == '\n'
     134                        ? UC_BREAK_MANDATORY
     135                        : ((o != NULL && *o == UC_BREAK_CR_BEFORE_LF)
     136                           || (cr >= 0
     137                               && *s == '\r'
     138                               && s + 1 < s_end
     139                               && *(s + 1) == '\n')
     140                           ? UC_BREAK_CR_BEFORE_LF
     141                           : UC_BREAK_PROHIBITED));
     142                  s++;
     143                  p++;
     144                  if (o != NULL)
     145                    o++;
     146                }
     147              /* We cannot compute widths in this case.  */
     148            }
     149          }
     150      }
     151    return start_column;
     152  }
     153  
     154  #if defined IN_LIBUNISTRING
     155  /* For backward compatibility with older versions of libunistring.  */
     156  
     157  # undef ulc_width_linebreaks
     158  
     159  int
     160  ulc_width_linebreaks (const char *s, size_t n,
     161                        int width, int start_column, int at_end_columns,
     162                        const char *o, const char *encoding,
     163                        char *p)
     164  {
     165    return ulc_width_linebreaks_internal (s, n,
     166                                          width, start_column, at_end_columns,
     167                                          o, encoding, -1, p);
     168  }
     169  
     170  #endif
     171  
     172  int
     173  ulc_width_linebreaks_v2 (const char *s, size_t n,
     174                           int width, int start_column, int at_end_columns,
     175                           const char *o, const char *encoding,
     176                           char *p)
     177  {
     178    return ulc_width_linebreaks_internal (s, n,
     179                                          width, start_column, at_end_columns,
     180                                          o, encoding, LBP_CR, p);
     181  }
     182  
     183  
     184  #ifdef TEST
     185  
     186  #include <stdio.h>
     187  #include <locale.h>
     188  
     189  /* Read the contents of an input stream, and return it, terminated with a NUL
     190     byte. */
     191  char *
     192  read_file (FILE *stream)
     193  {
     194  #define BUFSIZE 4096
     195    char *buf = NULL;
     196    int alloc = 0;
     197    int size = 0;
     198    int count;
     199  
     200    while (! feof (stream))
     201      {
     202        if (size + BUFSIZE > alloc)
     203          {
     204            alloc = alloc + alloc / 2;
     205            if (alloc < size + BUFSIZE)
     206              alloc = size + BUFSIZE;
     207            buf = realloc (buf, alloc);
     208            if (buf == NULL)
     209              {
     210                fprintf (stderr, "out of memory\n");
     211                exit (1);
     212              }
     213          }
     214        count = fread (buf + size, 1, BUFSIZE, stream);
     215        if (count == 0)
     216          {
     217            if (ferror (stream))
     218              {
     219                perror ("fread");
     220                exit (1);
     221              }
     222          }
     223        else
     224          size += count;
     225      }
     226    buf = realloc (buf, size + 1);
     227    if (buf == NULL)
     228      {
     229        fprintf (stderr, "out of memory\n");
     230        exit (1);
     231      }
     232    buf[size] = '\0';
     233    return buf;
     234  #undef BUFSIZE
     235  }
     236  
     237  int
     238  main (int argc, char * argv[])
     239  {
     240    setlocale (LC_CTYPE, "");
     241    if (argc == 2)
     242      {
     243        /* Insert line breaks for a given width.  */
     244        int width = atoi (argv[1]);
     245        char *input = read_file (stdin);
     246        int length = strlen (input);
     247        char *breaks = malloc (length);
     248        int i;
     249  
     250        ulc_width_linebreaks_v2 (input, length, width, 0, 0, NULL, locale_charset (), breaks);
     251  
     252        for (i = 0; i < length; i++)
     253          {
     254            switch (breaks[i])
     255              {
     256              case UC_BREAK_POSSIBLE:
     257                putc ('\n', stdout);
     258                break;
     259              case UC_BREAK_MANDATORY:
     260                break;
     261              case UC_BREAK_CR_BEFORE_LF:
     262                break;
     263              case UC_BREAK_PROHIBITED:
     264                break;
     265              default:
     266                abort ();
     267              }
     268            putc (input[i], stdout);
     269          }
     270  
     271        free (breaks);
     272  
     273        return 0;
     274      }
     275    else
     276      return 1;
     277  }
     278  
     279  #endif /* TEST */