1  /* Line breaking of UTF-8 strings.
       2     Copyright (C) 2001-2003, 2006-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2001.
       4  
       5     This file is free software.
       6     It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
       7     You can redistribute it and/or modify it under either
       8       - the terms of the GNU Lesser General Public License as published
       9         by the Free Software Foundation, either version 3, or (at your
      10         option) any later version, or
      11       - the terms of the GNU General Public License as published by the
      12         Free Software Foundation; either version 2, or (at your option)
      13         any later version, or
      14       - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
      15  
      16     This file is distributed in the hope that it will be useful,
      17     but WITHOUT ANY WARRANTY; without even the implied warranty of
      18     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      19     Lesser General Public License and the GNU General Public License
      20     for more details.
      21  
      22     You should have received a copy of the GNU Lesser General Public
      23     License and of the GNU General Public License along with this
      24     program.  If not, see <https://www.gnu.org/licenses/>.  */
      25  
      26  #include <config.h>
      27  
      28  /* Specification.  */
      29  #include "unilbrk.h"
      30  #include "unilbrk/internal.h"
      31  
      32  #include <stdlib.h>
      33  #include <string.h>
      34  
      35  #include "unilbrk/lbrktables.h"
      36  #include "uniwidth/cjk.h"
      37  #include "unistr.h"
      38  
      39  /* This file implements
      40     Unicode Standard Annex #14 <https://www.unicode.org/reports/tr14/>.  */
      41  
      42  void
      43  u8_possible_linebreaks_loop (const uint8_t *s, size_t n, const char *encoding,
      44                               int cr, char *p)
      45  {
      46    if (n > 0)
      47      {
      48        int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
      49        const uint8_t *s_end = s + n;
      50        int prev_prop = LBP_BK; /* line break property of last character */
      51        int last_prop = LBP_BK; /* line break property of last non-space character */
      52        char *seen_space = NULL; /* Was a space seen after the last non-space character? */
      53  
      54        /* Don't break inside multibyte characters.  */
      55        memset (p, UC_BREAK_PROHIBITED, n);
      56  
      57        /* Number of consecutive regional indicator (RI) characters seen
      58           immediately before the current point.  */
      59        size_t ri_count = 0;
      60  
      61        do
      62          {
      63            ucs4_t uc;
      64            int count = u8_mbtouc_unsafe (&uc, s, s_end - s);
      65            int prop = unilbrkprop_lookup (uc);
      66  
      67            if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
      68              {
      69                /* (LB4,LB5,LB6) Mandatory break.  */
      70                *p = UC_BREAK_MANDATORY;
      71                /* cr is either LBP_CR or -1.  In the first case, recognize
      72                   a CR-LF sequence.  */
      73                if (prev_prop == cr && prop == LBP_LF)
      74                  p[-1] = UC_BREAK_CR_BEFORE_LF;
      75                prev_prop = prop;
      76                last_prop = LBP_BK;
      77                seen_space = NULL;
      78              }
      79            else
      80              {
      81                /* Resolve property values whose behaviour is not fixed.  */
      82                switch (prop)
      83                  {
      84                  case LBP_AI:
      85                    /* Resolve ambiguous.  */
      86                    prop = LBP_AI_REPLACEMENT;
      87                    break;
      88                  case LBP_CB:
      89                    /* This is arbitrary.  */
      90                    prop = LBP_ID1;
      91                    break;
      92                  case LBP_SA:
      93                    /* We don't handle complex scripts yet.
      94                       Treat LBP_SA like LBP_XX.  */
      95                  case LBP_XX:
      96                    /* This is arbitrary.  */
      97                    prop = LBP_AL;
      98                    break;
      99                  }
     100  
     101                /* Deal with spaces and combining characters.  */
     102                if (prop == LBP_SP)
     103                  {
     104                    /* (LB7) Don't break just before a space.  */
     105                    *p = UC_BREAK_PROHIBITED;
     106                    seen_space = p;
     107                  }
     108                else if (prop == LBP_ZW)
     109                  {
     110                    /* (LB7) Don't break just before a zero-width space.  */
     111                    *p = UC_BREAK_PROHIBITED;
     112                    last_prop = LBP_ZW;
     113                    seen_space = NULL;
     114                  }
     115                else if (prop == LBP_CM || prop == LBP_ZWJ)
     116                  {
     117                    /* (LB9) Don't break just before a combining character or
     118                       zero-width joiner, except immediately after a mandatory
     119                       break character, space, or zero-width space.  */
     120                    if (last_prop == LBP_BK)
     121                      {
     122                        /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
     123                        *p = UC_BREAK_PROHIBITED;
     124                        /* (LB10) Treat CM or ZWJ as AL.  */
     125                        last_prop = LBP_AL;
     126                        seen_space = NULL;
     127                      }
     128                    else if (last_prop == LBP_ZW || seen_space != NULL)
     129                      {
     130                        /* (LB8) Break after zero-width space.  */
     131                        /* (LB18) Break after spaces.
     132                           We do *not* implement the "legacy support for space
     133                           character as base for combining marks" because now the
     134                           NBSP CM sequence is recommended instead of SP CM.  */
     135                        *p = UC_BREAK_POSSIBLE;
     136                        /* (LB10) Treat CM or ZWJ as AL.  */
     137                        last_prop = LBP_AL;
     138                        seen_space = NULL;
     139                      }
     140                    else
     141                      {
     142                        /* Treat X CM as if it were X.  */
     143                        *p = UC_BREAK_PROHIBITED;
     144                      }
     145                  }
     146                else
     147                  {
     148                    /* prop must be usable as an index for table 7.3 of UTR #14.  */
     149                    if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0])))
     150                      abort ();
     151  
     152                    if (last_prop == LBP_BK)
     153                      {
     154                        /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */
     155                        *p = UC_BREAK_PROHIBITED;
     156                      }
     157                    else if (last_prop == LBP_ZW)
     158                      {
     159                        /* (LB8) Break after zero-width space.  */
     160                        *p = UC_BREAK_POSSIBLE;
     161                      }
     162                    else if (prev_prop == LBP_ZWJ)
     163                      {
     164                        /* (LB8a) Don't break right after a zero-width joiner.  */
     165                        *p = UC_BREAK_PROHIBITED;
     166                      }
     167                    else if (last_prop == LBP_RI && prop == LBP_RI)
     168                      {
     169                        /* (LB30a) Break between two regional indicator symbols
     170                           if and only if there are an even number of regional
     171                           indicators preceding the position of the break.  */
     172                        *p = (seen_space != NULL || (ri_count % 2) == 0
     173                              ? UC_BREAK_POSSIBLE
     174                              : UC_BREAK_PROHIBITED);
     175                      }
     176                    else if (prev_prop == LBP_HL_BA)
     177                      {
     178                        /* (LB21a) Don't break after Hebrew + Hyphen/Break-After.  */
     179                        *p = UC_BREAK_PROHIBITED;
     180                      }
     181                    else
     182                      {
     183                        switch (unilbrk_table [last_prop] [prop])
     184                          {
     185                          case D:
     186                            *p = UC_BREAK_POSSIBLE;
     187                            break;
     188                          case I:
     189                            *p = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
     190                            break;
     191                          case P:
     192                            *p = UC_BREAK_PROHIBITED;
     193                            break;
     194                          default:
     195                            abort ();
     196                          }
     197                      }
     198                    last_prop = prop;
     199                    seen_space = NULL;
     200                  }
     201  
     202                prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
     203                             ? LBP_HL_BA
     204                             : prop);
     205              }
     206  
     207            if (prop == LBP_RI)
     208              ri_count++;
     209            else
     210              ri_count = 0;
     211  
     212            s += count;
     213            p += count;
     214          }
     215        while (s < s_end);
     216      }
     217  }
     218  
     219  #if defined IN_LIBUNISTRING
     220  /* For backward compatibility with older versions of libunistring.  */
     221  
     222  # undef u8_possible_linebreaks
     223  
     224  void
     225  u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding,
     226                          char *p)
     227  {
     228    u8_possible_linebreaks_loop (s, n, encoding, -1, p);
     229  }
     230  
     231  #endif
     232  
     233  void
     234  u8_possible_linebreaks_v2 (const uint8_t *s, size_t n, const char *encoding,
     235                             char *p)
     236  {
     237    u8_possible_linebreaks_loop (s, n, encoding, LBP_CR, p);
     238  }
     239  
     240  
     241  #ifdef TEST
     242  
     243  #include <stdio.h>
     244  #include <string.h>
     245  
     246  /* Read the contents of an input stream, and return it, terminated with a NUL
     247     byte. */
     248  char *
     249  read_file (FILE *stream)
     250  {
     251  #define BUFSIZE 4096
     252    char *buf = NULL;
     253    int alloc = 0;
     254    int size = 0;
     255    int count;
     256  
     257    while (! feof (stream))
     258      {
     259        if (size + BUFSIZE > alloc)
     260          {
     261            alloc = alloc + alloc / 2;
     262            if (alloc < size + BUFSIZE)
     263              alloc = size + BUFSIZE;
     264            buf = realloc (buf, alloc);
     265            if (buf == NULL)
     266              {
     267                fprintf (stderr, "out of memory\n");
     268                exit (1);
     269              }
     270          }
     271        count = fread (buf + size, 1, BUFSIZE, stream);
     272        if (count == 0)
     273          {
     274            if (ferror (stream))
     275              {
     276                perror ("fread");
     277                exit (1);
     278              }
     279          }
     280        else
     281          size += count;
     282      }
     283    buf = realloc (buf, size + 1);
     284    if (buf == NULL)
     285      {
     286        fprintf (stderr, "out of memory\n");
     287        exit (1);
     288      }
     289    buf[size] = '\0';
     290    return buf;
     291  #undef BUFSIZE
     292  }
     293  
     294  int
     295  main (int argc, char * argv[])
     296  {
     297    if (argc == 1)
     298      {
     299        /* Display all the break opportunities in the input string.  */
     300        char *input = read_file (stdin);
     301        int length = strlen (input);
     302        char *breaks = malloc (length);
     303        int i;
     304  
     305        u8_possible_linebreaks_v2 ((uint8_t *) input, length, "UTF-8", breaks);
     306  
     307        for (i = 0; i < length; i++)
     308          {
     309            switch (breaks[i])
     310              {
     311              case UC_BREAK_POSSIBLE:
     312                /* U+2027 in UTF-8 encoding */
     313                putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
     314                break;
     315              case UC_BREAK_MANDATORY:
     316                /* U+21B2 (or U+21B5) in UTF-8 encoding */
     317                putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
     318                break;
     319              case UC_BREAK_CR_BEFORE_LF:
     320                /* U+21E4 in UTF-8 encoding */
     321                putc (0xe2, stdout); putc (0x87, stdout); putc (0xa4, stdout);
     322                break;
     323              case UC_BREAK_PROHIBITED:
     324                break;
     325              default:
     326                abort ();
     327              }
     328            putc (input[i], stdout);
     329          }
     330  
     331        free (breaks);
     332  
     333        return 0;
     334      }
     335    else
     336      return 1;
     337  }
     338  
     339  #endif /* TEST */