(root)/
gettext-0.22.4/
gettext-tools/
src/
sentence.c
       1  /* Sentence handling.
       2     Copyright (C) 2015 Free Software Foundation, Inc.
       3     Written by Daiki Ueno <ueno@gnu.org>, 2015.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #ifdef HAVE_CONFIG_H
      19  # include <config.h>
      20  #endif
      21  
      22  /* Specification.  */
      23  #include "sentence.h"
      24  
      25  #include <stdlib.h>
      26  #include <string.h>
      27  #include "unistr.h"
      28  
      29  
/* The minimum number of consecutive spaces (U+0020 or U+00A0) that
   must follow an end marker, and any closing characters after it, for
   sentence_end to recognize the end of a sentence in the middle of a
   line.  sentence_end reports a match as soon as its running space
   count becomes equal to this value.  The default is 1.  */
int sentence_end_required_spaces = 1;
      33  
      34  /* This function works in a similar way to 'forward-sentence' in
      35     Emacs, which basically does a regular expression matching of:
      36  
      37       [.?!\u2026]
      38         []"'\u201d)}]*
      39           \($\|[ \u00a0]$\|\t\|[ \u00a0]\{REQUIRED_SPACES\}\)
      40  
      41     Since we are lacking a regular expression routine capable of
      42     Unicode (though gnulib-lib/lib/regex.c provides a locale-dependent
      43     version, we would rather avoid depending on it), apply a manually
      44     constructed DFA, which consists of 8 states where 4 of them are a
      45     terminal.  */
      46  const char *
      47  sentence_end (const char *string, ucs4_t *ending_charp)
      48  {
      49    const char *str = string;
      50    const char *str_limit = string + strlen (str);
      51    /* States of the DFA, 0 to 7, where 3, 5, 6, and 7 are a terminal.  */
      52    int state = 0;
      53    /* Previous character before an end marker.  */
      54    ucs4_t ending_char = 0xfffd;
      55    /* Possible starting position of the match, and the next starting
      56       position if the current match fails.  */
      57    const char *match_start = NULL, *match_next = NULL;
      58    /* Number of spaces.  */
      59    int spaces = 0;
      60  
      61    while (str <= str_limit)
      62      {
      63        ucs4_t uc;
      64        size_t length;
      65  
      66        length = u8_mbtouc (&uc, (const unsigned char *) str, str_limit - str);
      67  
      68        if (state == 0)
      69          {
      70            switch (uc)
      71              {
      72              case '.': case '?': case '!': case 0x2026:
      73                state = 1;
      74                match_start = str;
      75                match_next = str + length;
      76                ending_char = uc;
      77                spaces = 0;
      78                break;
      79  
      80              default:
      81                break;
      82              }
      83  
      84            str += length;
      85            continue;
      86          }
      87  
      88        if (state == 1)
      89          {
      90            switch (uc)
      91              {
      92              case ']': case '"': case '\'': case ')': case '}': case 0x201d:
      93                state = 2;
      94                break;
      95  
      96              case '\0': case '\n':
      97                /* State 3.  */
      98                *ending_charp = ending_char;
      99                return match_start;
     100  
     101              case ' ': case 0x00a0:
     102                if (++spaces == sentence_end_required_spaces)
     103                  {
     104                    /* State 7.  */
     105                    *ending_charp = ending_char;
     106                    return match_start;
     107                  }
     108                state = 4;
     109                break;
     110  
     111              case '\t':
     112                /* State 5.  */
     113                *ending_charp = ending_char;
     114                return match_start;
     115  
     116              default:
     117                str = match_next;
     118                state = 0;
     119                continue;
     120              }
     121  
     122            str += length;
     123            continue;
     124          }
     125  
     126        if (state == 2)
     127          {
     128            switch (uc)
     129              {
     130              case ']': case '"': case '\'': case ')': case '}': case 0x201d:
     131                break;
     132  
     133              case '\0': case '\n':
     134                /* State 3.  */
     135                *ending_charp = ending_char;
     136                return match_start;
     137  
     138              case ' ': case 0x00a0:
     139                if (++spaces == sentence_end_required_spaces)
     140                  {
     141                    /* State 7.  */
     142                    *ending_charp = ending_char;
     143                    return match_start;
     144                  }
     145                state = 4;
     146                break;
     147  
     148              case '\t':
     149                /* State 5.  */
     150                *ending_charp = ending_char;
     151                return match_start;
     152  
     153              default:
     154                state = 0;
     155                str = match_next;
     156                continue;
     157              }
     158  
     159            str += length;
     160            continue;
     161          }
     162  
     163        if (state == 4)
     164          {
     165            switch (uc)
     166              {
     167              case '\0': case '\n':
     168                /* State 6.  */
     169                *ending_charp = ending_char;
     170                return match_start;
     171  
     172              case ' ': case 0x00a0:
     173                if (++spaces == sentence_end_required_spaces)
     174                  {
     175                    /* State 7.  */
     176                    *ending_charp = ending_char;
     177                    return match_start;
     178                  }
     179                break;
     180  
     181              default:
     182                state = 0;
     183                str = match_next;
     184                continue;
     185              }
     186  
     187            str += length;
     188            continue;
     189          }
     190      }
     191  
     192    *ending_charp = 0xfffd;
     193    return str_limit;
     194  }