(root)/
gettext-0.22.4/
gettext-tools/
src/
xg-mixed-string.c
       1  /* Handling strings that are given partially in the source encoding and
       2     partially in Unicode.
       3     Copyright (C) 2001-2018 Free Software Foundation, Inc.
       4  
       5     This program is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3 of the License, or
       8     (at your option) any later version.
       9  
      10     This program is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     You should have received a copy of the GNU General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #ifdef HAVE_CONFIG_H
      19  # include <config.h>
      20  #endif
      21  
      22  /* Specification.  */
      23  #include "xg-mixed-string.h"
      24  
      25  #include <assert.h>
      26  #include <stdlib.h>
      27  #include <string.h>
      28  
      29  #include "error.h"
      30  #include "error-progname.h"
      31  #include "flexmember.h"
      32  #include "msgl-ascii.h"
      33  #include "po-charset.h"
      34  #include "unistr.h"
      35  #include "xalloc.h"
      36  
      37  #include "xg-pos.h"
      38  
      39  #include "gettext.h"
      40  #define _(str) gettext (str)
      41  
      42  
      43  /* Allocates a single segment.  */
      44  static inline struct mixed_string_segment *
      45  segment_alloc (enum segment_type type, const char *string, size_t length)
      46  {
      47    struct mixed_string_segment *segment =
      48      (struct mixed_string_segment *)
      49      xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents, length));
      50    segment->type = type;
      51    segment->length = length;
      52    memcpy (segment->contents, string, length);
      53    return segment;
      54  }
      55  
      56  /* Clones a single segment.  */
      57  static inline struct mixed_string_segment *
      58  segment_clone (const struct mixed_string_segment *segment)
      59  {
      60    return segment_alloc (segment->type, segment->contents, segment->length);
      61  }
      62  
      63  mixed_string_ty *
      64  mixed_string_alloc_simple (const char *string,
      65                             lexical_context_ty lcontext,
      66                             const char *logical_file_name,
      67                             int line_number)
      68  {
      69    struct mixed_string *ms = XMALLOC (struct mixed_string);
      70  
      71    if (*string == '\0')
      72      {
      73        /* An empty string.  */
      74        ms->segments = NULL;
      75        ms->nsegments = 0;
      76      }
      77    else
      78      {
      79        ms->segments = XNMALLOC (1, struct mixed_string_segment *);
      80        if ((xgettext_current_source_encoding == po_charset_ascii
      81             || xgettext_current_source_encoding == po_charset_utf8)
      82            && is_ascii_string (string))
      83          /* An optimization.  */
      84          ms->segments[0] =
      85            segment_alloc (utf8_encoded, string, strlen (string));
      86        else
      87          /* The general case.  */
      88          ms->segments[0] =
      89            segment_alloc (source_encoded, string, strlen (string));
      90        ms->nsegments = 1;
      91      }
      92    ms->lcontext = lcontext;
      93    ms->logical_file_name = logical_file_name;
      94    ms->line_number = line_number;
      95  
      96    return ms;
      97  }
      98  
      99  mixed_string_ty *
     100  mixed_string_alloc_utf8 (const char *string,
     101                           lexical_context_ty lcontext,
     102                           const char *logical_file_name,
     103                           int line_number)
     104  {
     105    struct mixed_string *ms = XMALLOC (struct mixed_string);
     106  
     107    if (*string == '\0')
     108      {
     109        /* An empty string.  */
     110        ms->segments = NULL;
     111        ms->nsegments = 0;
     112      }
     113    else
     114      {
     115        ms->segments = XNMALLOC (1, struct mixed_string_segment *);
     116        ms->segments[0] = segment_alloc (utf8_encoded, string, strlen (string));
     117        ms->nsegments = 1;
     118      }
     119    ms->lcontext = lcontext;
     120    ms->logical_file_name = logical_file_name;
     121    ms->line_number = line_number;
     122  
     123    return ms;
     124  }
     125  
     126  mixed_string_ty *
     127  mixed_string_clone (const mixed_string_ty *ms1)
     128  {
     129    struct mixed_string *ms = XMALLOC (struct mixed_string);
     130    size_t nsegments = ms1->nsegments;
     131  
     132    if (nsegments == 0)
     133      {
     134        ms->segments = NULL;
     135        ms->nsegments = 0;
     136      }
     137    else
     138      {
     139        size_t i;
     140  
     141        ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
     142        for (i = 0; i < nsegments; i++)
     143          ms->segments[i] = segment_clone (ms1->segments[i]);
     144        ms->nsegments = nsegments;
     145      }
     146    ms->lcontext = ms1->lcontext;
     147    ms->logical_file_name = ms1->logical_file_name;
     148    ms->line_number = ms1->line_number;
     149  
     150    return ms;
     151  }
     152  
     153  char *
     154  mixed_string_contents (const mixed_string_ty *ms)
     155  {
     156    size_t nsegments = ms->nsegments;
     157    /* Trivial cases.  */
     158    if (nsegments == 0)
     159      return xstrdup ("");
     160    if (nsegments == 1 && ms->segments[0]->type == utf8_encoded)
     161      {
     162        /* Return the segment, with a NUL at the end.  */
     163        size_t len = ms->segments[0]->length;
     164        char *string = XNMALLOC (len + 1, char);
     165        memcpy (string, ms->segments[0]->contents, len);
     166        string[len] = '\0';
     167        return string;
     168      }
     169    /* General case.  */
     170    {
     171      size_t i;
     172  
     173      for (i = 0; i < nsegments - 1; i++)
     174        if (memchr (ms->segments[i]->contents, '\0', ms->segments[i]->length)
     175            != NULL)
     176          {
     177            /* Segment i contains a NUL character.  Ignore the remaining
     178               segments.  */
     179            nsegments = i + 1;
     180            break;
     181          }
     182    }
     183    {
     184      char **converted_segments = XNMALLOC (nsegments, char *);
     185      size_t length;
     186  
     187      length = 0;
     188      {
     189        size_t i;
     190  
     191        for (i = 0; i < nsegments; i++)
     192          if (ms->segments[i]->type == source_encoded)
     193            {
     194              char *source_encoded_string;
     195              char *utf8_encoded_string;
     196  
     197              /* Copy the segment's contents, with a NUL at the end.  */
     198              {
     199                size_t len = ms->segments[i]->length;
     200                source_encoded_string = XNMALLOC (len + 1, char);
     201                memcpy (source_encoded_string, ms->segments[i]->contents, len);
     202                source_encoded_string[len] = '\0';
     203              }
     204              /* Convert it to UTF-8 encoding.  */
     205              utf8_encoded_string =
     206                from_current_source_encoding (source_encoded_string,
     207                                              ms->lcontext,
     208                                              ms->logical_file_name,
     209                                              ms->line_number);
     210              if (utf8_encoded_string != source_encoded_string)
     211                free (source_encoded_string);
     212              converted_segments[i] = utf8_encoded_string;
     213              length += strlen (utf8_encoded_string);
     214            }
     215          else
     216            length += ms->segments[i]->length;
     217      }
     218  
     219      {
     220        char *string = XNMALLOC (length + 1, char);
     221        {
     222          char *p;
     223          size_t i;
     224  
     225          p = string;
     226          for (i = 0; i < nsegments; i++)
     227            if (ms->segments[i]->type == source_encoded)
     228              {
     229                p = stpcpy (p, converted_segments[i]);
     230                free (converted_segments[i]);
     231              }
     232            else
     233              {
     234                memcpy (p, ms->segments[i]->contents, ms->segments[i]->length);
     235                p += ms->segments[i]->length;
     236              }
     237          assert (p == string + length);
     238          *p = '\0';
     239        }
     240  
     241        free (converted_segments);
     242        return string;
     243      }
     244    }
     245  }
     246  
     247  void
     248  mixed_string_free (mixed_string_ty *ms)
     249  {
     250    struct mixed_string_segment **segments = ms->segments;
     251    size_t nsegments = ms->nsegments;
     252    if (nsegments > 0)
     253      {
     254        size_t i;
     255        for (i = 0; i < nsegments; i++)
     256          free (segments[i]);
     257      }
     258    free (segments);
     259    free (ms);
     260  }
     261  
     262  char *
     263  mixed_string_contents_free1 (mixed_string_ty *ms)
     264  {
     265    char *contents = mixed_string_contents (ms);
     266    mixed_string_free (ms);
     267    return contents;
     268  }
     269  
     270  mixed_string_ty *
     271  mixed_string_concat (const mixed_string_ty *ms1,
     272                       const mixed_string_ty *ms2)
     273  {
     274    /* Trivial cases.  */
     275    if (ms2->nsegments == 0)
     276      return mixed_string_clone (ms1);
     277    if (ms1->nsegments == 0)
     278      return mixed_string_clone (ms2);
     279    /* General case.  */
     280    {
     281      struct mixed_string *ms = XMALLOC (struct mixed_string);
     282      size_t nsegments = ms1->nsegments + ms2->nsegments;
     283      size_t j;
     284      if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
     285        {
     286          /* Combine the last segment of ms1 with the first segment of ms2.  */
     287          size_t i;
     288  
     289          nsegments -= 1;
     290          ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
     291          j = 0;
     292          for (i = 0; i < ms1->nsegments - 1; i++)
     293            ms->segments[j++] = segment_clone (ms1->segments[i]);
     294          {
     295            size_t len1 = ms1->segments[i]->length;
     296            size_t len2 = ms2->segments[0]->length;
     297            struct mixed_string_segment *newseg =
     298              (struct mixed_string_segment *)
     299              xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
     300                                   len1 + len2));
     301            newseg->type = ms2->segments[0]->type;
     302            newseg->length = len1 + len2;
     303            memcpy (newseg->contents, ms1->segments[i]->contents, len1);
     304            memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
     305            ms->segments[j++] = newseg;
     306          }
     307          for (i = 1; i < ms2->nsegments; i++)
     308            ms->segments[j++] = segment_clone (ms2->segments[i]);
     309        }
     310      else
     311        {
     312          size_t i;
     313  
     314          ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
     315          j = 0;
     316          for (i = 0; i < ms1->nsegments; i++)
     317            ms->segments[j++] = segment_clone (ms1->segments[i]);
     318          for (i = 0; i < ms2->nsegments; i++)
     319            ms->segments[j++] = segment_clone (ms2->segments[i]);
     320        }
     321      assert (j == nsegments);
     322      ms->nsegments = nsegments;
     323      ms->lcontext = ms1->lcontext;
     324      ms->logical_file_name = ms1->logical_file_name;
     325      ms->line_number = ms1->line_number;
     326  
     327      return ms;
     328    }
     329  }
     330  
     331  mixed_string_ty *
     332  mixed_string_concat_free1 (mixed_string_ty *ms1, const mixed_string_ty *ms2)
     333  {
     334    /* Trivial cases.  */
     335    if (ms2->nsegments == 0)
     336      return ms1;
     337    if (ms1->nsegments == 0)
     338      {
     339        mixed_string_free (ms1);
     340        return mixed_string_clone (ms2);
     341      }
     342    /* General case.  */
     343    {
     344      struct mixed_string *ms = XMALLOC (struct mixed_string);
     345      size_t nsegments = ms1->nsegments + ms2->nsegments;
     346      size_t j;
     347      if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
     348        {
     349          /* Combine the last segment of ms1 with the first segment of ms2.  */
     350          size_t i;
     351  
     352          nsegments -= 1;
     353          ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
     354          j = 0;
     355          for (i = 0; i < ms1->nsegments - 1; i++)
     356            ms->segments[j++] = ms1->segments[i];
     357          {
     358            size_t len1 = ms1->segments[i]->length;
     359            size_t len2 = ms2->segments[0]->length;
     360            struct mixed_string_segment *newseg =
     361              (struct mixed_string_segment *)
     362              xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
     363                                   len1 + len2));
     364            newseg->type = ms2->segments[0]->type;
     365            newseg->length = len1 + len2;
     366            memcpy (newseg->contents, ms1->segments[i]->contents, len1);
     367            memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
     368            ms->segments[j++] = newseg;
     369          }
     370          free (ms1->segments[i]);
     371          for (i = 1; i < ms2->nsegments; i++)
     372            ms->segments[j++] = segment_clone (ms2->segments[i]);
     373        }
     374      else
     375        {
     376          size_t i;
     377  
     378          ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
     379          j = 0;
     380          for (i = 0; i < ms1->nsegments; i++)
     381            ms->segments[j++] = ms1->segments[i];
     382          for (i = 0; i < ms2->nsegments; i++)
     383            ms->segments[j++] = segment_clone (ms2->segments[i]);
     384        }
     385      assert (j == nsegments);
     386      free (ms1->segments);
     387      ms->nsegments = nsegments;
     388      ms->lcontext = ms1->lcontext;
     389      ms->logical_file_name = ms1->logical_file_name;
     390      ms->line_number = ms1->line_number;
     391      free (ms1);
     392  
     393      return ms;
     394    }
     395  }
     396  
     397  
     398  void
     399  mixed_string_buffer_init (struct mixed_string_buffer *bp,
     400                            lexical_context_ty lcontext,
     401                            const char *logical_file_name,
     402                            int line_number)
     403  {
     404    bp->segments = NULL;
     405    bp->nsegments = 0;
     406    bp->nsegments_allocated = 0;
     407    bp->curr_type = -1;
     408    bp->curr_buffer = NULL;
     409    bp->curr_buflen = 0;
     410    bp->curr_allocated = 0;
     411    bp->utf16_surr = 0;
     412    bp->lcontext = lcontext;
     413    bp->logical_file_name = logical_file_name;
     414    bp->line_number = line_number;
     415  }
     416  
     417  bool
     418  mixed_string_buffer_is_empty (const struct mixed_string_buffer *bp)
     419  {
     420    return (bp->nsegments == 0 && bp->curr_buflen == 0);
     421  }
     422  
     423  /* Auxiliary function: Ensure count more bytes are available in
     424     bp->curr_buffer.  */
     425  static inline void
     426  mixed_string_buffer_grow_curr_buffer (struct mixed_string_buffer *bp,
     427                                        size_t count)
     428  {
     429    if (bp->curr_buflen + count > bp->curr_allocated)
     430      {
     431        size_t new_allocated = 2 * bp->curr_allocated + 10;
     432        if (new_allocated < bp->curr_buflen + count)
     433          new_allocated = bp->curr_buflen + count;
     434        bp->curr_allocated = new_allocated;
     435        bp->curr_buffer = xrealloc (bp->curr_buffer, new_allocated);
     436      }
     437  }
     438  
     439  /* Auxiliary function: Append a byte to bp->curr.  */
     440  static inline void
     441  mixed_string_buffer_append_to_curr_buffer (struct mixed_string_buffer *bp,
     442                                             unsigned char c)
     443  {
     444    if (bp->curr_buflen == bp->curr_allocated)
     445      {
     446        bp->curr_allocated = 2 * bp->curr_allocated + 10;
     447        bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
     448      }
     449    bp->curr_buffer[bp->curr_buflen++] = c;
     450  }
     451  
     452  /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, append a
     453     Unicode character to bp->curr_buffer.  uc must be < 0x110000.  */
     454  static inline void
     455  mixed_string_buffer_append_to_utf8_buffer (struct mixed_string_buffer *bp,
     456                                             ucs4_t uc)
     457  {
     458    unsigned char utf8buf[6];
     459    int count = u8_uctomb (utf8buf, uc, 6);
     460  
     461    if (count < 0)
     462      /* The caller should have ensured that uc is not out-of-range.  */
     463      abort ();
     464  
     465    mixed_string_buffer_grow_curr_buffer (bp, count);
     466    memcpy (bp->curr_buffer + bp->curr_buflen, utf8buf, count);
     467    bp->curr_buflen += count;
     468  }
     469  
     470  /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, handle the
     471     attempt to append a lone surrogate to bp->curr_buffer.  */
     472  static void
     473  mixed_string_buffer_append_lone_surrogate (struct mixed_string_buffer *bp,
     474                                             ucs4_t uc)
     475  {
     476    /* A half surrogate is invalid, therefore use U+FFFD instead.
     477       It may be valid in a particular programming language.
     478       But a half surrogate is invalid in UTF-8:
     479         - RFC 3629 says
     480             "The definition of UTF-8 prohibits encoding character
     481              numbers between U+D800 and U+DFFF".
     482         - Unicode 4.0 chapter 3
     483           <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
     484           section 3.9, p.77, says
     485             "Because surrogate code points are not Unicode scalar
     486              values, any UTF-8 byte sequence that would otherwise
     487              map to code points D800..DFFF is ill-formed."
     488           and in table 3-6, p. 78, does not mention D800..DFFF.
     489         - The unicode.org FAQ question "How do I convert an unpaired
     490           UTF-16 surrogate to UTF-8?" has the answer
     491             "By representing such an unpaired surrogate on its own
     492              as a 3-byte sequence, the resulting UTF-8 data stream
     493              would become ill-formed."
     494       So use U+FFFD instead.  */
     495    error_with_progname = false;
     496    error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
     497           logical_file_name, line_number, uc);
     498    error_with_progname = true;
     499    mixed_string_buffer_append_to_utf8_buffer (bp, 0xfffd);
     500  }
     501  
     502  /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, flush
     503     bp->utf16_surr into bp->curr_buffer.  */
     504  static inline void
     505  mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
     506  {
     507    if (bp->utf16_surr != 0)
     508      {
     509        mixed_string_buffer_append_lone_surrogate (bp, bp->utf16_surr);
     510        bp->utf16_surr = 0;
     511      }
     512  }
     513  
     514  /* Auxiliary function: Append a segment to bp->segments.  */
     515  static inline void
     516  mixed_string_buffer_add_segment (struct mixed_string_buffer *bp,
     517                                   struct mixed_string_segment *newseg)
     518  {
     519    if (bp->nsegments == bp->nsegments_allocated)
     520      {
     521        size_t new_allocated =
     522          bp->nsegments_allocated = 2 * bp->nsegments_allocated + 1;
     523        bp->segments =
     524          (struct mixed_string_segment **)
     525          xrealloc (bp->segments,
     526                    new_allocated * sizeof (struct mixed_string_segment *));
     527      }
     528    bp->segments[bp->nsegments++] = newseg;
     529  }
     530  
     531  /* Auxiliary function: Flush bp->curr_buffer and bp->utf16_surr into
     532     bp->segments.  */
     533  static void
     534  mixed_string_buffer_flush_curr (struct mixed_string_buffer *bp)
     535  {
     536    if (bp->curr_type == utf8_encoded)
     537      mixed_string_buffer_flush_utf16_surr (bp);
     538    if (bp->curr_type != -1)
     539      {
     540        if (bp->curr_buflen > 0)
     541          {
     542            struct mixed_string_segment *segment =
     543              segment_alloc (bp->curr_type, bp->curr_buffer, bp->curr_buflen);
     544            mixed_string_buffer_add_segment (bp, segment);
     545          }
     546        bp->curr_buflen = 0;
     547      }
     548  }
     549  
     550  void
     551  mixed_string_buffer_append_char (struct mixed_string_buffer *bp, int c)
     552  {
     553    /* Switch to multibyte character mode.  */
     554    if (bp->curr_type != source_encoded)
     555      {
     556        mixed_string_buffer_flush_curr (bp);
     557        bp->curr_type = source_encoded;
     558      }
     559  
     560      mixed_string_buffer_append_to_curr_buffer (bp, (unsigned char) c);
     561  }
     562  
     563  void
     564  mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, int c)
     565  {
     566    /* Switch to Unicode character mode.  */
     567    if (bp->curr_type != utf8_encoded)
     568      {
     569        mixed_string_buffer_flush_curr (bp);
     570        bp->curr_type = utf8_encoded;
     571        assert (bp->utf16_surr == 0);
     572      }
     573  
     574    /* Test whether this character and the previous one form a Unicode
     575       surrogate character pair.  */
     576    if (bp->utf16_surr != 0 && (c >= 0xdc00 && c < 0xe000))
     577      {
     578        unsigned short utf16buf[2];
     579        ucs4_t uc;
     580  
     581        utf16buf[0] = bp->utf16_surr;
     582        utf16buf[1] = c;
     583        if (u16_mbtouc (&uc, utf16buf, 2) != 2)
     584          abort ();
     585  
     586        mixed_string_buffer_append_to_utf8_buffer (bp, uc);
     587        bp->utf16_surr = 0;
     588      }
     589    else
     590      {
     591        mixed_string_buffer_flush_utf16_surr (bp);
     592  
     593        if (c >= 0xd800 && c < 0xdc00)
     594          bp->utf16_surr = c;
     595        else if (c >= 0xdc00 && c < 0xe000)
     596          mixed_string_buffer_append_lone_surrogate (bp, c);
     597        else
     598          mixed_string_buffer_append_to_utf8_buffer (bp, c);
     599      }
     600  }
     601  
     602  void
     603  mixed_string_buffer_destroy (struct mixed_string_buffer *bp)
     604  {
     605    struct mixed_string_segment **segments = bp->segments;
     606    size_t nsegments = bp->nsegments;
     607    if (nsegments > 0)
     608      {
     609        size_t i;
     610        for (i = 0; i < nsegments; i++)
     611          free (segments[i]);
     612      }
     613    free (segments);
     614    free (bp->curr_buffer);
     615  }
     616  
     617  mixed_string_ty *
     618  mixed_string_buffer_result (struct mixed_string_buffer *bp)
     619  {
     620    mixed_string_buffer_flush_curr (bp);
     621  
     622    {
     623      struct mixed_string *ms = XMALLOC (struct mixed_string);
     624      size_t nsegments = bp->nsegments;
     625  
     626      if (nsegments > 0)
     627        ms->segments =
     628          (struct mixed_string_segment **)
     629          xrealloc (bp->segments,
     630                    nsegments * sizeof (struct mixed_string_segment *));
     631      else
     632        {
     633          assert (bp->segments == NULL);
     634          ms->segments = NULL;
     635        }
     636      ms->nsegments = nsegments;
     637      ms->lcontext = bp->lcontext;
     638      ms->logical_file_name = bp->logical_file_name;
     639      ms->line_number = bp->line_number;
     640  
     641      free (bp->curr_buffer);
     642  
     643      return ms;
     644    }
     645  }