1  /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
       2  
       3  /* libcroco - Library for parsing and applying CSS
       4   * Copyright (C) 2006-2019 Free Software Foundation, Inc.
       5   *
       6   * This file is not part of the GNU gettext program, but is used with
       7   * GNU gettext.
       8   *
       9   * The original copyright notice is as follows:
      10   */
      11  
      12  /*
      13   * This file is part of The Croco Library
      14   *
      15   * Copyright (C) 2003-2004 Dodji Seketeli.  All Rights Reserved.
      16   *
      17   * This program is free software; you can redistribute it and/or
      18   * modify it under the terms of version 2.1 of the GNU Lesser General Public
      19   * License as published by the Free Software Foundation.
      20   *
      21   * This program is distributed in the hope that it will be useful,
      22   * but WITHOUT ANY WARRANTY; without even the implied warranty of
      23   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      24   * GNU General Public License for more details.
      25   *
      26   * You should have received a copy of the GNU Lesser General Public License
      27   * along with this program; if not, write to the Free Software
      28   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
      29   * USA
      30   *
      31   * Author: Dodji Seketeli
      32   */
      33  
      34  #include <config.h>
      35  #include "cr-utils.h"
      36  #include "cr-string.h"
      37  
/**
 *@file:
 *Some misc utility functions used
 *in the libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */
      47  
      48  /****************************
      49   *Encoding transformations and
      50   *encoding helpers
      51   ****************************/
      52  
/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */
      66  
      67  /**
      68   *Given an utf8 string buffer, calculates
      69   *the length of this string if it was encoded
      70   *in ucs4.
      71   *@param a_in_start a pointer to the begining of
      72   *the input utf8 string.
      73   *@param a_in_end a pointre to the end of the input
      74   *utf8 string (points to the last byte of the buffer)
      75   *@param a_len out parameter the calculated length.
      76   *@return CR_OK upon succesfull completion, an error code
      77   *otherwise.
      78   */
      79  enum CRStatus
      80  cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
      81                                 const guchar * a_in_end, gulong * a_len)
      82  {
      83          guchar *byte_ptr = NULL;
      84          gint len = 0;
      85  
      86          /*
      87           *to store the final decoded 
      88           *unicode char
      89           */
      90          guint c = 0;
      91  
      92          g_return_val_if_fail (a_in_start && a_in_end && a_len,
      93                                CR_BAD_PARAM_ERROR);
      94          *a_len = 0;
      95  
      96          for (byte_ptr = (guchar *) a_in_start;
      97               byte_ptr <= a_in_end; byte_ptr++) {
      98                  gint nb_bytes_2_decode = 0;
      99  
     100                  if (*byte_ptr <= 0x7F) {
     101                          /*
     102                           *7 bits long char
     103                           *encoded over 1 byte:
     104                           * 0xxx xxxx
     105                           */
     106                          c = *byte_ptr;
     107                          nb_bytes_2_decode = 1;
     108  
     109                  } else if ((*byte_ptr & 0xE0) == 0xC0) {
     110                          /*
     111                           *up to 11 bits long char.
     112                           *encoded over 2 bytes:
     113                           *110x xxxx  10xx xxxx
     114                           */
     115                          c = *byte_ptr & 0x1F;
     116                          nb_bytes_2_decode = 2;
     117  
     118                  } else if ((*byte_ptr & 0xF0) == 0xE0) {
     119                          /*
     120                           *up to 16 bit long char
     121                           *encoded over 3 bytes:
     122                           *1110 xxxx  10xx xxxx  10xx xxxx
     123                           */
     124                          c = *byte_ptr & 0x0F;
     125                          nb_bytes_2_decode = 3;
     126  
     127                  } else if ((*byte_ptr & 0xF8) == 0xF0) {
     128                          /*
     129                           *up to 21 bits long char
     130                           *encoded over 4 bytes:
     131                           *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
     132                           */
     133                          c = *byte_ptr & 0x7;
     134                          nb_bytes_2_decode = 4;
     135  
     136                  } else if ((*byte_ptr & 0xFC) == 0xF8) {
     137                          /*
     138                           *up to 26 bits long char
     139                           *encoded over 5 bytes.
     140                           *1111 10xx  10xx xxxx  10xx xxxx  
     141                           *10xx xxxx  10xx xxxx
     142                           */
     143                          c = *byte_ptr & 3;
     144                          nb_bytes_2_decode = 5;
     145  
     146                  } else if ((*byte_ptr & 0xFE) == 0xFC) {
     147                          /*
     148                           *up to 31 bits long char
     149                           *encoded over 6 bytes:
     150                           *1111 110x  10xx xxxx  10xx xxxx  
     151                           *10xx xxxx  10xx xxxx  10xx xxxx
     152                           */
     153                          c = *byte_ptr & 1;
     154                          nb_bytes_2_decode = 6;
     155  
     156                  } else {
     157                          /*
     158                           *BAD ENCODING
     159                           */
     160                          return CR_ENCODING_ERROR;
     161                  }
     162  
     163                  /*
     164                   *Go and decode the remaining byte(s)
     165                   *(if any) to get the current character.
     166                   */
     167                  for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
     168                          /*decode the next byte */
     169                          byte_ptr++;
     170  
     171                          /*byte pattern must be: 10xx xxxx */
     172                          if ((*byte_ptr & 0xC0) != 0x80) {
     173                                  return CR_ENCODING_ERROR;
     174                          }
     175  
     176                          c = (c << 6) | (*byte_ptr & 0x3F);
     177                  }
     178  
     179                  len++;
     180          }
     181  
     182          *a_len = len;
     183  
     184          return CR_OK;
     185  }
     186  
     187  /**
     188   *Given an ucs4 string, this function
     189   *returns the size (in bytes) this string
     190   *would have occupied if it was encoded in utf-8.
     191   *@param a_in_start a pointer to the beginning of the input
     192   *buffer.
     193   *@param a_in_end a pointer to the end of the input buffer.
     194   *@param a_len out parameter. The computed length.
     195   *@return CR_OK upon successfull completion, an error code otherwise.
     196   */
     197  enum CRStatus
     198  cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
     199                                 const guint32 * a_in_end, gulong * a_len)
     200  {
     201          gint len = 0;
     202          guint32 *char_ptr = NULL;
     203  
     204          g_return_val_if_fail (a_in_start && a_in_end && a_len,
     205                                CR_BAD_PARAM_ERROR);
     206  
     207          for (char_ptr = (guint32 *) a_in_start;
     208               char_ptr <= a_in_end; char_ptr++) {
     209                  if (*char_ptr <= 0x7F) {
     210                          /*the utf-8 char would take 1 byte */
     211                          len += 1;
     212                  } else if (*char_ptr <= 0x7FF) {
     213                          /*the utf-8 char would take 2 bytes */
     214                          len += 2;
     215                  } else if (*char_ptr <= 0xFFFF) {
     216                          len += 3;
     217                  } else if (*char_ptr <= 0x1FFFFF) {
     218                          len += 4;
     219                  } else if (*char_ptr <= 0x3FFFFFF) {
     220                          len += 5;
     221                  } else if (*char_ptr <= 0x7FFFFFFF) {
     222                          len += 6;
     223                  }
     224          }
     225  
     226          *a_len = len;
     227          return CR_OK;
     228  }
     229  
     230  /**
     231   *Given an ucsA string, this function
     232   *returns the size (in bytes) this string
     233   *would have occupied if it was encoded in utf-8.
     234   *@param a_in_start a pointer to the beginning of the input
     235   *buffer.
     236   *@param a_in_end a pointer to the end of the input buffer.
     237   *@param a_len out parameter. The computed length.
     238   *@return CR_OK upon successfull completion, an error code otherwise.
     239   */
     240  enum CRStatus
     241  cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
     242                                 const guchar * a_in_end, gulong * a_len)
     243  {
     244          gint len = 0;
     245          guchar *char_ptr = NULL;
     246  
     247          g_return_val_if_fail (a_in_start && a_in_end && a_len,
     248                                CR_BAD_PARAM_ERROR);
     249  
     250          for (char_ptr = (guchar *) a_in_start;
     251               char_ptr <= a_in_end; char_ptr++) {
     252                  if (*char_ptr <= 0x7F) {
     253                          /*the utf-8 char would take 1 byte */
     254                          len += 1;
     255                  } else {
     256                          /*the utf-8 char would take 2 bytes */
     257                          len += 2;
     258                  }
     259          }
     260  
     261          *a_len = len;
     262          return CR_OK;
     263  }
     264  
     265  /**
     266   *Converts an utf8 buffer into an ucs4 buffer.
     267   *
     268   *@param a_in the input utf8 buffer to convert.
     269   *@param a_in_len in/out parameter. The size of the
     270   *input buffer to convert. After return, this parameter contains
     271   *the actual number of bytes consumed.
     272   *@param a_out the output converted ucs4 buffer. Must be allocated by
     273   *the caller.
     274   *@param a_out_len in/out parameter. The size of the output buffer.
     275   *If this size is actually smaller than the real needed size, the function
     276   *just converts what it can and returns a success status. After return,
     277   *this param points to the actual number of characters decoded.
     278   *@return CR_OK upon successfull completion, an error code otherwise.
     279   */
     280  enum CRStatus
     281  cr_utils_utf8_to_ucs4 (const guchar * a_in,
     282                         gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
     283  {
     284          gulong in_len = 0,
     285                  out_len = 0,
     286                  in_index = 0,
     287                  out_index = 0;
     288          enum CRStatus status = CR_OK;
     289  
     290          /*
     291           *to store the final decoded 
     292           *unicode char
     293           */
     294          guint c = 0;
     295  
     296          g_return_val_if_fail (a_in && a_in_len
     297                                && a_out && a_out_len, CR_BAD_PARAM_ERROR);
     298  
     299          if (*a_in_len < 1) {
     300                  status = CR_OK;
     301                  goto end;
     302          }
     303  
     304          in_len = *a_in_len;
     305          out_len = *a_out_len;
     306  
     307          for (in_index = 0, out_index = 0;
     308               (in_index < in_len) && (out_index < out_len);
     309               in_index++, out_index++) {
     310                  gint nb_bytes_2_decode = 0;
     311  
     312                  if (a_in[in_index] <= 0x7F) {
     313                          /*
     314                           *7 bits long char
     315                           *encoded over 1 byte:
     316                           * 0xxx xxxx
     317                           */
     318                          c = a_in[in_index];
     319                          nb_bytes_2_decode = 1;
     320  
     321                  } else if ((a_in[in_index] & 0xE0) == 0xC0) {
     322                          /*
     323                           *up to 11 bits long char.
     324                           *encoded over 2 bytes:
     325                           *110x xxxx  10xx xxxx
     326                           */
     327                          c = a_in[in_index] & 0x1F;
     328                          nb_bytes_2_decode = 2;
     329  
     330                  } else if ((a_in[in_index] & 0xF0) == 0xE0) {
     331                          /*
     332                           *up to 16 bit long char
     333                           *encoded over 3 bytes:
     334                           *1110 xxxx  10xx xxxx  10xx xxxx
     335                           */
     336                          c = a_in[in_index] & 0x0F;
     337                          nb_bytes_2_decode = 3;
     338  
     339                  } else if ((a_in[in_index] & 0xF8) == 0xF0) {
     340                          /*
     341                           *up to 21 bits long char
     342                           *encoded over 4 bytes:
     343                           *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
     344                           */
     345                          c = a_in[in_index] & 0x7;
     346                          nb_bytes_2_decode = 4;
     347  
     348                  } else if ((a_in[in_index] & 0xFC) == 0xF8) {
     349                          /*
     350                           *up to 26 bits long char
     351                           *encoded over 5 bytes.
     352                           *1111 10xx  10xx xxxx  10xx xxxx  
     353                           *10xx xxxx  10xx xxxx
     354                           */
     355                          c = a_in[in_index] & 3;
     356                          nb_bytes_2_decode = 5;
     357  
     358                  } else if ((a_in[in_index] & 0xFE) == 0xFC) {
     359                          /*
     360                           *up to 31 bits long char
     361                           *encoded over 6 bytes:
     362                           *1111 110x  10xx xxxx  10xx xxxx  
     363                           *10xx xxxx  10xx xxxx  10xx xxxx
     364                           */
     365                          c = a_in[in_index] & 1;
     366                          nb_bytes_2_decode = 6;
     367  
     368                  } else {
     369                          /*BAD ENCODING */
     370                          goto end;
     371                  }
     372  
     373                  /*
     374                   *Go and decode the remaining byte(s)
     375                   *(if any) to get the current character.
     376                   */
     377                  for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
     378                          /*decode the next byte */
     379                          in_index++;
     380  
     381                          /*byte pattern must be: 10xx xxxx */
     382                          if ((a_in[in_index] & 0xC0) != 0x80) {
     383                                  goto end;
     384                          }
     385  
     386                          c = (c << 6) | (a_in[in_index] & 0x3F);
     387                  }
     388  
     389                  /*
     390                   *The decoded ucs4 char is now
     391                   *in c.
     392                   */
     393  
     394                  /************************
     395                   *Some security tests
     396                   ***********************/
     397  
     398                  /*be sure c is a char */
     399                  if (c == 0xFFFF || c == 0xFFFE)
     400                          goto end;
     401  
     402                  /*be sure c is inferior to the max ucs4 char value */
     403                  if (c > 0x10FFFF)
     404                          goto end;
     405  
     406                  /*
     407                   *c must be less than UTF16 "lower surrogate begin"
     408                   *or higher than UTF16 "High surrogate end"
     409                   */
     410                  if (c >= 0xD800 && c <= 0xDFFF)
     411                          goto end;
     412  
     413                  /*Avoid characters that equals zero */
     414                  if (c == 0)
     415                          goto end;
     416  
     417                  a_out[out_index] = c;
     418          }
     419  
     420        end:
     421          *a_out_len = out_index + 1;
     422          *a_in_len = in_index + 1;
     423  
     424          return status;
     425  }
     426  
     427  /**
     428   *Reads a character from an utf8 buffer.
     429   *Actually decode the next character code (unicode character code)
     430   *and returns it.
     431   *@param a_in the starting address of the utf8 buffer.
     432   *@param a_in_len the length of the utf8 buffer.
     433   *@param a_out output parameter. The resulting read char.
     434   *@param a_consumed the number of the bytes consumed to
     435   *decode the returned character code.
     436   *@return CR_OK upon successfull completion, an error code otherwise.
     437   */
     438  enum CRStatus
     439  cr_utils_read_char_from_utf8_buf (const guchar * a_in,
     440                                    gulong a_in_len,
     441                                    guint32 * a_out, gulong * a_consumed)
     442  {
     443          gulong in_index = 0,
     444                 nb_bytes_2_decode = 0;
     445          enum CRStatus status = CR_OK;
     446  
     447          /*
     448           *to store the final decoded 
     449           *unicode char
     450           */
     451          guint32 c = 0;
     452  
     453          g_return_val_if_fail (a_in && a_out && a_out
     454                                && a_consumed, CR_BAD_PARAM_ERROR);
     455  
     456          if (a_in_len < 1) {
     457                  status = CR_OK;
     458                  goto end;
     459          }
     460  
     461          if (*a_in <= 0x7F) {
     462                  /*
     463                   *7 bits long char
     464                   *encoded over 1 byte:
     465                   * 0xxx xxxx
     466                   */
     467                  c = *a_in;
     468                  nb_bytes_2_decode = 1;
     469  
     470          } else if ((*a_in & 0xE0) == 0xC0) {
     471                  /*
     472                   *up to 11 bits long char.
     473                   *encoded over 2 bytes:
     474                   *110x xxxx  10xx xxxx
     475                   */
     476                  c = *a_in & 0x1F;
     477                  nb_bytes_2_decode = 2;
     478  
     479          } else if ((*a_in & 0xF0) == 0xE0) {
     480                  /*
     481                   *up to 16 bit long char
     482                   *encoded over 3 bytes:
     483                   *1110 xxxx  10xx xxxx  10xx xxxx
     484                   */
     485                  c = *a_in & 0x0F;
     486                  nb_bytes_2_decode = 3;
     487  
     488          } else if ((*a_in & 0xF8) == 0xF0) {
     489                  /*
     490                   *up to 21 bits long char
     491                   *encoded over 4 bytes:
     492                   *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
     493                   */
     494                  c = *a_in & 0x7;
     495                  nb_bytes_2_decode = 4;
     496  
     497          } else if ((*a_in & 0xFC) == 0xF8) {
     498                  /*
     499                   *up to 26 bits long char
     500                   *encoded over 5 bytes.
     501                   *1111 10xx  10xx xxxx  10xx xxxx  
     502                   *10xx xxxx  10xx xxxx
     503                   */
     504                  c = *a_in & 3;
     505                  nb_bytes_2_decode = 5;
     506  
     507          } else if ((*a_in & 0xFE) == 0xFC) {
     508                  /*
     509                   *up to 31 bits long char
     510                   *encoded over 6 bytes:
     511                   *1111 110x  10xx xxxx  10xx xxxx  
     512                   *10xx xxxx  10xx xxxx  10xx xxxx
     513                   */
     514                  c = *a_in & 1;
     515                  nb_bytes_2_decode = 6;
     516  
     517          } else {
     518                  /*BAD ENCODING */
     519                  goto end;
     520          }
     521  
     522          if (nb_bytes_2_decode > a_in_len) {
     523                  status = CR_END_OF_INPUT_ERROR;
     524                  goto end;
     525          }
     526  
     527          /*
     528           *Go and decode the remaining byte(s)
     529           *(if any) to get the current character.
     530           */
     531          for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
     532                  /*byte pattern must be: 10xx xxxx */
     533                  if ((a_in[in_index] & 0xC0) != 0x80) {
     534                          goto end;
     535                  }
     536  
     537                  c = (c << 6) | (a_in[in_index] & 0x3F);
     538          }
     539  
     540          /*
     541           *The decoded ucs4 char is now
     542           *in c.
     543           */
     544  
     545      /************************
     546       *Some security tests
     547       ***********************/
     548  
     549          /*be sure c is a char */
     550          if (c == 0xFFFF || c == 0xFFFE)
     551                  goto end;
     552  
     553          /*be sure c is inferior to the max ucs4 char value */
     554          if (c > 0x10FFFF)
     555                  goto end;
     556  
     557          /*
     558           *c must be less than UTF16 "lower surrogate begin"
     559           *or higher than UTF16 "High surrogate end"
     560           */
     561          if (c >= 0xD800 && c <= 0xDFFF)
     562                  goto end;
     563  
     564          /*Avoid characters that equals zero */
     565          if (c == 0)
     566                  goto end;
     567  
     568          *a_out = c;
     569  
     570        end:
     571          *a_consumed = nb_bytes_2_decode;
     572  
     573          return status;
     574  }
     575  
     576  /**
     577   *
     578   */
     579  enum CRStatus
     580  cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
     581                                 const guchar * a_in_end, gulong * a_len)
     582  {
     583          /*
     584           *Note: this function can be made shorter
     585           *but it considers all the cases of the utf8 encoding
     586           *to ease further extensions ...
     587           */
     588  
     589          guchar *byte_ptr = NULL;
     590          gint len = 0;
     591  
     592          /*
     593           *to store the final decoded 
     594           *unicode char
     595           */
     596          guint c = 0;
     597  
     598          g_return_val_if_fail (a_in_start && a_in_end && a_len,
     599                                CR_BAD_PARAM_ERROR);
     600          *a_len = 0;
     601  
     602          for (byte_ptr = (guchar *) a_in_start;
     603               byte_ptr <= a_in_end; byte_ptr++) {
     604                  gint nb_bytes_2_decode = 0;
     605  
     606                  if (*byte_ptr <= 0x7F) {
     607                          /*
     608                           *7 bits long char
     609                           *encoded over 1 byte:
     610                           * 0xxx xxxx
     611                           */
     612                          c = *byte_ptr;
     613                          nb_bytes_2_decode = 1;
     614  
     615                  } else if ((*byte_ptr & 0xE0) == 0xC0) {
     616                          /*
     617                           *up to 11 bits long char.
     618                           *encoded over 2 bytes:
     619                           *110x xxxx  10xx xxxx
     620                           */
     621                          c = *byte_ptr & 0x1F;
     622                          nb_bytes_2_decode = 2;
     623  
     624                  } else if ((*byte_ptr & 0xF0) == 0xE0) {
     625                          /*
     626                           *up to 16 bit long char
     627                           *encoded over 3 bytes:
     628                           *1110 xxxx  10xx xxxx  10xx xxxx
     629                           */
     630                          c = *byte_ptr & 0x0F;
     631                          nb_bytes_2_decode = 3;
     632  
     633                  } else if ((*byte_ptr & 0xF8) == 0xF0) {
     634                          /*
     635                           *up to 21 bits long char
     636                           *encoded over 4 bytes:
     637                           *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
     638                           */
     639                          c = *byte_ptr & 0x7;
     640                          nb_bytes_2_decode = 4;
     641  
     642                  } else if ((*byte_ptr & 0xFC) == 0xF8) {
     643                          /*
     644                           *up to 26 bits long char
     645                           *encoded over 5 bytes.
     646                           *1111 10xx  10xx xxxx  10xx xxxx  
     647                           *10xx xxxx  10xx xxxx
     648                           */
     649                          c = *byte_ptr & 3;
     650                          nb_bytes_2_decode = 5;
     651  
     652                  } else if ((*byte_ptr & 0xFE) == 0xFC) {
     653                          /*
     654                           *up to 31 bits long char
     655                           *encoded over 6 bytes:
     656                           *1111 110x  10xx xxxx  10xx xxxx  
     657                           *10xx xxxx  10xx xxxx  10xx xxxx
     658                           */
     659                          c = *byte_ptr & 1;
     660                          nb_bytes_2_decode = 6;
     661  
     662                  } else {
     663                          /*
     664                           *BAD ENCODING
     665                           */
     666                          return CR_ENCODING_ERROR;
     667                  }
     668  
     669                  /*
     670                   *Go and decode the remaining byte(s)
     671                   *(if any) to get the current character.
     672                   */
     673                  for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
     674                          /*decode the next byte */
     675                          byte_ptr++;
     676  
     677                          /*byte pattern must be: 10xx xxxx */
     678                          if ((*byte_ptr & 0xC0) != 0x80) {
     679                                  return CR_ENCODING_ERROR;
     680                          }
     681  
     682                          c = (c << 6) | (*byte_ptr & 0x3F);
     683                  }
     684  
     685                  /*
     686                   *The decoded ucs4 char is now
     687                   *in c.
     688                   */
     689  
     690                  if (c <= 0xFF) { /*Add other conditions to support
     691                                    *other char sets (ucs2, ucs3, ucs4).
     692                                    */
     693                          len++;
     694                  } else {
     695                          /*the char is too long to fit
     696                           *into the supposed charset len.
     697                           */
     698                          return CR_ENCODING_ERROR;
     699                  }
     700          }
     701  
     702          *a_len = len;
     703  
     704          return CR_OK;
     705  }
     706  
     707  /**
     708   *Converts an utf8 string into an ucs4 string.
     709   *@param a_in the input string to convert.
     710   *@param a_in_len in/out parameter. The length of the input
     711   *string. After return, points to the actual number of bytes
     712   *consumed. This can be usefull to debug the input stream in case
     713   *of encoding error.
     714   *@param a_out out parameter. Points to the output string. It is allocated 
     715   *by this function and must be freed by the caller.
     716   *@param a_out_len out parameter. The length of the output string.
     717   *@return CR_OK upon successfull completion, an error code otherwise.
     718   *
     719   */
     720  enum CRStatus
     721  cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
     722                             gulong * a_in_len,
     723                             guint32 ** a_out, gulong * a_out_len)
     724  {
     725          enum CRStatus status = CR_OK;
     726  
     727          g_return_val_if_fail (a_in && a_in_len
     728                                && a_out && a_out_len, CR_BAD_PARAM_ERROR);
     729  
     730          status = cr_utils_utf8_str_len_as_ucs4 (a_in,
     731                                                  &a_in[*a_in_len - 1],
     732                                                  a_out_len);
     733  
     734          g_return_val_if_fail (status == CR_OK, status);
     735  
     736          *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
     737  
     738          status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
     739  
     740          return status;
     741  }
     742  
     743  /**
     744   *Converts an ucs4 buffer into an utf8 buffer.
     745   *
     746   *@param a_in the input ucs4 buffer to convert.
     747   *@param a_in_len in/out parameter. The size of the
     748   *input buffer to convert. After return, this parameter contains
     749   *the actual number of characters consumed.
     750   *@param a_out the output converted utf8 buffer. Must be allocated by
     751   *the caller.
     752   *@param a_out_len in/out parameter. The size of the output buffer.
     753   *If this size is actually smaller than the real needed size, the function
     754   *just converts what it can and returns a success status. After return,
     755   *this param points to the actual number of bytes in the buffer.
     756   *@return CR_OK upon successfull completion, an error code otherwise.
     757   */
     758  enum CRStatus
     759  cr_utils_ucs4_to_utf8 (const guint32 * a_in,
     760                         gulong * a_in_len, guchar * a_out, gulong * a_out_len)
     761  {
     762          gulong in_len = 0,
     763                  in_index = 0,
     764                  out_index = 0;
     765          enum CRStatus status = CR_OK;
     766  
     767          g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
     768                                CR_BAD_PARAM_ERROR);
     769  
     770          if (*a_in_len < 1) {
     771                  status = CR_OK;
     772                  goto end;
     773          }
     774  
     775          in_len = *a_in_len;
     776  
     777          for (in_index = 0; in_index < in_len; in_index++) {
     778                  /*
     779                   *FIXME: return whenever we encounter forbidden char values.
     780                   */
     781  
     782                  if (a_in[in_index] <= 0x7F) {
     783                          a_out[out_index] = a_in[in_index];
     784                          out_index++;
     785                  } else if (a_in[in_index] <= 0x7FF) {
     786                          a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
     787                          a_out[out_index + 1] =
     788                                  (0x80 | (a_in[in_index] & 0x3F));
     789                          out_index += 2;
     790                  } else if (a_in[in_index] <= 0xFFFF) {
     791                          a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
     792                          a_out[out_index + 1] =
     793                                  (0x80 | ((a_in[in_index] >> 6) & 0x3F));
     794                          a_out[out_index + 2] =
     795                                  (0x80 | (a_in[in_index] & 0x3F));
     796                          out_index += 3;
     797                  } else if (a_in[in_index] <= 0x1FFFFF) {
     798                          a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
     799                          a_out[out_index + 1]
     800                                  = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
     801                          a_out[out_index + 2]
     802                                  = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
     803                          a_out[out_index + 3]
     804                                  = (0x80 | (a_in[in_index] & 0x3F));
     805                          out_index += 4;
     806                  } else if (a_in[in_index] <= 0x3FFFFFF) {
     807                          a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
     808                          a_out[out_index + 1] =
     809                                  (0x80 | (a_in[in_index] >> 18));
     810                          a_out[out_index + 2]
     811                                  = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
     812                          a_out[out_index + 3]
     813                                  = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
     814                          a_out[out_index + 4]
     815                                  = (0x80 | (a_in[in_index] & 0x3F));
     816                          out_index += 5;
     817                  } else if (a_in[in_index] <= 0x7FFFFFFF) {
     818                          a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
     819                          a_out[out_index + 1] =
     820                                  (0x80 | (a_in[in_index] >> 24));
     821                          a_out[out_index + 2]
     822                                  = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
     823                          a_out[out_index + 3]
     824                                  = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
     825                          a_out[out_index + 4]
     826                                  = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
     827                          a_out[out_index + 4]
     828                                  = (0x80 | (a_in[in_index] & 0x3F));
     829                          out_index += 6;
     830                  } else {
     831                          status = CR_ENCODING_ERROR;
     832                          goto end;
     833                  }
     834          }                       /*end for */
     835  
     836        end:
     837          *a_in_len = in_index + 1;
     838          *a_out_len = out_index + 1;
     839  
     840          return status;
     841  }
     842  
     843  /**
     844   *Converts an ucs4 string into an utf8 string.
     845   *@param a_in the input string to convert.
     846   *@param a_in_len in/out parameter. The length of the input
     847   *string. After return, points to the actual number of characters
     848   *consumed. This can be usefull to debug the input string in case
     849   *of encoding error.
     850   *@param a_out out parameter. Points to the output string. It is allocated 
     851   *by this function and must be freed by the caller.
     852   *@param a_out_len out parameter. The length (in bytes) of the output string.
     853   *@return CR_OK upon successfull completion, an error code otherwise.
     854   */
     855  enum CRStatus
     856  cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
     857                             gulong * a_in_len,
     858                             guchar ** a_out, gulong * a_out_len)
     859  {
     860          enum CRStatus status = CR_OK;
     861  
     862          g_return_val_if_fail (a_in && a_in_len && a_out
     863                                && a_out_len, CR_BAD_PARAM_ERROR);
     864  
     865          status = cr_utils_ucs4_str_len_as_utf8 (a_in,
     866                                                  &a_in[*a_out_len - 1],
     867                                                  a_out_len);
     868  
     869          g_return_val_if_fail (status == CR_OK, status);
     870  
     871          status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
     872  
     873          return status;
     874  }
     875  
     876  /**
     877   *Converts an ucs1 buffer into an utf8 buffer.
     878   *The caller must know the size of the resulting buffer and
     879   *allocate it prior to calling this function.
     880   *
     881   *@param a_in the input ucs1 buffer.
     882   *
     883   *@param a_in_len in/out parameter. The length of the input buffer.
     884   *After return, points to the number of bytes actually consumed even
     885   *in case of encoding error.
     886   *
     887   *@param a_out out parameter. The output utf8 converted buffer.
     888   *
     889   *@param a_out_len in/out parameter. The size of the output buffer.
     890   *If the output buffer size is shorter than the actual needed size, 
     891   *this function just convert what it can.
     892   *
     893   *@return CR_OK upon successfull completion, an error code otherwise.
     894   *
     895   */
     896  enum CRStatus
     897  cr_utils_ucs1_to_utf8 (const guchar * a_in,
     898                         gulong * a_in_len, guchar * a_out, gulong * a_out_len)
     899  {
     900          gulong out_index = 0,
     901                  in_index = 0,
     902                  in_len = 0,
     903                  out_len = 0;
     904          enum CRStatus status = CR_OK;
     905  
     906          g_return_val_if_fail (a_in && a_in_len
     907                                && a_out_len, 
     908                                CR_BAD_PARAM_ERROR);
     909  
     910          if (*a_in_len == 0) {
     911                  *a_out_len = 0 ;
     912                  return status;
     913          }
     914          g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
     915  
     916          in_len = *a_in_len;
     917          out_len = *a_out_len;
     918  
     919          for (in_index = 0, out_index = 0;
     920               (in_index < in_len) && (out_index < out_len); in_index++) {
     921                  /*
     922                   *FIXME: return whenever we encounter forbidden char values.
     923                   */
     924  
     925                  if (a_in[in_index] <= 0x7F) {
     926                          a_out[out_index] = a_in[in_index];
     927                          out_index++;
     928                  } else {
     929                          a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
     930                          a_out[out_index + 1] =
     931                                  (0x80 | (a_in[in_index] & 0x3F));
     932                          out_index += 2;
     933                  }
     934          }                       /*end for */
     935  
     936          *a_in_len = in_index;
     937          *a_out_len = out_index;
     938  
     939          return status;
     940  }
     941  
     942  /**
     943   *Converts an ucs1 string into an utf8 string.
     944   *@param a_in_start the beginning of the input string to convert.
     945   *@param a_in_end the end of the input string to convert.
     946   *@param a_out out parameter. The converted string.
     947   *@param a_out out parameter. The length of the converted string.
     948   *@return CR_OK upon successfull completion, an error code otherwise.
     949   *
     950   */
     951  enum CRStatus
     952  cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
     953                             gulong * a_in_len,
     954                             guchar ** a_out, gulong * a_out_len)
     955  {
     956          gulong out_len = 0;
     957          enum CRStatus status = CR_OK;
     958  
     959          g_return_val_if_fail (a_in && a_in_len && a_out
     960                                && a_out_len, CR_BAD_PARAM_ERROR);
     961  
     962          if (*a_in_len < 1) {
     963                  *a_out_len = 0;
     964                  *a_out = NULL;
     965                  return CR_OK;
     966          }
     967  
     968          status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
     969                                                  &out_len);
     970  
     971          g_return_val_if_fail (status == CR_OK, status);
     972  
     973          *a_out = g_malloc0 (out_len);
     974  
     975          status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
     976  
     977          *a_out_len = out_len;
     978  
     979          return status;
     980  }
     981  
     982  /**
     983   *Converts an utf8 buffer into an ucs1 buffer.
     984   *The caller must know the size of the resulting
     985   *converted buffer, and allocated it prior to calling this
     986   *function.
     987   *
     988   *@param a_in the input utf8 buffer to convert.
     989   *
     990   *@param a_in_len in/out parameter. The size of the input utf8 buffer.
     991   *After return, points to the number of bytes consumed
     992   *by the function even in case of encoding error.
     993   *
     994   *@param a_out out parameter. Points to the resulting buffer.
     995   *Must be allocated by the caller. If the size of a_out is shorter
     996   *than its required size, this function converts what it can and return
     997   *a successfull status.
     998   *
     999   *@param a_out_len in/out parameter. The size of the output buffer.
    1000   *After return, points to the number of bytes consumed even in case of
    1001   *encoding error.
    1002   *
    1003   *@return CR_OK upon successfull completion, an error code otherwise.
    1004   */
    1005  enum CRStatus
    1006  cr_utils_utf8_to_ucs1 (const guchar * a_in,
    1007                         gulong * a_in_len, guchar * a_out, gulong * a_out_len)
    1008  {
    1009          gulong in_index = 0,
    1010                  out_index = 0,
    1011                  in_len = 0,
    1012                  out_len = 0;
    1013          enum CRStatus status = CR_OK;
    1014  
    1015          /*
    1016           *to store the final decoded 
    1017           *unicode char
    1018           */
    1019          guint32 c = 0;
    1020  
    1021          g_return_val_if_fail (a_in && a_in_len
    1022                                && a_out && a_out_len, CR_BAD_PARAM_ERROR);
    1023  
    1024          if (*a_in_len < 1) {
    1025                  goto end;
    1026          }
    1027  
    1028          in_len = *a_in_len;
    1029          out_len = *a_out_len;
    1030  
    1031          for (in_index = 0, out_index = 0;
    1032               (in_index < in_len) && (out_index < out_len);
    1033               in_index++, out_index++) {
    1034                  gint nb_bytes_2_decode = 0;
    1035  
    1036                  if (a_in[in_index] <= 0x7F) {
    1037                          /*
    1038                           *7 bits long char
    1039                           *encoded over 1 byte:
    1040                           * 0xxx xxxx
    1041                           */
    1042                          c = a_in[in_index];
    1043                          nb_bytes_2_decode = 1;
    1044  
    1045                  } else if ((a_in[in_index] & 0xE0) == 0xC0) {
    1046                          /*
    1047                           *up to 11 bits long char.
    1048                           *encoded over 2 bytes:
    1049                           *110x xxxx  10xx xxxx
    1050                           */
    1051                          c = a_in[in_index] & 0x1F;
    1052                          nb_bytes_2_decode = 2;
    1053  
    1054                  } else if ((a_in[in_index] & 0xF0) == 0xE0) {
    1055                          /*
    1056                           *up to 16 bit long char
    1057                           *encoded over 3 bytes:
    1058                           *1110 xxxx  10xx xxxx  10xx xxxx
    1059                           */
    1060                          c = a_in[in_index] & 0x0F;
    1061                          nb_bytes_2_decode = 3;
    1062  
    1063                  } else if ((a_in[in_index] & 0xF8) == 0xF0) {
    1064                          /*
    1065                           *up to 21 bits long char
    1066                           *encoded over 4 bytes:
    1067                           *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
    1068                           */
    1069                          c = a_in[in_index] & 0x7;
    1070                          nb_bytes_2_decode = 4;
    1071  
    1072                  } else if ((a_in[in_index] & 0xFC) == 0xF8) {
    1073                          /*
    1074                           *up to 26 bits long char
    1075                           *encoded over 5 bytes.
    1076                           *1111 10xx  10xx xxxx  10xx xxxx  
    1077                           *10xx xxxx  10xx xxxx
    1078                           */
    1079                          c = a_in[in_index] & 3;
    1080                          nb_bytes_2_decode = 5;
    1081  
    1082                  } else if ((a_in[in_index] & 0xFE) == 0xFC) {
    1083                          /*
    1084                           *up to 31 bits long char
    1085                           *encoded over 6 bytes:
    1086                           *1111 110x  10xx xxxx  10xx xxxx  
    1087                           *10xx xxxx  10xx xxxx  10xx xxxx
    1088                           */
    1089                          c = a_in[in_index] & 1;
    1090                          nb_bytes_2_decode = 6;
    1091  
    1092                  } else {
    1093                          /*BAD ENCODING */
    1094                          status = CR_ENCODING_ERROR;
    1095                          goto end;
    1096                  }
    1097  
    1098                  /*
    1099                   *Go and decode the remaining byte(s)
    1100                   *(if any) to get the current character.
    1101                   */
    1102                  if (in_index + nb_bytes_2_decode - 1 >= in_len) {
    1103                          goto end;
    1104                  }
    1105  
    1106                  for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
    1107                          /*decode the next byte */
    1108                          in_index++;
    1109  
    1110                          /*byte pattern must be: 10xx xxxx */
    1111                          if ((a_in[in_index] & 0xC0) != 0x80) {
    1112                                  status = CR_ENCODING_ERROR;
    1113                                  goto end;
    1114                          }
    1115  
    1116                          c = (c << 6) | (a_in[in_index] & 0x3F);
    1117                  }
    1118  
    1119                  /*
    1120                   *The decoded ucs4 char is now
    1121                   *in c.
    1122                   */
    1123  
    1124                  if (c > 0xFF) {
    1125                          status = CR_ENCODING_ERROR;
    1126                          goto end;
    1127                  }
    1128  
    1129                  a_out[out_index] = c;
    1130          }
    1131  
    1132        end:
    1133          *a_out_len = out_index;
    1134          *a_in_len = in_index;
    1135  
    1136          return status;
    1137  }
    1138  
    1139  /**
    1140   *Converts an utf8 buffer into an
    1141   *ucs1 buffer.
    1142   *@param a_in_start the start of the input buffer.
    1143   *@param a_in_end the end of the input buffer.
    1144   *@param a_out out parameter. The resulting converted ucs4 buffer.
    1145   *Must be freed by the caller.
    1146   *@param a_out_len out parameter. The length of the converted buffer.
    1147   *@return CR_OK upon successfull completion, an error code otherwise.
    1148   *Note that out parameters are valid if and only if this function
    1149   *returns CR_OK.
    1150   */
    1151  enum CRStatus
    1152  cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
    1153                             gulong * a_in_len,
    1154                             guchar ** a_out, gulong * a_out_len)
    1155  {
    1156          enum CRStatus status = CR_OK;
    1157  
    1158          g_return_val_if_fail (a_in && a_in_len
    1159                                && a_out && a_out_len, CR_BAD_PARAM_ERROR);
    1160  
    1161          if (*a_in_len < 1) {
    1162                  *a_out_len = 0;
    1163                  *a_out = NULL;
    1164                  return CR_OK;
    1165          }
    1166  
    1167          status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
    1168                                                  a_out_len);
    1169  
    1170          g_return_val_if_fail (status == CR_OK, status);
    1171  
    1172          *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
    1173  
    1174          status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
    1175          return status;
    1176  }
    1177  
    1178  /*****************************************
    1179   *CSS basic types identification utilities
    1180   *****************************************/
    1181  
    1182  /**
    1183   *Returns TRUE if a_char is a white space as
    1184   *defined in the css spec in chap 4.1.1.
    1185   *
    1186   *white-space ::= ' '| \t|\r|\n|\f
    1187   *
    1188   *@param a_char the character to test.
    1189   *return TRUE if is a white space, false otherwise.
    1190   */
    1191  gboolean
    1192  cr_utils_is_white_space (guint32 a_char)
    1193  {
    1194          switch (a_char) {
    1195          case ' ':
    1196          case '\t':
    1197          case '\r':
    1198          case '\n':
    1199          case '\f':
    1200                  return TRUE;
    1201                  break;
    1202          default:
    1203                  return FALSE;
    1204          }
    1205  }
    1206  
    1207  /**
    1208   *Returns true if the character is a newline
    1209   *as defined in the css spec in the chap 4.1.1.
    1210   *
    1211   *nl ::= \n|\r\n|\r|\f
    1212   *
    1213   *@param a_char the character to test.
    1214   *@return TRUE if the character is a newline, FALSE otherwise.
    1215   */
    1216  gboolean
    1217  cr_utils_is_newline (guint32 a_char)
    1218  {
    1219          switch (a_char) {
    1220          case '\n':
    1221          case '\r':
    1222          case '\f':
    1223                  return TRUE;
    1224                  break;
    1225          default:
    1226                  return FALSE;
    1227          }
    1228  }
    1229  
    1230  /**
    1231   *returns TRUE if the char is part of an hexa num char:
    1232   *i.e hexa_char ::= [0-9A-F]
    1233   */
    1234  gboolean
    1235  cr_utils_is_hexa_char (guint32 a_char)
    1236  {
    1237          if ((a_char >= '0' && a_char <= '9')
    1238              || (a_char >= 'A' && a_char <= 'F')) {
    1239                  return TRUE;
    1240          }
    1241          return FALSE;
    1242  }
    1243  
    1244  /**
    1245   *Returns true if the character is a nonascii
    1246   *character (as defined in the css spec chap 4.1.1):
    1247   *
    1248   *nonascii ::= [^\0-\177]
    1249   *
    1250   *@param a_char the character to test.
    1251   *@return TRUE if the character is a nonascii char,
    1252   *FALSE otherwise.
    1253   */
    1254  gboolean
    1255  cr_utils_is_nonascii (guint32 a_char)
    1256  {
    1257          if (a_char <= 177) {
    1258                  return FALSE;
    1259          }
    1260  
    1261          return TRUE;
    1262  }
    1263  
    1264  /**
    1265   *Dumps a character a_nb times on a file.
    1266   *@param a_char the char to dump
    1267   *@param a_fp the destination file pointer
    1268   *@param a_nb the number of times a_char is to be dumped.
    1269   */
    1270  void
    1271  cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
    1272  {
    1273          glong i = 0;
    1274  
    1275          for (i = 0; i < a_nb; i++) {
    1276                  fprintf (a_fp, "%c", a_char);
    1277          }
    1278  }
    1279  
    1280  void
    1281  cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
    1282  {
    1283          glong i = 0;
    1284  
    1285          g_return_if_fail (a_string);
    1286  
    1287          for (i = 0; i < a_nb; i++) {
    1288                  g_string_append_printf (a_string, "%c", a_char);
    1289          }
    1290  }
    1291  
    1292  /**
    1293   *Duplicates a list of GString instances.
    1294   *@return the duplicated list of GString instances or NULL if
    1295   *something bad happened.
    1296   *@param a_list_of_strings the list of strings to be duplicated.
    1297   */
    1298  GList *
    1299  cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
    1300  {
    1301          GList const *cur = NULL;
    1302          GList *result = NULL;
    1303  
    1304          g_return_val_if_fail (a_list_of_strings, NULL);
    1305  
    1306          for (cur = a_list_of_strings; cur; cur = cur->next) {
    1307                  GString *str = NULL;
    1308  
    1309                  str = g_string_new_len (((GString *) cur->data)->str,
    1310                                          ((GString *) cur->data)->len);
    1311                  if (str)
    1312                          result = g_list_append (result, str);
    1313          }
    1314  
    1315          return result;
    1316  }
    1317  
    1318  /**
    1319   *Duplicate a GList where the GList::data is a CRString.
    1320   *@param a_list_of_strings the list to duplicate
    1321   *@return the duplicated list, or NULL if something bad
    1322   *happened.
    1323   */
    1324  GList *
    1325  cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
    1326  {
    1327          GList const *cur = NULL;
    1328          GList *result = NULL;
    1329  
    1330          g_return_val_if_fail (a_list_of_strings, NULL);
    1331  
    1332          for (cur = a_list_of_strings; cur; cur = cur->next) {
    1333                  CRString *str = NULL;
    1334  
    1335                  str = cr_string_dup ((CRString const *) cur->data) ;
    1336                  if (str)
    1337                          result = g_list_append (result, str);
    1338          }
    1339  
    1340          return result;
    1341  }