(root)/
glib-2.79.0/
glib/
tests/
utf8-validate.c
       1  /* GLIB - Library of useful routines for C programming
       2   * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de>
       3   *
       4   * SPDX-License-Identifier: LGPL-2.1-or-later
       5   *
       6   * This library is free software; you can redistribute it and/or
       7   * modify it under the terms of the GNU Lesser General Public
       8   * License as published by the Free Software Foundation; either
       9   * version 2.1 of the License, or (at your option) any later version.
      10   *
      11   * This library is distributed in the hope that it will be useful,
      12   * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14   * Lesser General Public License for more details.
      15   *
      16   * You should have received a copy of the GNU Lesser General Public
      17   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
      18   */
      19  
      20  #include "glib.h"
      21  #include <string.h>
      22  
      23  #define UNICODE_VALID(Char)                   \
      24      ((Char) < 0x110000 &&                     \
      25       (((Char) & 0xFFFFF800) != 0xD800) &&     \
      26       ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
      27       ((Char) & 0xFFFE) != 0xFFFE)
      28  
      29  
      30  typedef struct {
      31    const gchar *text;
      32    gint max_len;
      33    gint offset;
      34    gboolean valid;
      35  } Test;
      36  
      37  static Test global_test[] = {
      38    /* some tests to check max_len handling */
      39    /* length 1 */
      40    { "abcde", -1, 5, TRUE },
      41    { "abcde", 3, 3, TRUE },
      42    { "abcde", 5, 5, TRUE },
      43    { "abcde", 7, 5, FALSE },
      44    /* length 2 */
      45    { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE },
      46    { "\xc2\xa9\xc2\xa9\xc2\xa9",  1, 0, FALSE },
      47    { "\xc2\xa9\xc2\xa9\xc2\xa9",  2, 2, TRUE },
      48    { "\xc2\xa9\xc2\xa9\xc2\xa9",  3, 2, FALSE },
      49    { "\xc2\xa9\xc2\xa9\xc2\xa9",  4, 4, TRUE },
      50    { "\xc2\xa9\xc2\xa9\xc2\xa9",  5, 4, FALSE },
      51    { "\xc2\xa9\xc2\xa9\xc2\xa9",  6, 6, TRUE },
      52    { "\xc2\xa9\xc2\xa9\xc2\xa9",  7, 6, FALSE },
      53    /* length 3 */
      54    { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE },
      55    { "\xe2\x89\xa0\xe2\x89\xa0",  1, 0, FALSE },
      56    { "\xe2\x89\xa0\xe2\x89\xa0",  2, 0, FALSE },
      57    { "\xe2\x89\xa0\xe2\x89\xa0",  3, 3, TRUE },
      58    { "\xe2\x89\xa0\xe2\x89\xa0",  4, 3, FALSE },
      59    { "\xe2\x89\xa0\xe2\x89\xa0",  5, 3, FALSE },
      60    { "\xe2\x89\xa0\xe2\x89\xa0",  6, 6, TRUE },
      61    { "\xe2\x89\xa0\xe2\x89\xa0",  7, 6, FALSE },
      62  
      63    /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
      64    /* greek 'kosme' */
      65    { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE },
      66    /* first sequence of each length */
      67    { "\x00", -1, 0, TRUE },
      68    { "\xc2\x80", -1, 2, TRUE },
      69    { "\xe0\xa0\x80", -1, 3, TRUE },
      70    { "\xf0\x90\x80\x80", -1, 4, TRUE },
      71    { "\xf8\x88\x80\x80\x80", -1, 0, FALSE },
      72    { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE },
      73    /* last sequence of each length */
      74    { "\x7f", -1, 1, TRUE },
      75    { "\xdf\xbf", -1, 2, TRUE },
      76    { "\xef\xbf\xbf", -1, 3, TRUE },
      77    { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
      78    { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
      79    { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
      80    /* other boundary conditions */
      81    { "\xed\x9f\xbf", -1, 3, TRUE },
      82    { "\xee\x80\x80", -1, 3, TRUE },
      83    { "\xef\xbf\xbd", -1, 3, TRUE },
      84    { "\xf4\x8f\xbf\xbf", -1, 4, TRUE },
      85    { "\xf4\x90\x80\x80", -1, 0, FALSE },
      86    /* malformed sequences */
      87    /* continuation bytes */
      88    { "\x80", -1, 0, FALSE },
      89    { "\xbf", -1, 0, FALSE },
      90    { "\xbf\x80", -1, 0, FALSE },
      91    { "\x80\xbf", -1, 0, FALSE },
      92    { "\x80\xbf\x80", -1, 0, FALSE },
      93    { "\x80\xbf\x80\xbf", -1, 0, FALSE },
      94    { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
      95    { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
      96    { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
      97  
      98    /* all possible continuation byte */
      99    { "\x80", -1, 0, FALSE },
     100    { "\x81", -1, 0, FALSE },
     101    { "\x82", -1, 0, FALSE },
     102    { "\x83", -1, 0, FALSE },
     103    { "\x84", -1, 0, FALSE },
     104    { "\x85", -1, 0, FALSE },
     105    { "\x86", -1, 0, FALSE },
     106    { "\x87", -1, 0, FALSE },
     107    { "\x88", -1, 0, FALSE },
     108    { "\x89", -1, 0, FALSE },
     109    { "\x8a", -1, 0, FALSE },
     110    { "\x8b", -1, 0, FALSE },
     111    { "\x8c", -1, 0, FALSE },
     112    { "\x8d", -1, 0, FALSE },
     113    { "\x8e", -1, 0, FALSE },
     114    { "\x8f", -1, 0, FALSE },
     115    { "\x90", -1, 0, FALSE },
     116    { "\x91", -1, 0, FALSE },
     117    { "\x92", -1, 0, FALSE },
     118    { "\x93", -1, 0, FALSE },
     119    { "\x94", -1, 0, FALSE },
     120    { "\x95", -1, 0, FALSE },
     121    { "\x96", -1, 0, FALSE },
     122    { "\x97", -1, 0, FALSE },
     123    { "\x98", -1, 0, FALSE },
     124    { "\x99", -1, 0, FALSE },
     125    { "\x9a", -1, 0, FALSE },
     126    { "\x9b", -1, 0, FALSE },
     127    { "\x9c", -1, 0, FALSE },
     128    { "\x9d", -1, 0, FALSE },
     129    { "\x9e", -1, 0, FALSE },
     130    { "\x9f", -1, 0, FALSE },
     131    { "\xa0", -1, 0, FALSE },
     132    { "\xa1", -1, 0, FALSE },
     133    { "\xa2", -1, 0, FALSE },
     134    { "\xa3", -1, 0, FALSE },
     135    { "\xa4", -1, 0, FALSE },
     136    { "\xa5", -1, 0, FALSE },
     137    { "\xa6", -1, 0, FALSE },
     138    { "\xa7", -1, 0, FALSE },
     139    { "\xa8", -1, 0, FALSE },
     140    { "\xa9", -1, 0, FALSE },
     141    { "\xaa", -1, 0, FALSE },
     142    { "\xab", -1, 0, FALSE },
     143    { "\xac", -1, 0, FALSE },
     144    { "\xad", -1, 0, FALSE },
     145    { "\xae", -1, 0, FALSE },
     146    { "\xaf", -1, 0, FALSE },
     147    { "\xb0", -1, 0, FALSE },
     148    { "\xb1", -1, 0, FALSE },
     149    { "\xb2", -1, 0, FALSE },
     150    { "\xb3", -1, 0, FALSE },
     151    { "\xb4", -1, 0, FALSE },
     152    { "\xb5", -1, 0, FALSE },
     153    { "\xb6", -1, 0, FALSE },
     154    { "\xb7", -1, 0, FALSE },
     155    { "\xb8", -1, 0, FALSE },
     156    { "\xb9", -1, 0, FALSE },
     157    { "\xba", -1, 0, FALSE },
     158    { "\xbb", -1, 0, FALSE },
     159    { "\xbc", -1, 0, FALSE },
     160    { "\xbd", -1, 0, FALSE },
     161    { "\xbe", -1, 0, FALSE },
     162    { "\xbf", -1, 0, FALSE },
     163    /* lone start characters */
     164    { "\xc0\x20", -1, 0, FALSE },
     165    { "\xc1\x20", -1, 0, FALSE },
     166    { "\xc2\x20", -1, 0, FALSE },
     167    { "\xc3\x20", -1, 0, FALSE },
     168    { "\xc4\x20", -1, 0, FALSE },
     169    { "\xc5\x20", -1, 0, FALSE },
     170    { "\xc6\x20", -1, 0, FALSE },
     171    { "\xc7\x20", -1, 0, FALSE },
     172    { "\xc8\x20", -1, 0, FALSE },
     173    { "\xc9\x20", -1, 0, FALSE },
     174    { "\xca\x20", -1, 0, FALSE },
     175    { "\xcb\x20", -1, 0, FALSE },
     176    { "\xcc\x20", -1, 0, FALSE },
     177    { "\xcd\x20", -1, 0, FALSE },
     178    { "\xce\x20", -1, 0, FALSE },
     179    { "\xcf\x20", -1, 0, FALSE },
     180    { "\xd0\x20", -1, 0, FALSE },
     181    { "\xd1\x20", -1, 0, FALSE },
     182    { "\xd2\x20", -1, 0, FALSE },
     183    { "\xd3\x20", -1, 0, FALSE },
     184    { "\xd4\x20", -1, 0, FALSE },
     185    { "\xd5\x20", -1, 0, FALSE },
     186    { "\xd6\x20", -1, 0, FALSE },
     187    { "\xd7\x20", -1, 0, FALSE },
     188    { "\xd8\x20", -1, 0, FALSE },
     189    { "\xd9\x20", -1, 0, FALSE },
     190    { "\xda\x20", -1, 0, FALSE },
     191    { "\xdb\x20", -1, 0, FALSE },
     192    { "\xdc\x20", -1, 0, FALSE },
     193    { "\xdd\x20", -1, 0, FALSE },
     194    { "\xde\x20", -1, 0, FALSE },
     195    { "\xdf\x20", -1, 0, FALSE },
     196    { "\xe0\x20", -1, 0, FALSE },
     197    { "\xe1\x20", -1, 0, FALSE },
     198    { "\xe2\x20", -1, 0, FALSE },
     199    { "\xe3\x20", -1, 0, FALSE },
     200    { "\xe4\x20", -1, 0, FALSE },
     201    { "\xe5\x20", -1, 0, FALSE },
     202    { "\xe6\x20", -1, 0, FALSE },
     203    { "\xe7\x20", -1, 0, FALSE },
     204    { "\xe8\x20", -1, 0, FALSE },
     205    { "\xe9\x20", -1, 0, FALSE },
     206    { "\xea\x20", -1, 0, FALSE },
     207    { "\xeb\x20", -1, 0, FALSE },
     208    { "\xec\x20", -1, 0, FALSE },
     209    { "\xed\x20", -1, 0, FALSE },
     210    { "\xee\x20", -1, 0, FALSE },
     211    { "\xef\x20", -1, 0, FALSE },
     212    { "\xf0\x20", -1, 0, FALSE },
     213    { "\xf1\x20", -1, 0, FALSE },
     214    { "\xf2\x20", -1, 0, FALSE },
     215    { "\xf3\x20", -1, 0, FALSE },
     216    { "\xf4\x20", -1, 0, FALSE },
     217    { "\xf5\x20", -1, 0, FALSE },
     218    { "\xf6\x20", -1, 0, FALSE },
     219    { "\xf7\x20", -1, 0, FALSE },
     220    { "\xf8\x20", -1, 0, FALSE },
     221    { "\xf9\x20", -1, 0, FALSE },
     222    { "\xfa\x20", -1, 0, FALSE },
     223    { "\xfb\x20", -1, 0, FALSE },
     224    { "\xfc\x20", -1, 0, FALSE },
     225    { "\xfd\x20", -1, 0, FALSE },
     226    /* missing continuation bytes */
     227    { "\x20\xc0", -1, 1, FALSE },
     228    { "\x20\xe0\x80", -1, 1, FALSE },
     229    { "\x20\xf0\x80\x80", -1, 1, FALSE },
     230    { "\x20\xf8\x80\x80\x80", -1, 1, FALSE },
     231    { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE },
     232    { "\x20\xdf", -1, 1, FALSE },
     233    { "\x20\xef\xbf", -1, 1, FALSE },
     234    { "\x20\xf7\xbf\xbf", -1, 1, FALSE },
     235    { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE },
     236    { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE },
     237    /* impossible bytes */
     238    { "\x20\xfe\x20", -1, 1, FALSE },
     239    { "\x20\xff\x20", -1, 1, FALSE },
     240    /* overlong sequences */
     241    { "\x20\xc0\xaf\x20", -1, 1, FALSE },
     242    { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
     243    { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE },
     244    { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE },
     245    { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE },
     246    { "\x20\xc1\xbf\x20", -1, 1, FALSE },
     247    { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE },
     248    { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE },
     249    { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE },
     250    { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE },
     251    { "\x20\xc0\x80\x20", -1, 1, FALSE },
     252    { "\x20\xe0\x80\x80\x20", -1, 1, FALSE },
     253    { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
     254    { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
     255    { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
     256    /* illegal code positions */
     257    { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
     258    { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
     259    { "\x20\xed\xae\x80\x20", -1, 1, FALSE },
     260    { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE },
     261    { "\x20\xed\xb0\x80\x20", -1, 1, FALSE },
     262    { "\x20\xed\xbe\x80\x20", -1, 1, FALSE },
     263    { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE },
     264    { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE },
     265    { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
     266    { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
     267    { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
     268    { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE },
     269    { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
     270    { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
     271    { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
     272  
     273    { NULL, 0, 0, 0 }
     274  };
     275  
     276  static void
     277  do_test (gconstpointer d)
     278  {
     279    const Test *test = d;
     280    const gchar *end;
     281    gboolean result;
     282  
     283    result = g_utf8_validate (test->text, test->max_len, &end);
     284  
     285    g_assert_true (result == test->valid);
     286    g_assert_cmpint (end - test->text, ==, test->offset);
     287  
     288    if (test->max_len < 0)
     289      {
     290        result = g_utf8_validate (test->text, strlen (test->text), &end);
     291  
     292        g_assert_true (result == test->valid);
     293        g_assert_cmpint (end - test->text, ==, test->offset);
     294      }
     295    else
     296      {
     297        result = g_utf8_validate_len (test->text, test->max_len, &end);
     298  
     299        g_assert_true (result == test->valid);
     300        g_assert_cmpint (end - test->text, ==, test->offset);
     301      }
     302  }
     303  
     304  /* Test the behaviour of g_utf8_get_char_validated() with various inputs and
     305   * length restrictions. */
     306  static void
     307  test_utf8_get_char_validated (void)
     308  {
     309    const struct {
     310      const gchar *buf;
     311      gssize max_len;
     312      gunichar expected_result;
     313    } test_vectors[] = {
     314      /* Bug #780095: */
     315      { "\xC0\x00_45678", 8, (gunichar) -2 },
     316      { "\xC0\x00_45678", -1, (gunichar) -2 },
     317      /* It seems odd that the return value differs with the length input, but
     318       * that’s how it’s documented: */
     319      { "", 0, (gunichar) -2 },
     320      { "", -1, (gunichar) 0 },
     321      { "\0", 1, (gunichar) -2 },
     322      { "AB\0", 3, 'A' },
     323      { "A\0B", 3, 'A' },
     324      { "\0AB", 3, (gunichar) -2 },
     325      { "\xD8\0", 2, (gunichar) -2 },
     326      /* Normal inputs: */
     327      { "hello", 5, (gunichar) 'h' },
     328      { "hello", -1, (gunichar) 'h' },
     329      { "\xD8\x9F", 2, 0x061F },
     330      { "\xD8\x9F", -1, 0x061F },
     331      { "\xD8\x9Fmore", 6, 0x061F },
     332      { "\xD8\x9Fmore", -1, 0x061F },
     333      { "\xD8\x9F\0", 3, 0x061F },
     334      { "\xE2\x96\xB3", 3, 0x25B3 },
     335      { "\xE2\x96\xB3", -1, 0x25B3 },
     336      { "\xE2\x96\xB3more", 7, 0x25B3 },
     337      { "\xE2\x96\xB3more", -1, 0x25B3 },
     338      { "\xF0\x9F\x92\xA9", 4, 0x1F4A9 },
     339      { "\xF0\x9F\x92\xA9", -1, 0x1F4A9 },
     340      { "\xF0\x9F\x92\xA9more", 8, 0x1F4A9 },
     341      { "\xF0\x9F\x92\xA9more", -1, 0x1F4A9 },
     342      /* Partial unichars: */
     343      { "\xD8", -1, (gunichar) -2 },
     344      { "\xD8\x9F", 1, (gunichar) -2 },
     345      { "\xCE", -1, (gunichar) -2 },
     346      { "\xCE", 1, (gunichar) -2 },
     347    };
     348    gsize i;
     349  
     350    for (i = 0; i < G_N_ELEMENTS (test_vectors); i++)
     351      {
     352        gunichar actual_result;
     353  
     354        g_test_message ("Vector %" G_GSIZE_FORMAT, i);
     355        actual_result = g_utf8_get_char_validated (test_vectors[i].buf,
     356                                                   test_vectors[i].max_len);
     357        g_assert_cmpint (actual_result, ==, test_vectors[i].expected_result);
     358      }
     359  }
     360  
     361  int
     362  main (int argc, char *argv[])
     363  {
     364    gint i;
     365    gchar *path;
     366  
     367    g_test_init (&argc, &argv, NULL);
     368  
     369    for (i = 0; global_test[i].text; i++)
     370      {
     371        path = g_strdup_printf ("/utf8/validate/%d", i);
     372        g_test_add_data_func (path, &global_test[i], do_test);
     373        g_free (path);
     374      }
     375  
     376    g_test_add_func ("/utf8/get-char-validated", test_utf8_get_char_validated);
     377  
     378    return g_test_run ();
     379  }