(root)/
glib-2.79.0/
glib/
tests/
unicode-normalize.c
       1  #undef G_DISABLE_ASSERT
       2  #undef G_LOG_DOMAIN
       3  
       4  #include <glib.h>
       5  #include <stdio.h>
       6  #include <stdlib.h>
       7  #include <string.h>
       8  
       9  static char *
      10  decode (const gchar *input)
      11  {
      12    unsigned ch;
      13    int offset = 0;
      14    GString *result = g_string_new (NULL);
      15  
      16    do
      17      {
      18        g_assert_cmpint (sscanf (input + offset, "%x", &ch), ==, 1);
      19        g_string_append_unichar (result, ch);
      20  
      21        while (input[offset] && input[offset] != ' ')
      22  	offset++;
      23        while (input[offset] && input[offset] == ' ')
      24  	offset++;
      25      }
      26    while (input[offset]);
      27  
      28    return g_string_free (result, FALSE);
      29  }
      30  
      31  const char *names[4] = {
      32    "NFD",
      33    "NFC",
      34    "NFKD",
      35    "NFKC"
      36  };
      37  
      38  static void
      39  test_form (int            line,
      40  	   GNormalizeMode mode,
      41  	   gboolean       do_compat,
      42  	   int            expected,
      43  	   char         **c,
      44  	   char         **raw)
      45  {
      46    int i;
      47    gboolean mode_is_compat = (mode == G_NORMALIZE_NFKC ||
      48  			     mode == G_NORMALIZE_NFKD);
      49  
      50    if (mode_is_compat || !do_compat)
      51      {
      52        for (i = 0; i < 3; i++)
      53  	{
      54  	  char *result = g_utf8_normalize (c[i], -1, mode);
      55            g_assert_cmpstr (result, ==, c[expected]);
      56            g_free (result);
      57  	}
      58      }
      59    if (mode_is_compat || do_compat)
      60      {
      61        for (i = 3; i < 5; i++)
      62  	{
      63  	  char *result = g_utf8_normalize (c[i], -1, mode);
      64            g_assert_cmpstr (result, ==, c[expected]);
      65            g_free (result);
      66  	}
      67      }
      68  }
      69  
      70  static void
      71  process_one (int line, gchar **columns)
      72  {
      73    char *c[5];
      74    int i;
      75  
      76    for (i = 0; i < 5; i++)
      77      {
      78        c[i] = decode (columns[i]);
      79        g_assert_nonnull (c[i]);
      80      }
      81  
      82    test_form (line, G_NORMALIZE_NFD, FALSE, 2, c, columns);
      83    test_form (line, G_NORMALIZE_NFD, TRUE, 4, c, columns);
      84    test_form (line, G_NORMALIZE_NFC, FALSE, 1, c, columns);
      85    test_form (line, G_NORMALIZE_NFC, TRUE, 3, c, columns);
      86    test_form (line, G_NORMALIZE_NFKD, TRUE, 4, c, columns);
      87    test_form (line, G_NORMALIZE_NFKC, TRUE, 3, c, columns);
      88  
      89    for (i = 0; i < 5; i++)
      90      g_free (c[i]);
      91  }
      92  
      93  static void
      94  test_unicode_normalize (void)
      95  {
      96    GIOChannel *in;
      97    GError *error = NULL;
      98    gchar *filename = NULL;
      99    GString *buffer = g_string_new (NULL);
     100    int line = 1;
     101  
     102    filename = g_test_build_filename (G_TEST_DIST, "NormalizationTest.txt", NULL);
     103    g_assert_nonnull (filename);
     104  
     105    in = g_io_channel_new_file (filename, "r", &error);
     106    g_assert_no_error (error);
     107    g_assert_nonnull (in);
     108    g_free (filename);
     109  
     110    while (TRUE)
     111      {
     112        gsize term_pos;
     113        gchar **columns;
     114  
     115        if (g_io_channel_read_line_string (in, buffer, &term_pos, &error) != G_IO_STATUS_NORMAL)
     116  	break;
     117  
     118        buffer->str[term_pos] = '\0';
     119  
     120        if (buffer->str[0] == '#') /* Comment */
     121  	goto next;
     122        if (buffer->str[0] == '@') /* Part */
     123  	{
     124  	  g_test_message ("Processing %s", buffer->str + 1);
     125  	  goto next;
     126  	}
     127  
     128        columns = g_strsplit (buffer->str, ";", -1);
     129        if (!columns[0])
     130          {
     131            g_strfreev (columns);
     132            goto next;
     133          }
     134  
     135        process_one (line, columns);
     136        g_strfreev (columns);
     137  
     138      next:
     139        g_string_truncate (buffer, 0);
     140        line++;
     141      }
     142  
     143    g_assert_no_error (error);
     144  
     145    g_io_channel_unref (in);
     146    g_string_free (buffer, TRUE);
     147  }
     148  
     149  static void
     150  test_unicode_normalize_invalid (void)
     151  {
     152    /* g_utf8_normalize() should return NULL for all of these invalid inputs */
     153    const struct
     154    {
     155      gssize max_len;
     156      const gchar *str;
     157    } test_vectors[] = {
     158      /* input ending with truncated multibyte encoding */
     159      { -1, "\xC0" },
     160      { 1, "\xC0\x80" },
     161      { -1, "\xE0\x80" },
     162      { 2, "\xE0\x80\x80" },
     163      { -1, "\xF0\x80\x80" },
     164      { 3, "\xF0\x80\x80\x80" },
     165      { -1, "\xF8\x80\x80\x80" },
     166      { 4, "\xF8\x80\x80\x80\x80" },
     167      { 3, "\x20\xE2\x84\xAA" },
     168      { -1, "\x20\xE2\x00\xAA" },
     169      { -1, "\xC0\x80\xE0\x80" },
     170      { 4, "\xC0\x80\xE0\x80\x80" },
     171      /* input containing invalid multibyte encoding */
     172      { -1, "\xED\x85\x9C\xED\x15\x9C\xED\x85\x9C" },
     173    };
     174    gsize i;
     175  
     176    for (i = 0; i < G_N_ELEMENTS (test_vectors); i++)
     177      {
     178        g_test_message ("Invalid UTF-8 vector %" G_GSIZE_FORMAT, i);
     179        g_assert_null (g_utf8_normalize (test_vectors[i].str,
     180                                         test_vectors[i].max_len,
     181                                         G_NORMALIZE_ALL));
     182      }
     183  }
     184  
     185  static void
     186  test_unicode_normalize_bad_length (void)
     187  {
     188    const char *input = "fórmula, vol. 2 (deluxe edition)";
     189    gsize len = 2;
     190    char *output;
     191  
     192    output = g_utf8_normalize (input, len, G_NORMALIZE_ALL_COMPOSE);
     193    g_assert_null (output);
     194  
     195    g_free (output);
     196  }
     197  
     198  int
     199  main (int argc, char **argv)
     200  {
     201    g_test_init (&argc, &argv, NULL);
     202  
     203    g_test_add_func ("/unicode/normalize", test_unicode_normalize);
     204    g_test_add_func ("/unicode/normalize-invalid",
     205                     test_unicode_normalize_invalid);
     206    g_test_add_func ("/unicode/normalize/bad-length", test_unicode_normalize_bad_length);
     207  
     208    return g_test_run ();
     209  }