1 #undef G_DISABLE_ASSERT
2 #undef G_LOG_DOMAIN
3
4 #include <glib.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8
9 static char *
10 decode (const gchar *input)
11 {
12 unsigned ch;
13 int offset = 0;
14 GString *result = g_string_new (NULL);
15
16 do
17 {
18 g_assert_cmpint (sscanf (input + offset, "%x", &ch), ==, 1);
19 g_string_append_unichar (result, ch);
20
21 while (input[offset] && input[offset] != ' ')
22 offset++;
23 while (input[offset] && input[offset] == ' ')
24 offset++;
25 }
26 while (input[offset]);
27
28 return g_string_free (result, FALSE);
29 }
30
/* Printable names of the four Unicode normalization forms. */
const char *names[4] = { "NFD", "NFC", "NFKD", "NFKC" };
37
38 static void
39 test_form (int line,
40 GNormalizeMode mode,
41 gboolean do_compat,
42 int expected,
43 char **c,
44 char **raw)
45 {
46 int i;
47 gboolean mode_is_compat = (mode == G_NORMALIZE_NFKC ||
48 mode == G_NORMALIZE_NFKD);
49
50 if (mode_is_compat || !do_compat)
51 {
52 for (i = 0; i < 3; i++)
53 {
54 char *result = g_utf8_normalize (c[i], -1, mode);
55 g_assert_cmpstr (result, ==, c[expected]);
56 g_free (result);
57 }
58 }
59 if (mode_is_compat || do_compat)
60 {
61 for (i = 3; i < 5; i++)
62 {
63 char *result = g_utf8_normalize (c[i], -1, mode);
64 g_assert_cmpstr (result, ==, c[expected]);
65 g_free (result);
66 }
67 }
68 }
69
70 static void
71 process_one (int line, gchar **columns)
72 {
73 char *c[5];
74 int i;
75
76 for (i = 0; i < 5; i++)
77 {
78 c[i] = decode (columns[i]);
79 g_assert_nonnull (c[i]);
80 }
81
82 test_form (line, G_NORMALIZE_NFD, FALSE, 2, c, columns);
83 test_form (line, G_NORMALIZE_NFD, TRUE, 4, c, columns);
84 test_form (line, G_NORMALIZE_NFC, FALSE, 1, c, columns);
85 test_form (line, G_NORMALIZE_NFC, TRUE, 3, c, columns);
86 test_form (line, G_NORMALIZE_NFKD, TRUE, 4, c, columns);
87 test_form (line, G_NORMALIZE_NFKC, TRUE, 3, c, columns);
88
89 for (i = 0; i < 5; i++)
90 g_free (c[i]);
91 }
92
93 static void
94 test_unicode_normalize (void)
95 {
96 GIOChannel *in;
97 GError *error = NULL;
98 gchar *filename = NULL;
99 GString *buffer = g_string_new (NULL);
100 int line = 1;
101
102 filename = g_test_build_filename (G_TEST_DIST, "NormalizationTest.txt", NULL);
103 g_assert_nonnull (filename);
104
105 in = g_io_channel_new_file (filename, "r", &error);
106 g_assert_no_error (error);
107 g_assert_nonnull (in);
108 g_free (filename);
109
110 while (TRUE)
111 {
112 gsize term_pos;
113 gchar **columns;
114
115 if (g_io_channel_read_line_string (in, buffer, &term_pos, &error) != G_IO_STATUS_NORMAL)
116 break;
117
118 buffer->str[term_pos] = '\0';
119
120 if (buffer->str[0] == '#') /* Comment */
121 goto next;
122 if (buffer->str[0] == '@') /* Part */
123 {
124 g_test_message ("Processing %s", buffer->str + 1);
125 goto next;
126 }
127
128 columns = g_strsplit (buffer->str, ";", -1);
129 if (!columns[0])
130 {
131 g_strfreev (columns);
132 goto next;
133 }
134
135 process_one (line, columns);
136 g_strfreev (columns);
137
138 next:
139 g_string_truncate (buffer, 0);
140 line++;
141 }
142
143 g_assert_no_error (error);
144
145 g_io_channel_unref (in);
146 g_string_free (buffer, TRUE);
147 }
148
149 static void
150 test_unicode_normalize_invalid (void)
151 {
152 /* g_utf8_normalize() should return NULL for all of these invalid inputs */
153 const struct
154 {
155 gssize max_len;
156 const gchar *str;
157 } test_vectors[] = {
158 /* input ending with truncated multibyte encoding */
159 { -1, "\xC0" },
160 { 1, "\xC0\x80" },
161 { -1, "\xE0\x80" },
162 { 2, "\xE0\x80\x80" },
163 { -1, "\xF0\x80\x80" },
164 { 3, "\xF0\x80\x80\x80" },
165 { -1, "\xF8\x80\x80\x80" },
166 { 4, "\xF8\x80\x80\x80\x80" },
167 { 3, "\x20\xE2\x84\xAA" },
168 { -1, "\x20\xE2\x00\xAA" },
169 { -1, "\xC0\x80\xE0\x80" },
170 { 4, "\xC0\x80\xE0\x80\x80" },
171 /* input containing invalid multibyte encoding */
172 { -1, "\xED\x85\x9C\xED\x15\x9C\xED\x85\x9C" },
173 };
174 gsize i;
175
176 for (i = 0; i < G_N_ELEMENTS (test_vectors); i++)
177 {
178 g_test_message ("Invalid UTF-8 vector %" G_GSIZE_FORMAT, i);
179 g_assert_null (g_utf8_normalize (test_vectors[i].str,
180 test_vectors[i].max_len,
181 G_NORMALIZE_ALL));
182 }
183 }
184
185 static void
186 test_unicode_normalize_bad_length (void)
187 {
188 const char *input = "fórmula, vol. 2 (deluxe edition)";
189 gsize len = 2;
190 char *output;
191
192 output = g_utf8_normalize (input, len, G_NORMALIZE_ALL_COMPOSE);
193 g_assert_null (output);
194
195 g_free (output);
196 }
197
198 int
199 main (int argc, char **argv)
200 {
201 g_test_init (&argc, &argv, NULL);
202
203 g_test_add_func ("/unicode/normalize", test_unicode_normalize);
204 g_test_add_func ("/unicode/normalize-invalid",
205 test_unicode_normalize_invalid);
206 g_test_add_func ("/unicode/normalize/bad-length", test_unicode_normalize_bad_length);
207
208 return g_test_run ();
209 }