1 /* gunicollate.c - Collation
2 *
3 * Copyright 2001,2005 Red Hat, Inc.
4 *
5 * SPDX-License-Identifier: LGPL-2.1-or-later
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "config.h"
22
23 #include <locale.h>
24 #include <string.h>
25 #ifdef HAVE_WCHAR_H
26 #include <wchar.h>
27 #endif
28
29 #ifdef HAVE_CARBON
30 #include <CoreServices/CoreServices.h>
31 #endif
32
33 #include "gmem.h"
34 #include "gunicode.h"
35 #include "gunicodeprivate.h"
36 #include "gstring.h"
37 #include "gstrfuncs.h"
38 #include "gtestutils.h"
39 #include "gcharset.h"
40 #include "gconvert.h"
41
42 #if SIZEOF_WCHAR_T == 4 && defined(__STDC_ISO_10646__)
43 #define GUNICHAR_EQUALS_WCHAR_T 1
44 #endif
45
46 #ifdef _MSC_VER
47 /* Workaround for bug in MSVCR80.DLL */
48 static gsize
49 msc_strxfrm_wrapper (char *string1,
50 const char *string2,
51 gsize count)
52 {
53 if (!string1 || count <= 0)
54 {
55 char tmp;
56
57 return strxfrm (&tmp, string2, 1);
58 }
59 return strxfrm (string1, string2, count);
60 }
61 #define strxfrm msc_strxfrm_wrapper
62 #endif
63
64 /**
65 * g_utf8_collate:
66 * @str1: a UTF-8 encoded string
67 * @str2: a UTF-8 encoded string
68 *
69 * Compares two strings for ordering using the linguistically
70 * correct rules for the [current locale][setlocale].
71 * When sorting a large number of strings, it will be significantly
72 * faster to obtain collation keys with g_utf8_collate_key() and
73 * compare the keys with strcmp() when sorting instead of sorting
74 * the original strings.
75 *
76 * If the two strings are not comparable due to being in different collation
77 * sequences, the result is undefined. This can happen if the strings are in
78 * different language scripts, for example.
79 *
80 * Returns: < 0 if @str1 compares before @str2,
81 * 0 if they compare equal, > 0 if @str1 compares after @str2.
82 **/
83 gint
84 g_utf8_collate (const gchar *str1,
85 const gchar *str2)
86 {
87 gint result;
88
89 #ifdef HAVE_CARBON
90
91 UniChar *str1_utf16;
92 UniChar *str2_utf16;
93 glong len1;
94 glong len2;
95 SInt32 retval = 0;
96
97 g_return_val_if_fail (str1 != NULL, 0);
98 g_return_val_if_fail (str2 != NULL, 0);
99
100 str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
101 str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
102
103 UCCompareTextDefault (kUCCollateStandardOptions,
104 str1_utf16, len1, str2_utf16, len2,
105 NULL, &retval);
106 result = retval;
107
108 g_free (str2_utf16);
109 g_free (str1_utf16);
110
111 #elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
112
113 gunichar *str1_norm;
114 gunichar *str2_norm;
115
116 g_return_val_if_fail (str1 != NULL, 0);
117 g_return_val_if_fail (str2 != NULL, 0);
118
119 str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
120 str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
121
122 result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
123
124 g_free (str1_norm);
125 g_free (str2_norm);
126
127 #else
128
129 const gchar *charset;
130 gchar *str1_norm;
131 gchar *str2_norm;
132
133 g_return_val_if_fail (str1 != NULL, 0);
134 g_return_val_if_fail (str2 != NULL, 0);
135
136 str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
137 str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
138
139 if (g_get_charset (&charset))
140 {
141 result = strcoll (str1_norm, str2_norm);
142 }
143 else
144 {
145 gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
146 gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
147
148 if (str1_locale && str2_locale)
149 result = strcoll (str1_locale, str2_locale);
150 else if (str1_locale)
151 result = -1;
152 else if (str2_locale)
153 result = 1;
154 else
155 result = strcmp (str1_norm, str2_norm);
156
157 g_free (str1_locale);
158 g_free (str2_locale);
159 }
160
161 g_free (str1_norm);
162 g_free (str2_norm);
163
164 #endif
165
166 return result;
167 }
168
169 #if defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
170 /* We need UTF-8 encoding of numbers to encode the weights if
171 * we are using wcsxfrm. However, we aren't encoding Unicode
172 * characters, so we can't simply use g_unichar_to_utf8.
173 *
174 * The following routine is taken (with modification) from GNU
175 * libc's strxfrm routine:
176 *
177 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
178 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
179 */
180 static inline int
181 utf8_encode (char *buf, wchar_t val)
182 {
183 int retval;
184
185 if (val < 0x80)
186 {
187 if (buf)
188 *buf++ = (char) val;
189 retval = 1;
190 }
191 else
192 {
193 int step;
194
195 for (step = 2; step < 6; ++step)
196 if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
197 break;
198 retval = step;
199
200 if (buf)
201 {
202 *buf = (unsigned char) (~0xff >> step);
203 --step;
204 do
205 {
206 buf[step] = 0x80 | (val & 0x3f);
207 val >>= 6;
208 }
209 while (--step > 0);
210 *buf |= val;
211 }
212 }
213
214 return retval;
215 }
216 #endif
217
218 #ifdef HAVE_CARBON
219
220 static gchar *
221 collate_key_to_string (UCCollationValue *key,
222 gsize key_len)
223 {
224 gchar *result;
225 gsize result_len;
226 long *lkey = (long *) key;
227
228 /* UCCollationValue format:
229 *
230 * UCCollateOptions (32/64 bits)
231 * SizeInBytes (32/64 bits)
232 * Value (8 bits arrey)
233 *
234 * UCCollateOptions: ordering option mask of the collator
235 * used to create the key. Size changes on 32-bit / 64-bit
236 * hosts. On 64-bits also the extra half-word seems to have
237 * some extra (unknown) meaning.
238 * SizeInBytes: size of the whole structure, in bytes
239 * (including UCCollateOptions and SizeInBytes fields). Size
240 * changes on 32-bit & 64-bit hosts.
241 * Value: array of bytes containing the comparison weights.
242 * Seems to have several sub-strings separated by \001 and \002
243 * chars. Also, experience shows this is directly strcmp-able.
244 */
245
246 result_len = lkey[1];
247 result = g_malloc (result_len + 1);
248 memcpy (result, &lkey[2], result_len);
249 result[result_len] = '\0';
250
251 return result;
252 }
253
254 static gchar *
255 carbon_collate_key_with_collator (const gchar *str,
256 gssize len,
257 CollatorRef collator)
258 {
259 UniChar *str_utf16 = NULL;
260 glong len_utf16;
261 OSStatus ret;
262 UCCollationValue staticbuf[512];
263 UCCollationValue *freeme = NULL;
264 UCCollationValue *buf;
265 ItemCount buf_len;
266 ItemCount key_len;
267 ItemCount try_len;
268 gchar *result = NULL;
269
270 str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
271 try_len = len_utf16 * 5 + 2;
272
273 if (try_len <= sizeof staticbuf)
274 {
275 buf = staticbuf;
276 buf_len = sizeof staticbuf;
277 }
278 else
279 {
280 freeme = g_new (UCCollationValue, try_len);
281 buf = freeme;
282 buf_len = try_len;
283 }
284
285 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
286 buf_len, &key_len, buf);
287
288 if (ret == kCollateBufferTooSmall)
289 {
290 freeme = g_renew (UCCollationValue, freeme, try_len * 2);
291 buf = freeme;
292 buf_len = try_len * 2;
293 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
294 buf_len, &key_len, buf);
295 }
296
297 if (ret == 0)
298 result = collate_key_to_string (buf, key_len);
299 else
300 result = g_strdup ("");
301
302 g_free (freeme);
303 g_free (str_utf16);
304 return result;
305 }
306
307 static gchar *
308 carbon_collate_key (const gchar *str,
309 gssize len)
310 {
311 static CollatorRef collator;
312
313 if (G_UNLIKELY (!collator))
314 {
315 UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
316
317 if (!collator)
318 {
319 static gboolean been_here;
320 if (!been_here)
321 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
322 been_here = TRUE;
323 return g_strdup ("");
324 }
325 }
326
327 return carbon_collate_key_with_collator (str, len, collator);
328 }
329
330 static gchar *
331 carbon_collate_key_for_filename (const gchar *str,
332 gssize len)
333 {
334 static CollatorRef collator;
335
336 if (G_UNLIKELY (!collator))
337 {
338 /* http://developer.apple.com/qa/qa2004/qa1159.html */
339 UCCreateCollator (NULL, 0,
340 kUCCollateComposeInsensitiveMask
341 | kUCCollateWidthInsensitiveMask
342 | kUCCollateCaseInsensitiveMask
343 | kUCCollateDigitsOverrideMask
344 | kUCCollateDigitsAsNumberMask
345 | kUCCollatePunctuationSignificantMask,
346 &collator);
347
348 if (!collator)
349 {
350 static gboolean been_here;
351 if (!been_here)
352 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
353 been_here = TRUE;
354 return g_strdup ("");
355 }
356 }
357
358 return carbon_collate_key_with_collator (str, len, collator);
359 }
360
361 #endif /* HAVE_CARBON */
362
363 /**
364 * g_utf8_collate_key:
365 * @str: a UTF-8 encoded string.
366 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
367 *
368 * Converts a string into a collation key that can be compared
369 * with other collation keys produced by the same function using
370 * strcmp().
371 *
372 * The results of comparing the collation keys of two strings
373 * with strcmp() will always be the same as comparing the two
374 * original keys with g_utf8_collate().
375 *
376 * Note that this function depends on the [current locale][setlocale].
377 *
378 * Returns: a newly allocated string. This string should
379 * be freed with g_free() when you are done with it.
380 **/
381 gchar *
382 g_utf8_collate_key (const gchar *str,
383 gssize len)
384 {
385 gchar *result;
386
387 #ifdef HAVE_CARBON
388
389 g_return_val_if_fail (str != NULL, NULL);
390 result = carbon_collate_key (str, len);
391
392 #elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
393
394 gsize xfrm_len;
395 gunichar *str_norm;
396 wchar_t *result_wc;
397 gsize i;
398 gsize result_len = 0;
399
400 g_return_val_if_fail (str != NULL, NULL);
401
402 str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
403
404 g_return_val_if_fail (str_norm != NULL, NULL);
405
406 xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
407 result_wc = g_new (wchar_t, xfrm_len + 1);
408 wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
409
410 for (i = 0; i < xfrm_len; i++)
411 result_len += utf8_encode (NULL, result_wc[i]);
412
413 result = g_malloc (result_len + 1);
414 result_len = 0;
415 for (i = 0; i < xfrm_len; i++)
416 result_len += utf8_encode (result + result_len, result_wc[i]);
417
418 result[result_len] = '\0';
419
420 g_free (result_wc);
421 g_free (str_norm);
422
423 return result;
424 #else
425
426 gsize xfrm_len = 0;
427 const gchar *charset;
428 gchar *str_norm;
429
430 g_return_val_if_fail (str != NULL, NULL);
431
432 str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
433
434 result = NULL;
435
436 if (g_get_charset (&charset))
437 {
438 xfrm_len = strxfrm (NULL, str_norm, 0);
439 if (xfrm_len < G_MAXINT - 2)
440 {
441 result = g_malloc (xfrm_len + 1);
442 strxfrm (result, str_norm, xfrm_len + 1);
443 }
444 }
445 else
446 {
447 gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
448
449 if (str_locale)
450 {
451 xfrm_len = strxfrm (NULL, str_locale, 0);
452 if (xfrm_len >= G_MAXINT - 2)
453 {
454 g_free (str_locale);
455 str_locale = NULL;
456 }
457 }
458 if (str_locale)
459 {
460 result = g_malloc (xfrm_len + 2);
461 result[0] = 'A';
462 strxfrm (result + 1, str_locale, xfrm_len + 1);
463
464 g_free (str_locale);
465 }
466 }
467
468 if (!result)
469 {
470 xfrm_len = strlen (str_norm);
471 result = g_malloc (xfrm_len + 2);
472 result[0] = 'B';
473 memcpy (result + 1, str_norm, xfrm_len);
474 result[xfrm_len+1] = '\0';
475 }
476
477 g_free (str_norm);
478 #endif
479
480 return result;
481 }
482
483 /* This is a collation key that is very very likely to sort before any
484 * collation key that libc strxfrm generates. We use this before any
485 * special case (dot or number) to make sure that its sorted before
486 * anything else.
487 */
488 #define COLLATION_SENTINEL "\1\1\1"
489
490 /**
491 * g_utf8_collate_key_for_filename:
492 * @str: a UTF-8 encoded string.
493 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
494 *
495 * Converts a string into a collation key that can be compared
496 * with other collation keys produced by the same function using strcmp().
497 *
498 * In order to sort filenames correctly, this function treats the dot '.'
499 * as a special case. Most dictionary orderings seem to consider it
500 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
501 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
502 * would like to treat numbers intelligently so that "file1" "file10" "file5"
503 * is sorted as "file1" "file5" "file10".
504 *
505 * Note that this function depends on the [current locale][setlocale].
506 *
507 * Returns: a newly allocated string. This string should
508 * be freed with g_free() when you are done with it.
509 *
510 * Since: 2.8
511 */
512 gchar *
513 g_utf8_collate_key_for_filename (const gchar *str,
514 gssize len)
515 {
516 #ifndef HAVE_CARBON
517 GString *result;
518 GString *append;
519 const gchar *p;
520 const gchar *prev;
521 const gchar *end;
522 gchar *collate_key;
523 gint digits;
524 gint leading_zeros;
525
526 /*
527 * How it works:
528 *
529 * Split the filename into collatable substrings which do
530 * not contain [.0-9] and special-cased substrings. The collatable
531 * substrings are run through the normal g_utf8_collate_key() and the
532 * resulting keys are concatenated with keys generated from the
533 * special-cased substrings.
534 *
535 * Special cases: Dots are handled by replacing them with '\1' which
536 * implies that short dot-delimited substrings are before long ones,
537 * e.g.
538 *
539 * a\1a (a.a)
540 * a-\1a (a-.a)
541 * aa\1a (aa.a)
542 *
543 * Numbers are handled by prepending to each number d-1 superdigits
544 * where d = number of digits in the number and SUPERDIGIT is a
545 * character with an integer value higher than any digit (for instance
546 * ':'). This ensures that single-digit numbers are sorted before
547 * double-digit numbers which in turn are sorted separately from
548 * triple-digit numbers, etc. To avoid strange side-effects when
549 * sorting strings that already contain SUPERDIGITs, a '\2'
550 * is also prepended, like this
551 *
552 * file\21 (file1)
553 * file\25 (file5)
554 * file\2:10 (file10)
555 * file\2:26 (file26)
556 * file\2::100 (file100)
557 * file:foo (file:foo)
558 *
559 * This has the side-effect of sorting numbers before everything else (except
560 * dots), but this is probably OK.
561 *
562 * Leading digits are ignored when doing the above. To discriminate
563 * numbers which differ only in the number of leading digits, we append
564 * the number of leading digits as a byte at the very end of the collation
565 * key.
566 *
567 * To try avoid conflict with any collation key sequence generated by libc we
568 * start each switch to a special cased part with a sentinel that hopefully
569 * will sort before anything libc will generate.
570 */
571
572 if (len < 0)
573 len = strlen (str);
574
575 result = g_string_sized_new (len * 2);
576 append = g_string_sized_new (0);
577
578 end = str + len;
579
580 /* No need to use utf8 functions, since we're only looking for ascii chars */
581 for (prev = p = str; p < end; p++)
582 {
583 switch (*p)
584 {
585 case '.':
586 if (prev != p)
587 {
588 collate_key = g_utf8_collate_key (prev, p - prev);
589 g_string_append (result, collate_key);
590 g_free (collate_key);
591 }
592
593 g_string_append (result, COLLATION_SENTINEL "\1");
594
595 /* skip the dot */
596 prev = p + 1;
597 break;
598
599 case '0':
600 case '1':
601 case '2':
602 case '3':
603 case '4':
604 case '5':
605 case '6':
606 case '7':
607 case '8':
608 case '9':
609 if (prev != p)
610 {
611 collate_key = g_utf8_collate_key (prev, p - prev);
612 g_string_append (result, collate_key);
613 g_free (collate_key);
614 }
615
616 g_string_append (result, COLLATION_SENTINEL "\2");
617
618 prev = p;
619
620 /* write d-1 colons */
621 if (*p == '0')
622 {
623 leading_zeros = 1;
624 digits = 0;
625 }
626 else
627 {
628 leading_zeros = 0;
629 digits = 1;
630 }
631
632 while (++p < end)
633 {
634 if (*p == '0' && !digits)
635 ++leading_zeros;
636 else if (g_ascii_isdigit(*p))
637 ++digits;
638 else
639 {
640 /* count an all-zero sequence as
641 * one digit plus leading zeros
642 */
643 if (!digits)
644 {
645 ++digits;
646 --leading_zeros;
647 }
648 break;
649 }
650 }
651
652 while (digits > 1)
653 {
654 g_string_append_c (result, ':');
655 --digits;
656 }
657
658 if (leading_zeros > 0)
659 {
660 g_string_append_c (append, (char)leading_zeros);
661 prev += leading_zeros;
662 }
663
664 /* write the number itself */
665 g_string_append_len (result, prev, p - prev);
666
667 prev = p;
668 --p; /* go one step back to avoid disturbing outer loop */
669 break;
670
671 default:
672 /* other characters just accumulate */
673 break;
674 }
675 }
676
677 if (prev != p)
678 {
679 collate_key = g_utf8_collate_key (prev, p - prev);
680 g_string_append (result, collate_key);
681 g_free (collate_key);
682 }
683
684 g_string_append (result, append->str);
685 g_string_free (append, TRUE);
686
687 return g_string_free (result, FALSE);
688 #else /* HAVE_CARBON */
689 return carbon_collate_key_for_filename (str, len);
690 #endif
691 }