1  /* gunicollate.c - Collation
       2   *
       3   *  Copyright 2001,2005 Red Hat, Inc.
       4   *
       5   * SPDX-License-Identifier: LGPL-2.1-or-later
       6   *
       7   * This library is free software; you can redistribute it and/or
       8   * modify it under the terms of the GNU Lesser General Public
       9   * License as published by the Free Software Foundation; either
      10   * version 2.1 of the License, or (at your option) any later version.
      11   *
      12   * This library is distributed in the hope that it will be useful,
      13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      15   * Lesser General Public License for more details.
      16   *
      17   * You should have received a copy of the GNU Lesser General Public License
      18   * along with this library; if not, see <http://www.gnu.org/licenses/>.
      19   */
      20  
      21  #include "config.h"
      22  
      23  #include <locale.h>
      24  #include <string.h>
      25  #ifdef HAVE_WCHAR_H
      26  #include <wchar.h>
      27  #endif
      28  
      29  #ifdef HAVE_CARBON
      30  #include <CoreServices/CoreServices.h>
      31  #endif
      32  
      33  #include "gmem.h"
      34  #include "gunicode.h"
      35  #include "gunicodeprivate.h"
      36  #include "gstring.h"
      37  #include "gstrfuncs.h"
      38  #include "gtestutils.h"
      39  #include "gcharset.h"
      40  #include "gconvert.h"
      41  
      42  #if SIZEOF_WCHAR_T == 4 && defined(__STDC_ISO_10646__)
      43  #define GUNICHAR_EQUALS_WCHAR_T 1
      44  #endif
      45  
      46  #ifdef _MSC_VER
      47  /* Workaround for bug in MSVCR80.DLL */
      48  static gsize
      49  msc_strxfrm_wrapper (char       *string1,
      50  		     const char *string2,
      51  		     gsize       count)
      52  {
      53    if (!string1 || count <= 0)
      54      {
      55        char tmp;
      56  
      57        return strxfrm (&tmp, string2, 1);
      58      }
      59    return strxfrm (string1, string2, count);
      60  }
      61  #define strxfrm msc_strxfrm_wrapper
      62  #endif
      63  
      64  /**
      65   * g_utf8_collate:
      66   * @str1: a UTF-8 encoded string
      67   * @str2: a UTF-8 encoded string
      68   * 
      69   * Compares two strings for ordering using the linguistically
      70   * correct rules for the [current locale][setlocale].
      71   * When sorting a large number of strings, it will be significantly 
      72   * faster to obtain collation keys with g_utf8_collate_key() and 
      73   * compare the keys with strcmp() when sorting instead of sorting 
      74   * the original strings.
      75   * 
      76   * If the two strings are not comparable due to being in different collation
      77   * sequences, the result is undefined. This can happen if the strings are in
      78   * different language scripts, for example.
      79   *
      80   * Returns: < 0 if @str1 compares before @str2, 
      81   *   0 if they compare equal, > 0 if @str1 compares after @str2.
      82   **/
      83  gint
      84  g_utf8_collate (const gchar *str1,
      85  		const gchar *str2)
      86  {
      87    gint result;
      88  
      89  #ifdef HAVE_CARBON
      90  
      91    UniChar *str1_utf16;
      92    UniChar *str2_utf16;
      93    glong len1;
      94    glong len2;
      95    SInt32 retval = 0;
      96  
      97    g_return_val_if_fail (str1 != NULL, 0);
      98    g_return_val_if_fail (str2 != NULL, 0);
      99  
     100    str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
     101    str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
     102  
     103    UCCompareTextDefault (kUCCollateStandardOptions,
     104                          str1_utf16, len1, str2_utf16, len2,
     105                          NULL, &retval);
     106    result = retval;
     107  
     108    g_free (str2_utf16);
     109    g_free (str1_utf16);
     110  
     111  #elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
     112  
     113    gunichar *str1_norm;
     114    gunichar *str2_norm;
     115  
     116    g_return_val_if_fail (str1 != NULL, 0);
     117    g_return_val_if_fail (str2 != NULL, 0);
     118  
     119    str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
     120    str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
     121  
     122    result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
     123  
     124    g_free (str1_norm);
     125    g_free (str2_norm);
     126  
     127  #else
     128  
     129    const gchar *charset;
     130    gchar *str1_norm;
     131    gchar *str2_norm;
     132  
     133    g_return_val_if_fail (str1 != NULL, 0);
     134    g_return_val_if_fail (str2 != NULL, 0);
     135  
     136    str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
     137    str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
     138  
     139    if (g_get_charset (&charset))
     140      {
     141        result = strcoll (str1_norm, str2_norm);
     142      }
     143    else
     144      {
     145        gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
     146        gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
     147  
     148        if (str1_locale && str2_locale)
     149  	result =  strcoll (str1_locale, str2_locale);
     150        else if (str1_locale)
     151  	result = -1;
     152        else if (str2_locale)
     153  	result = 1;
     154        else
     155  	result = strcmp (str1_norm, str2_norm);
     156  
     157        g_free (str1_locale);
     158        g_free (str2_locale);
     159      }
     160  
     161    g_free (str1_norm);
     162    g_free (str2_norm);
     163  
     164  #endif
     165  
     166    return result;
     167  }
     168  
     169  #if defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
     170  /* We need UTF-8 encoding of numbers to encode the weights if
     171   * we are using wcsxfrm. However, we aren't encoding Unicode
     172   * characters, so we can't simply use g_unichar_to_utf8.
     173   *
     174   * The following routine is taken (with modification) from GNU
     175   * libc's strxfrm routine:
     176   *
     177   * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
     178   * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
     179   */
     180  static inline int
     181  utf8_encode (char *buf, wchar_t val)
     182  {
     183    int retval;
     184  
     185    if (val < 0x80)
     186      {
     187        if (buf)
     188  	*buf++ = (char) val;
     189        retval = 1;
     190      }
     191    else
     192      {
     193        int step;
     194  
     195        for (step = 2; step < 6; ++step)
     196          if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
     197            break;
     198        retval = step;
     199  
     200        if (buf)
     201  	{
     202  	  *buf = (unsigned char) (~0xff >> step);
     203  	  --step;
     204  	  do
     205  	    {
     206  	      buf[step] = 0x80 | (val & 0x3f);
     207  	      val >>= 6;
     208  	    }
     209  	  while (--step > 0);
     210  	  *buf |= val;
     211  	}
     212      }
     213  
     214    return retval;
     215  }
     216  #endif
     217  
     218  #ifdef HAVE_CARBON
     219  
     220  static gchar *
     221  collate_key_to_string (UCCollationValue *key,
     222                         gsize             key_len)
     223  {
     224    gchar *result;
     225    gsize result_len;
     226    long *lkey = (long *) key;
     227  
     228    /* UCCollationValue format:
     229     *
     230     * UCCollateOptions (32/64 bits)
     231     * SizeInBytes      (32/64 bits)
     232     * Value            (8 bits arrey)
     233     *
     234     * UCCollateOptions: ordering option mask of the collator
     235     * used to create the key. Size changes on 32-bit / 64-bit
     236     * hosts. On 64-bits also the extra half-word seems to have
     237     * some extra (unknown) meaning.
     238     * SizeInBytes: size of the whole structure, in bytes
     239     * (including UCCollateOptions and SizeInBytes fields). Size
     240     * changes on 32-bit & 64-bit hosts.
     241     * Value: array of bytes containing the comparison weights.
     242     * Seems to have several sub-strings separated by \001 and \002
     243     * chars. Also, experience shows this is directly strcmp-able.
     244     */
     245  
     246    result_len = lkey[1];
     247    result = g_malloc (result_len + 1);
     248    memcpy (result, &lkey[2], result_len);
     249    result[result_len] = '\0';
     250  
     251    return result;
     252  }
     253  
     254  static gchar *
     255  carbon_collate_key_with_collator (const gchar *str,
     256                                    gssize       len,
     257                                    CollatorRef  collator)
     258  {
     259    UniChar *str_utf16 = NULL;
     260    glong len_utf16;
     261    OSStatus ret;
     262    UCCollationValue staticbuf[512];
     263    UCCollationValue *freeme = NULL;
     264    UCCollationValue *buf;
     265    ItemCount buf_len;
     266    ItemCount key_len;
     267    ItemCount try_len;
     268    gchar *result = NULL;
     269  
     270    str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
     271    try_len = len_utf16 * 5 + 2;
     272  
     273    if (try_len <= sizeof staticbuf)
     274      {
     275        buf = staticbuf;
     276        buf_len = sizeof staticbuf;
     277      }
     278    else
     279      {
     280        freeme = g_new (UCCollationValue, try_len);
     281        buf = freeme;
     282        buf_len = try_len;
     283      }
     284  
     285    ret = UCGetCollationKey (collator, str_utf16, len_utf16,
     286                             buf_len, &key_len, buf);
     287  
     288    if (ret == kCollateBufferTooSmall)
     289      {
     290        freeme = g_renew (UCCollationValue, freeme, try_len * 2);
     291        buf = freeme;
     292        buf_len = try_len * 2;
     293        ret = UCGetCollationKey (collator, str_utf16, len_utf16,
     294                                 buf_len, &key_len, buf);
     295      }
     296  
     297    if (ret == 0)
     298      result = collate_key_to_string (buf, key_len);
     299    else
     300      result = g_strdup ("");
     301  
     302    g_free (freeme);
     303    g_free (str_utf16);
     304    return result;
     305  }
     306  
     307  static gchar *
     308  carbon_collate_key (const gchar *str,
     309                      gssize       len)
     310  {
     311    static CollatorRef collator;
     312  
     313    if (G_UNLIKELY (!collator))
     314      {
     315        UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
     316  
     317        if (!collator)
     318          {
     319            static gboolean been_here;
     320            if (!been_here)
     321              g_warning ("%s: UCCreateCollator failed", G_STRLOC);
     322            been_here = TRUE;
     323            return g_strdup ("");
     324          }
     325      }
     326  
     327    return carbon_collate_key_with_collator (str, len, collator);
     328  }
     329  
     330  static gchar *
     331  carbon_collate_key_for_filename (const gchar *str,
     332                                   gssize       len)
     333  {
     334    static CollatorRef collator;
     335  
     336    if (G_UNLIKELY (!collator))
     337      {
     338        /* http://developer.apple.com/qa/qa2004/qa1159.html */
     339        UCCreateCollator (NULL, 0,
     340                          kUCCollateComposeInsensitiveMask
     341                           | kUCCollateWidthInsensitiveMask
     342                           | kUCCollateCaseInsensitiveMask
     343                           | kUCCollateDigitsOverrideMask
     344                           | kUCCollateDigitsAsNumberMask
     345                           | kUCCollatePunctuationSignificantMask, 
     346                          &collator);
     347  
     348        if (!collator)
     349          {
     350            static gboolean been_here;
     351            if (!been_here)
     352              g_warning ("%s: UCCreateCollator failed", G_STRLOC);
     353            been_here = TRUE;
     354            return g_strdup ("");
     355          }
     356      }
     357  
     358    return carbon_collate_key_with_collator (str, len, collator);
     359  }
     360  
     361  #endif /* HAVE_CARBON */
     362  
     363  /**
     364   * g_utf8_collate_key:
     365   * @str: a UTF-8 encoded string.
     366   * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
     367   *
     368   * Converts a string into a collation key that can be compared
     369   * with other collation keys produced by the same function using 
     370   * strcmp(). 
     371   *
     372   * The results of comparing the collation keys of two strings 
     373   * with strcmp() will always be the same as comparing the two 
     374   * original keys with g_utf8_collate().
     375   * 
     376   * Note that this function depends on the [current locale][setlocale].
     377   * 
     378   * Returns: a newly allocated string. This string should
     379   *   be freed with g_free() when you are done with it.
     380   **/
     381  gchar *
     382  g_utf8_collate_key (const gchar *str,
     383  		    gssize       len)
     384  {
     385    gchar *result;
     386  
     387  #ifdef HAVE_CARBON
     388  
     389    g_return_val_if_fail (str != NULL, NULL);
     390    result = carbon_collate_key (str, len);
     391  
     392  #elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
     393  
     394    gsize xfrm_len;
     395    gunichar *str_norm;
     396    wchar_t *result_wc;
     397    gsize i;
     398    gsize result_len = 0;
     399  
     400    g_return_val_if_fail (str != NULL, NULL);
     401  
     402    str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
     403  
     404    g_return_val_if_fail (str_norm != NULL, NULL);
     405  
     406    xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
     407    result_wc = g_new (wchar_t, xfrm_len + 1);
     408    wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
     409  
     410    for (i = 0; i < xfrm_len; i++)
     411      result_len += utf8_encode (NULL, result_wc[i]);
     412  
     413    result = g_malloc (result_len + 1);
     414    result_len = 0;
     415    for (i = 0; i < xfrm_len; i++)
     416      result_len += utf8_encode (result + result_len, result_wc[i]);
     417  
     418    result[result_len] = '\0';
     419  
     420    g_free (result_wc);
     421    g_free (str_norm);
     422  
     423    return result;
     424  #else
     425  
     426    gsize xfrm_len = 0;
     427    const gchar *charset;
     428    gchar *str_norm;
     429  
     430    g_return_val_if_fail (str != NULL, NULL);
     431  
     432    str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
     433  
     434    result = NULL;
     435  
     436    if (g_get_charset (&charset))
     437      {
     438        xfrm_len = strxfrm (NULL, str_norm, 0);
     439        if (xfrm_len < G_MAXINT - 2)
     440          {
     441            result = g_malloc (xfrm_len + 1);
     442            strxfrm (result, str_norm, xfrm_len + 1);
     443          }
     444      }
     445    else
     446      {
     447        gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
     448  
     449        if (str_locale)
     450  	{
     451  	  xfrm_len = strxfrm (NULL, str_locale, 0);
     452  	  if (xfrm_len >= G_MAXINT - 2)
     453  	    {
     454  	      g_free (str_locale);
     455  	      str_locale = NULL;
     456  	    }
     457  	}
     458        if (str_locale)
     459  	{
     460  	  result = g_malloc (xfrm_len + 2);
     461  	  result[0] = 'A';
     462  	  strxfrm (result + 1, str_locale, xfrm_len + 1);
     463  	  
     464  	  g_free (str_locale);
     465  	}
     466      }
     467      
     468    if (!result) 
     469      {
     470        xfrm_len = strlen (str_norm);
     471        result = g_malloc (xfrm_len + 2);
     472        result[0] = 'B';
     473        memcpy (result + 1, str_norm, xfrm_len);
     474        result[xfrm_len+1] = '\0';
     475      }
     476  
     477    g_free (str_norm);
     478  #endif
     479  
     480    return result;
     481  }
     482  
     483  /* This is a collation key that is very very likely to sort before any
     484   * collation key that libc strxfrm generates. We use this before any
     485   * special case (dot or number) to make sure that its sorted before
     486   * anything else.
     487   */
     488  #define COLLATION_SENTINEL "\1\1\1"
     489  
     490  /**
     491   * g_utf8_collate_key_for_filename:
     492   * @str: a UTF-8 encoded string.
     493   * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
     494   *
     495   * Converts a string into a collation key that can be compared
     496   * with other collation keys produced by the same function using strcmp(). 
     497   * 
     498   * In order to sort filenames correctly, this function treats the dot '.' 
     499   * as a special case. Most dictionary orderings seem to consider it
     500   * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
     501   * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
     502   * would like to treat numbers intelligently so that "file1" "file10" "file5"
     503   * is sorted as "file1" "file5" "file10".
     504   * 
     505   * Note that this function depends on the [current locale][setlocale].
     506   *
     507   * Returns: a newly allocated string. This string should
     508   *   be freed with g_free() when you are done with it.
     509   *
     510   * Since: 2.8
     511   */
     512  gchar *
     513  g_utf8_collate_key_for_filename (const gchar *str,
     514  				 gssize       len)
     515  {
     516  #ifndef HAVE_CARBON
     517    GString *result;
     518    GString *append;
     519    const gchar *p;
     520    const gchar *prev;
     521    const gchar *end;
     522    gchar *collate_key;
     523    gint digits;
     524    gint leading_zeros;
     525  
     526    /*
     527     * How it works:
     528     *
     529     * Split the filename into collatable substrings which do
     530     * not contain [.0-9] and special-cased substrings. The collatable 
     531     * substrings are run through the normal g_utf8_collate_key() and the 
     532     * resulting keys are concatenated with keys generated from the 
     533     * special-cased substrings.
     534     *
     535     * Special cases: Dots are handled by replacing them with '\1' which 
     536     * implies that short dot-delimited substrings are before long ones, 
     537     * e.g.
     538     * 
     539     *   a\1a   (a.a)
     540     *   a-\1a  (a-.a)
     541     *   aa\1a  (aa.a)
     542     * 
     543     * Numbers are handled by prepending to each number d-1 superdigits 
     544     * where d = number of digits in the number and SUPERDIGIT is a 
     545     * character with an integer value higher than any digit (for instance 
     546     * ':'). This ensures that single-digit numbers are sorted before 
     547     * double-digit numbers which in turn are sorted separately from 
     548     * triple-digit numbers, etc. To avoid strange side-effects when 
     549     * sorting strings that already contain SUPERDIGITs, a '\2'
     550     * is also prepended, like this
     551     *
     552     *   file\21      (file1)
     553     *   file\25      (file5)
     554     *   file\2:10    (file10)
     555     *   file\2:26    (file26)
     556     *   file\2::100  (file100)
     557     *   file:foo     (file:foo)
     558     * 
     559     * This has the side-effect of sorting numbers before everything else (except
     560     * dots), but this is probably OK.
     561     *
     562     * Leading digits are ignored when doing the above. To discriminate
     563     * numbers which differ only in the number of leading digits, we append
     564     * the number of leading digits as a byte at the very end of the collation
     565     * key.
     566     *
     567     * To try avoid conflict with any collation key sequence generated by libc we
     568     * start each switch to a special cased part with a sentinel that hopefully
     569     * will sort before anything libc will generate.
     570     */
     571  
     572    if (len < 0)
     573      len = strlen (str);
     574  
     575    result = g_string_sized_new (len * 2);
     576    append = g_string_sized_new (0);
     577  
     578    end = str + len;
     579  
     580    /* No need to use utf8 functions, since we're only looking for ascii chars */
     581    for (prev = p = str; p < end; p++)
     582      {
     583        switch (*p)
     584  	{
     585  	case '.':
     586  	  if (prev != p) 
     587  	    {
     588  	      collate_key = g_utf8_collate_key (prev, p - prev);
     589  	      g_string_append (result, collate_key);
     590  	      g_free (collate_key);
     591  	    }
     592  	  
     593  	  g_string_append (result, COLLATION_SENTINEL "\1");
     594  	  
     595  	  /* skip the dot */
     596  	  prev = p + 1;
     597  	  break;
     598  	  
     599  	case '0':
     600  	case '1':
     601  	case '2':
     602  	case '3':
     603  	case '4':
     604  	case '5':
     605  	case '6':
     606  	case '7':
     607  	case '8':
     608  	case '9':
     609  	  if (prev != p) 
     610  	    {
     611  	      collate_key = g_utf8_collate_key (prev, p - prev);
     612  	      g_string_append (result, collate_key);
     613  	      g_free (collate_key);
     614  	    }
     615  	  
     616  	  g_string_append (result, COLLATION_SENTINEL "\2");
     617  	  
     618  	  prev = p;
     619  	  
     620  	  /* write d-1 colons */
     621  	  if (*p == '0')
     622  	    {
     623  	      leading_zeros = 1;
     624  	      digits = 0;
     625  	    }
     626  	  else
     627  	    {
     628  	      leading_zeros = 0;
     629  	      digits = 1;
     630  	    }
     631  	  
     632  	  while (++p < end)
     633  	    {
     634  	      if (*p == '0' && !digits)
     635  		++leading_zeros;
     636  	      else if (g_ascii_isdigit(*p))
     637  		++digits;
     638  	      else
     639                  {
     640   		  /* count an all-zero sequence as
     641                     * one digit plus leading zeros
     642                     */
     643            	  if (!digits)
     644                      {
     645                        ++digits;
     646                        --leading_zeros;
     647                      }        
     648  		  break;
     649                  }
     650  	    }
     651  
     652  	  while (digits > 1)
     653  	    {
     654  	      g_string_append_c (result, ':');
     655  	      --digits;
     656  	    }
     657  
     658  	  if (leading_zeros > 0)
     659  	    {
     660  	      g_string_append_c (append, (char)leading_zeros);
     661  	      prev += leading_zeros;
     662  	    }
     663  	  
     664  	  /* write the number itself */
     665  	  g_string_append_len (result, prev, p - prev);
     666  	  
     667  	  prev = p;
     668  	  --p;	  /* go one step back to avoid disturbing outer loop */
     669  	  break;
     670  	  
     671  	default:
     672  	  /* other characters just accumulate */
     673  	  break;
     674  	}
     675      }
     676    
     677    if (prev != p) 
     678      {
     679        collate_key = g_utf8_collate_key (prev, p - prev);
     680        g_string_append (result, collate_key);
     681        g_free (collate_key);
     682      }
     683    
     684    g_string_append (result, append->str);
     685    g_string_free (append, TRUE);
     686  
     687    return g_string_free (result, FALSE);
     688  #else /* HAVE_CARBON */
     689    return carbon_collate_key_for_filename (str, len);
     690  #endif
     691  }