1  /* decomp.c - Character decomposition.
       2   *
       3   *  Copyright (C) 1999, 2000 Tom Tromey
       4   *  Copyright 2000 Red Hat, Inc.
       5   *
       6   * SPDX-License-Identifier: LGPL-2.1-or-later
       7   *
       8   * This library is free software; you can redistribute it and/or
       9   * modify it under the terms of the GNU Lesser General Public
      10   * License as published by the Free Software Foundation; either
      11   * version 2.1 of the License, or (at your option) any later version.
      12   *
      13   * This library is distributed in the hope that it will be useful,
      14   * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      16   * Lesser General Public License for more details.
      17   *
      18   * You should have received a copy of the GNU Lesser General Public License
      19   * along with this library; if not, see <http://www.gnu.org/licenses/>.
      20   */
      21  
      22  #include "config.h"
      23  
      24  #include <stdlib.h>
      25  
      26  #include "gunicode.h"
      27  #include "gunidecomp.h"
      28  #include "gmem.h"
      29  #include "gunicomp.h"
      30  #include "gunicodeprivate.h"
      31  
      32  
/* Two-level lookup of a combining class in the first table.
 * combining_class_table_part1[Page] is either
 *  - a value >= G_UNICODE_MAX_TABLE_INDEX, in which case the whole page
 *    shares one class, obtained by subtracting G_UNICODE_MAX_TABLE_INDEX; or
 *  - an index selecting a row of cclass_data, which is then indexed by Char
 *    (the low byte of the code point, as passed by COMBINING_CLASS). */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same lookup scheme as CC_PART1, but through combining_class_table_part2. */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of the code point Char:
 *  - code points up to G_UNICODE_LAST_CHAR_PART1 use the part 1 table,
 *    paged by (Char >> 8);
 *  - code points in [0xe0000, G_UNICODE_LAST_CHAR] use the part 2 table,
 *    paged by ((Char - 0xe0000) >> 8);
 *  - everything else evaluates to 0.
 * Char is expanded several times, so the argument must be free of side
 * effects. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
      49  
      50  /**
      51   * g_unichar_combining_class:
      52   * @uc: a Unicode character
      53   * 
      54   * Determines the canonical combining class of a Unicode character.
      55   * 
      56   * Returns: the combining class of the character
      57   *
      58   * Since: 2.14
      59   **/
      60  gint
      61  g_unichar_combining_class (gunichar uc)
      62  {
      63    return COMBINING_CLASS (uc);
      64  }
      65  
/* constants for hangul syllable [de]composition */
#define SBase 0xAC00 /* first code point handled as a Hangul syllable */
#define LBase 0x1100 /* added to the leading (L) index to get the L jamo */
#define VBase 0x1161 /* added to the vowel (V) index to get the V jamo */
#define TBase 0x11A7 /* added to the trailing (T) index; index 0 means "no T" */
#define LCount 19 /* number of leading indices */
#define VCount 21 /* number of vowel indices */
#define TCount 28 /* number of trailing indices, including the "no T" slot */
#define NCount (VCount * TCount) /* syllables per leading index */
#define SCount (LCount * NCount) /* size of the syllable range starting at SBase */
      76  
      77  /**
      78   * g_unicode_canonical_ordering:
      79   * @string: (array length=len) (element-type gunichar): a UCS-4 encoded string.
      80   * @len: the maximum length of @string to use.
      81   *
      82   * Computes the canonical ordering of a string in-place.  
      83   * This rearranges decomposed characters in the string 
      84   * according to their combining classes.  See the Unicode 
      85   * manual for more information. 
      86   **/
      87  void
      88  g_unicode_canonical_ordering (gunichar *string,
      89  			      gsize     len)
      90  {
      91    gsize i;
      92    int swap = 1;
      93  
      94    while (swap)
      95      {
      96        int last;
      97        swap = 0;
      98        last = COMBINING_CLASS (string[0]);
      99        for (i = 0; i < len - 1; ++i)
     100  	{
     101  	  int next = COMBINING_CLASS (string[i + 1]);
     102  	  if (next != 0 && last > next)
     103  	    {
     104  	      gsize j;
     105  	      /* Percolate item leftward through string.  */
     106  	      for (j = i + 1; j > 0; --j)
     107  		{
     108  		  gunichar t;
     109  		  if (COMBINING_CLASS (string[j - 1]) <= next)
     110  		    break;
     111  		  t = string[j];
     112  		  string[j] = string[j - 1];
     113  		  string[j - 1] = t;
     114  		  swap = 1;
     115  		}
     116  	      /* We're re-entering the loop looking at the old
     117  		 character again.  */
     118  	      next = last;
     119  	    }
     120  	  last = next;
     121  	}
     122      }
     123  }
     124  
     125  /* http://www.unicode.org/unicode/reports/tr15/#Hangul
     126   * r should be null or have sufficient space. Calling with r == NULL will
     127   * only calculate the result_len; however, a buffer with space for three
     128   * characters will always be big enough. */
     129  static void
     130  decompose_hangul (gunichar s,
     131                    gunichar *r,
     132                    gsize *result_len)
     133  {
     134    gint SIndex = s - SBase;
     135    gint TIndex = SIndex % TCount;
     136  
     137    if (r)
     138      {
     139        r[0] = LBase + SIndex / NCount;
     140        r[1] = VBase + (SIndex % NCount) / TCount;
     141      }
     142  
     143    if (TIndex)
     144      {
     145        if (r)
     146  	r[2] = TBase + TIndex;
     147        *result_len = 3;
     148      }
     149    else
     150      *result_len = 2;
     151  }
     152  
     153  /* returns a pointer to a null-terminated UTF-8 string */
     154  static const gchar *
     155  find_decomposition (gunichar ch,
     156  		    gboolean compat)
     157  {
     158    int start = 0;
     159    int end = G_N_ELEMENTS (decomp_table);
     160    
     161    if (ch >= decomp_table[start].ch &&
     162        ch <= decomp_table[end - 1].ch)
     163      {
     164        while (TRUE)
     165  	{
     166  	  int half = (start + end) / 2;
     167  	  if (ch == decomp_table[half].ch)
     168  	    {
     169  	      int offset;
     170  
     171  	      if (compat)
     172  		{
     173  		  offset = decomp_table[half].compat_offset;
     174  		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
     175  		    offset = decomp_table[half].canon_offset;
     176  		}
     177  	      else
     178  		{
     179  		  offset = decomp_table[half].canon_offset;
     180  		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
     181  		    return NULL;
     182  		}
     183  	      
     184  	      return &(decomp_expansion_string[offset]);
     185  	    }
     186  	  else if (half == start)
     187  	    break;
     188  	  else if (ch > decomp_table[half].ch)
     189  	    start = half;
     190  	  else
     191  	    end = half;
     192  	}
     193      }
     194  
     195    return NULL;
     196  }
     197  
     198  /**
     199   * g_unicode_canonical_decomposition:
     200   * @ch: a Unicode character.
     201   * @result_len: location to store the length of the return value.
     202   *
     203   * Computes the canonical decomposition of a Unicode character.  
     204   * 
     205   * Returns: a newly allocated string of Unicode characters.
     206   *   @result_len is set to the resulting length of the string.
     207   *
     208   * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose()
     209   *   instead.
     210   **/
     211  gunichar *
     212  g_unicode_canonical_decomposition (gunichar ch,
     213  				   gsize   *result_len)
     214  {
     215    const gchar *decomp;
     216    const gchar *p;
     217    gunichar *r;
     218  
     219    /* Hangul syllable */
     220    if (ch >= SBase && ch < SBase + SCount)
     221      {
     222        decompose_hangul (ch, NULL, result_len);
     223        r = g_malloc (*result_len * sizeof (gunichar));
     224        decompose_hangul (ch, r, result_len);
     225      }
     226    else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
     227      {
     228        /* Found it.  */
     229        int i;
     230        
     231        *result_len = g_utf8_strlen (decomp, -1);
     232        r = g_malloc (*result_len * sizeof (gunichar));
     233        
     234        for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
     235          r[i] = g_utf8_get_char (p);
     236      }
     237    else
     238      {
     239        /* Not in our table.  */
     240        r = g_malloc (sizeof (gunichar));
     241        *r = ch;
     242        *result_len = 1;
     243      }
     244  
     245    return r;
     246  }
     247  
     248  /* L,V => LV and LV,T => LVT  */
     249  static gboolean
     250  combine_hangul (gunichar a,
     251                  gunichar b,
     252                  gunichar *result)
     253  {
     254    gint LIndex = a - LBase;
     255    gint SIndex = a - SBase;
     256  
     257    gint VIndex = b - VBase;
     258    gint TIndex = b - TBase;
     259  
     260    if (0 <= LIndex && LIndex < LCount
     261        && 0 <= VIndex && VIndex < VCount)
     262      {
     263        *result = SBase + (LIndex * VCount + VIndex) * TCount;
     264        return TRUE;
     265      }
     266    else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
     267             && 0 < TIndex && TIndex < TCount)
     268      {
     269        *result = a + TIndex;
     270        return TRUE;
     271      }
     272  
     273    return FALSE;
     274  }
     275  
     276  #define CI(Page, Char) \
     277    ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
     278     ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
     279     : (compose_data[compose_table[Page]][Char]))
     280  
     281  #define COMPOSE_INDEX(Char) \
     282       (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
     283  
     284  static gboolean
     285  combine (gunichar  a,
     286  	 gunichar  b,
     287  	 gunichar *result)
     288  {
     289    gushort index_a, index_b;
     290  
     291    if (combine_hangul (a, b, result))
     292      return TRUE;
     293  
     294    index_a = COMPOSE_INDEX(a);
     295  
     296    if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
     297      {
     298        if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
     299  	{
     300  	  *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
     301  	  return TRUE;
     302  	}
     303        else
     304          return FALSE;
     305      }
     306    
     307    index_b = COMPOSE_INDEX(b);
     308  
     309    if (index_b >= COMPOSE_SECOND_SINGLE_START)
     310      {
     311        if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
     312  	{
     313  	  *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
     314  	  return TRUE;
     315  	}
     316        else
     317          return FALSE;
     318      }
     319  
     320    if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
     321        index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
     322      {
     323        gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
     324  
     325        if (res)
     326  	{
     327  	  *result = res;
     328  	  return TRUE;
     329  	}
     330      }
     331  
     332    return FALSE;
     333  }
     334  
/* Normalizes the UTF-8 string @str into a newly allocated, 0-terminated
 * array of gunichar.
 *
 * @max_len is the number of bytes of @str to consider; a negative value
 * means "up to the terminating NUL".  @mode selects compatibility versus
 * canonical decomposition (NFKC/NFKD versus the rest) and whether the
 * result is re-composed afterwards (NFC/NFKC).
 *
 * The work is done in three stages: a validating pass that counts the
 * output characters, a pass that decomposes into the buffer and puts
 * combining sequences into canonical order, and an optional in-place
 * composition pass.
 *
 * Returns NULL when a multibyte character is cut short by a NUL or by
 * @max_len, or when g_utf8_get_char() reports (gunichar) -1. */
gunichar *
_g_utf8_normalize_wc (const gchar    *str,
		      gssize          max_len,
		      GNormalizeMode  mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
			mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC ||
			 mode == G_NORMALIZE_NFKC);

  /* Pass 1: validate the input and count how many characters the
   * decomposed string will hold. */
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
      const char *next, *between;
      gunichar wc;

      next = g_utf8_next_char (p);
      /* Avoid reading truncated multibyte characters
         which run past the end of the buffer */
      if (max_len < 0)
        {
          /* Does the character contain a NUL terminator? */
          for (between = &p[1]; between < next; between++)
            {
              if (G_UNLIKELY (!*between))
                return NULL;
            }
        }
      else
        {
          if (G_UNLIKELY (next > str + max_len))
            return NULL;
        }
      wc = g_utf8_get_char (p);

      if (G_UNLIKELY (wc == (gunichar) -1))
        {
          return NULL;
        }
      else if (wc >= SBase && wc < SBase + SCount)
        {
          /* Hangul syllable: length-only query (r == NULL). */
          gsize result_len;
          decompose_hangul (wc, NULL, &result_len);
          n_wc += result_len;
        }
      else 
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            n_wc += g_utf8_strlen (decomp, -1);
          else
            n_wc++;
        }

      p = next;
    }

  /* One extra slot for the terminating 0. */
  wc_buffer = g_new (gunichar, n_wc + 1);

  /* Pass 2: decompose into the buffer.  The same loop condition over the
   * same input was already walked by pass 1, so the truncation checks are
   * not repeated here.  last_start is the index where the run that still
   * has to be put into canonical order begins. */
  last_start = 0;
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
      const gchar *decomp;
      int cc;
      /* Index of the first character produced for this input character. */
      gsize old_n_wc = n_wc;
	  
      if (wc >= SBase && wc < SBase + SCount)
        {
          gsize result_len;
          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);
          
          if (decomp)
            {
              const char *pd;
              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
                wc_buffer[n_wc++] = g_utf8_get_char (pd);
            }
          else
            wc_buffer[n_wc++] = wc;
        }

      if (n_wc > 0)
	{
	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

	  /* When the first character just produced has combining class 0,
	   * order everything from last_start up to the current end and
	   * start the next run at that character. */
	  if (cc == 0)
	    {
	      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
	      last_start = old_n_wc;
	    }
	}
      
      p = g_utf8_next_char (p);
    }

  /* Order the final run, which no later class-0 character flushed. */
  if (n_wc > 0)
    {
      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
      last_start = n_wc;
      /* presumably here to keep the store above from being flagged as
       * unused — confirm */
      (void) last_start;
    }
	  
  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */ 

  /* Pass 3 (composing modes only): compose in place.  last_start is now
   * the index of the most recent character with combining class 0 (it
   * starts at 0), and last_cc is the combining class recorded for the
   * character before the one being examined. */
  if (do_compose && n_wc > 0)
    {
      gsize i, j;
      int last_cc = 0;
      last_start = 0;
      
      for (i = 0; i < n_wc; i++)
	{
	  int cc = COMBINING_CLASS (wc_buffer[i]);

	  /* A composition with the character at last_start is attempted
	   * only when last_cc is 0 or smaller than the current class. */
	  if (i > 0 &&
	      (last_cc == 0 || last_cc < cc) &&
	      combine (wc_buffer[last_start], wc_buffer[i],
		       &wc_buffer[last_start]))
	    {
	      /* The composite replaced wc_buffer[last_start]; close the
	       * gap left by the consumed character and step back so the
	       * loop increment revisits the same index. */
	      for (j = i + 1; j < n_wc; j++)
		wc_buffer[j-1] = wc_buffer[j];
	      n_wc--;
	      i--;
	      
	      /* NOTE(review): after the decrement, wc_buffer[i] is the
	       * character directly before the one examined next, yet the
	       * class is read from wc_buffer[i-1] — confirm this is
	       * intended. */
	      if (i == last_start)
		last_cc = 0;
	      else
		last_cc = COMBINING_CLASS (wc_buffer[i-1]);
	      
	      continue;
	    }

	  if (cc == 0)
	    last_start = i;

	  last_cc = cc;
	}
    }

  /* Composition may have shortened the string; terminate it again. */
  wc_buffer[n_wc] = 0;

  return wc_buffer;
}
     495  
     496  /**
     497   * g_utf8_normalize:
     498   * @str: a UTF-8 encoded string.
     499   * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
     500   * @mode: the type of normalization to perform.
     501   *
     502   * Converts a string into canonical form, standardizing
     503   * such issues as whether a character with an accent
     504   * is represented as a base character and combining
     505   * accent or as a single precomposed character. The
     506   * string has to be valid UTF-8, otherwise %NULL is
     507   * returned. You should generally call g_utf8_normalize()
     508   * before comparing two Unicode strings.
     509   *
     510   * The normalization mode %G_NORMALIZE_DEFAULT only
     511   * standardizes differences that do not affect the
     512   * text content, such as the above-mentioned accent
     513   * representation. %G_NORMALIZE_ALL also standardizes
     514   * the "compatibility" characters in Unicode, such
     515   * as SUPERSCRIPT THREE to the standard forms
     516   * (in this case DIGIT THREE). Formatting information
     517   * may be lost but for most text operations such
     518   * characters should be considered the same.
     519   *
     520   * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
     521   * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
     522   * but returned a result with composed forms rather
     523   * than a maximally decomposed form. This is often
     524   * useful if you intend to convert the string to
     525   * a legacy encoding or pass it to a system with
     526   * less capable Unicode handling.
     527   *
     528   * Returns: (nullable): a newly allocated string, that
     529   *   is the normalized form of @str, or %NULL if @str
     530   *   is not valid UTF-8.
     531   **/
     532  gchar *
     533  g_utf8_normalize (const gchar    *str,
     534  		  gssize          len,
     535  		  GNormalizeMode  mode)
     536  {
     537    gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
     538    gchar *result = NULL;
     539  
     540    if (G_LIKELY (result_wc != NULL))
     541      {
     542        result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
     543        g_free (result_wc);
     544      }
     545  
     546    return result;
     547  }
     548  
     549  static gboolean
     550  decompose_hangul_step (gunichar  ch,
     551                         gunichar *a,
     552                         gunichar *b)
     553  {
     554    gint SIndex, TIndex;
     555  
     556    if (ch < SBase || ch >= SBase + SCount)
     557      return FALSE;  /* not a hangul syllable */
     558  
     559    SIndex = ch - SBase;
     560    TIndex = SIndex % TCount;
     561  
     562    if (TIndex)
     563      {
     564        /* split LVT -> LV,T */
     565        *a = ch - TIndex;
     566        *b = TBase + TIndex;
     567      }
     568    else
     569      {
     570        /* split LV -> L,V */
     571        *a = LBase + SIndex / NCount;
     572        *b = VBase + (SIndex % NCount) / TCount;
     573      }
     574  
     575    return TRUE;
     576  }
     577  
     578  /**
     579   * g_unichar_decompose:
     580   * @ch: a Unicode character
     581   * @a: (out) (not optional): return location for the first component of @ch
     582   * @b: (out) (not optional): return location for the second component of @ch
     583   *
     584   * Performs a single decomposition step of the
     585   * Unicode canonical decomposition algorithm.
     586   *
     587   * This function does not include compatibility
     588   * decompositions. It does, however, include algorithmic
     589   * Hangul Jamo decomposition, as well as 'singleton'
     590   * decompositions which replace a character by a single
     591   * other character. In the case of singletons *@b will
     592   * be set to zero.
     593   *
     594   * If @ch is not decomposable, *@a is set to @ch and *@b
     595   * is set to zero.
     596   *
     597   * Note that the way Unicode decomposition pairs are
     598   * defined, it is guaranteed that @b would not decompose
     599   * further, but @a may itself decompose.  To get the full
     600   * canonical decomposition for @ch, one would need to
     601   * recursively call this function on @a.  Or use
     602   * g_unichar_fully_decompose().
     603   *
     604   * See
     605   * [UAX#15](http://unicode.org/reports/tr15/)
     606   * for details.
     607   *
     608   * Returns: %TRUE if the character could be decomposed
     609   *
     610   * Since: 2.30
     611   */
     612  gboolean
     613  g_unichar_decompose (gunichar  ch,
     614                       gunichar *a,
     615                       gunichar *b)
     616  {
     617    gint start = 0;
     618    gint end = G_N_ELEMENTS (decomp_step_table);
     619  
     620    if (decompose_hangul_step (ch, a, b))
     621      return TRUE;
     622  
     623    /* TODO use bsearch() */
     624    if (ch >= decomp_step_table[start].ch &&
     625        ch <= decomp_step_table[end - 1].ch)
     626      {
     627        while (TRUE)
     628          {
     629            gint half = (start + end) / 2;
     630            const decomposition_step *p = &(decomp_step_table[half]);
     631            if (ch == p->ch)
     632              {
     633                *a = p->a;
     634                *b = p->b;
     635                return TRUE;
     636              }
     637            else if (half == start)
     638              break;
     639            else if (ch > p->ch)
     640              start = half;
     641            else
     642              end = half;
     643          }
     644      }
     645  
     646    *a = ch;
     647    *b = 0;
     648  
     649    return FALSE;
     650  }
     651  
     652  /**
     653   * g_unichar_compose:
     654   * @a: a Unicode character
     655   * @b: a Unicode character
     656   * @ch: (out) (not optional): return location for the composed character
     657   *
     658   * Performs a single composition step of the
     659   * Unicode canonical composition algorithm.
     660   *
     661   * This function includes algorithmic Hangul Jamo composition,
     662   * but it is not exactly the inverse of g_unichar_decompose().
     663   * No composition can have either of @a or @b equal to zero.
     664   * To be precise, this function composes if and only if
     665   * there exists a Primary Composite P which is canonically
     666   * equivalent to the sequence <@a,@b>.  See the Unicode
     667   * Standard for the definition of Primary Composite.
     668   *
     669   * If @a and @b do not compose a new character, @ch is set to zero.
     670   *
     671   * See
     672   * [UAX#15](http://unicode.org/reports/tr15/)
     673   * for details.
     674   *
     675   * Returns: %TRUE if the characters could be composed
     676   *
     677   * Since: 2.30
     678   */
     679  gboolean
     680  g_unichar_compose (gunichar  a,
     681                     gunichar  b,
     682                     gunichar *ch)
     683  {
     684    if (combine (a, b, ch))
     685      return TRUE;
     686  
     687    *ch = 0;
     688    return FALSE;
     689  }
     690  
     691  /**
     692   * g_unichar_fully_decompose:
     693   * @ch: a Unicode character.
     694   * @compat: whether perform canonical or compatibility decomposition
     695   * @result: (optional) (out caller-allocates): location to store decomposed result, or %NULL
     696   * @result_len: length of @result
     697   *
     698   * Computes the canonical or compatibility decomposition of a
     699   * Unicode character.  For compatibility decomposition,
     700   * pass %TRUE for @compat; for canonical decomposition
     701   * pass %FALSE for @compat.
     702   *
     703   * The decomposed sequence is placed in @result.  Only up to
     704   * @result_len characters are written into @result.  The length
     705   * of the full decomposition (irrespective of @result_len) is
     706   * returned by the function.  For canonical decomposition,
     707   * currently all decompositions are of length at most 4, but
     708   * this may change in the future (very unlikely though).
     709   * At any rate, Unicode does guarantee that a buffer of length
     710   * 18 is always enough for both compatibility and canonical
     711   * decompositions, so that is the size recommended. This is provided
     712   * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
     713   *
     714   * See
     715   * [UAX#15](http://unicode.org/reports/tr15/)
     716   * for details.
     717   *
     718   * Returns: the length of the full decomposition.
     719   *
     720   * Since: 2.30
     721   **/
     722  gsize
     723  g_unichar_fully_decompose (gunichar  ch,
     724  			   gboolean  compat,
     725  			   gunichar *result,
     726  			   gsize     result_len)
     727  {
     728    const gchar *decomp;
     729    const gchar *p;
     730  
     731    /* Hangul syllable */
     732    if (ch >= SBase && ch < SBase + SCount)
     733      {
     734        gsize len, i;
     735        gunichar buffer[3];
     736        decompose_hangul (ch, result ? buffer : NULL, &len);
     737        if (result)
     738          for (i = 0; i < len && i < result_len; i++)
     739  	  result[i] = buffer[i];
     740        return len;
     741      }
     742    else if ((decomp = find_decomposition (ch, compat)) != NULL)
     743      {
     744        /* Found it.  */
     745        gsize len, i;
     746  
     747        len = g_utf8_strlen (decomp, -1);
     748  
     749        for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++)
     750          result[i] = g_utf8_get_char (p);
     751  
     752        return len;
     753      }
     754  
     755    /* Does not decompose */
     756    if (result && result_len >= 1)
     757      *result = ch;
     758    return 1;
     759  }