(root)/
tar-1.35/
gnu/
uninorm.in.h
       1  /* Normalization forms (composition and decomposition) of Unicode strings.
       2     Copyright (C) 2001-2002, 2009-2023 Free Software Foundation, Inc.
       3     Written by Bruno Haible <bruno@clisp.org>, 2009.
       4  
       5     This file is free software: you can redistribute it and/or modify
       6     it under the terms of the GNU Lesser General Public License as
       7     published by the Free Software Foundation; either version 2.1 of the
       8     License, or (at your option) any later version.
       9  
      10     This file is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU Lesser General Public License for more details.
      14  
      15     You should have received a copy of the GNU Lesser General Public License
      16     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      17  
      18  #ifndef _UNINORM_H
      19  #define _UNINORM_H
      20  
      21  /* Get size_t.  */
      22  #include <stddef.h>
      23  
      24  #include "unitypes.h"
      25  
      26  #if @HAVE_UNISTRING_WOE32DLL_H@
      27  # include <unistring/woe32dll.h>
      28  #else
      29  # define LIBUNISTRING_DLL_VARIABLE /* Fallback: defined with an empty expansion.  */
      30  #endif
      31  
      32  
      33  #ifdef __cplusplus
      34  extern "C" {
      35  #endif
      36  
      37  
      38  /* Conventions:
      39  
      40     All functions prefixed with u8_ operate on UTF-8 encoded strings.
      41     Their unit is an uint8_t (1 byte).
      42  
      43     All functions prefixed with u16_ operate on UTF-16 encoded strings.
      44     Their unit is an uint16_t (a 2-byte word).
      45  
      46     All functions prefixed with u32_ operate on UCS-4 encoded strings.
      47     Their unit is an uint32_t (a 4-byte word).
      48  
      49     All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
      50     n units.
      51  
      52     Functions returning a string result take a (resultbuf, lengthp) argument
      53     pair.  If resultbuf is not NULL and the result fits into *lengthp units,
      54     it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
      55     allocated string is returned.  In both cases, *lengthp is set to the
      56     length (number of units) of the returned string.  In case of error,
      57     NULL is returned and errno is set.  */
      58  
      59  /* Types of decomposition of a Unicode character.  Each value's comment gives its <tag> (if any) and its meaning.  */
      60  enum
      61  {
      62    UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
      63    UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
      64    UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
      65    UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
      66    UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
      67    UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
      68    UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
      69    UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
      70    UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
      71    UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
      72    UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
      73    UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
      74    UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
      75    UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
      76    UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
      77    UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
      78    UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
      79  };
      80  
      81  /* Maximum number of ucs4_t elements in the decomposition of a single Unicode character.  */
      82  #define UC_DECOMPOSITION_MAX_LENGTH 32
      83  
      84  /* Return the character decomposition mapping of the Unicode character UC.
      85     DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
      86     ucs4_t elements.
      87     When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
      88     filled and N is returned.  Otherwise -1 is returned.  */
      89  extern int
      90         uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
      91  
      92  /* Return the canonical character decomposition mapping of the Unicode character UC.
      93     DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
      94     ucs4_t elements.
      95     When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
      96     returned.  Otherwise -1 is returned.  */
      97  extern int
      98         uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
      99  
     100  
     101  /* Attempt to combine the Unicode characters UC1 and UC2.
     102     UC1 is known to have canonical combining class 0.
     103     Return the combination of UC1 and UC2, if it exists.
     104     Return 0 otherwise.
     105     Not all decompositions can be recombined using this function.  See the
     106     Unicode file CompositionExclusions.txt for details.  */
     107  extern ucs4_t
     108         uc_composition (ucs4_t uc1, ucs4_t uc2)
     109         _UC_ATTRIBUTE_CONST;
     110  
     111  
     112  /* An object of type uninorm_t denotes a Unicode normalization form; it is a pointer to a const struct that is only forward-declared here.  */
     113  struct unicode_normalization_form;
     114  typedef const struct unicode_normalization_form *uninorm_t;
     115  
     116  /* UNINORM_NFD (of type uninorm_t): Normalization form D: canonical decomposition.  */
     117  extern @GNULIB_UNINORM_NFD_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfd;
     118  #define UNINORM_NFD (&uninorm_nfd)
     119  
     120  /* UNINORM_NFC (of type uninorm_t): Normalization form C: canonical
     121     decomposition, then canonical composition.  */
     122  extern @GNULIB_UNINORM_NFC_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfc;
     123  #define UNINORM_NFC (&uninorm_nfc)
     124  
     125  /* UNINORM_NFKD (of type uninorm_t): Normalization form KD: compatibility decomposition.  */
     126  extern @GNULIB_UNINORM_NFKD_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfkd;
     127  #define UNINORM_NFKD (&uninorm_nfkd)
     128  
     129  /* UNINORM_NFKC (of type uninorm_t): Normalization form KC: compatibility
     130     decomposition, then canonical composition.  */
     131  extern @GNULIB_UNINORM_NFKC_DLL_VARIABLE@ const struct unicode_normalization_form uninorm_nfkc;
     132  #define UNINORM_NFKC (&uninorm_nfkc)
     133  
     134  /* Test whether the normalization form NF does compatibility decomposition (tests bit 0 of the unsigned int that NF points to).  */
     135  #define uninorm_is_compat_decomposing(nf) \
     136    ((* (const unsigned int *) (nf) >> 0) & 1)
     137  
     138  /* Test whether the normalization form NF includes canonical composition (tests bit 1 of the unsigned int that NF points to).  */
     139  #define uninorm_is_composing(nf) \
     140    ((* (const unsigned int *) (nf) >> 1) & 1)
     141  
     142  /* Return the decomposing variant of the normalization form NF.
     143     This maps NFC, NFD -> NFD and NFKC, NFKD -> NFKD.  */
     144  extern uninorm_t
     145         uninorm_decomposing_form (uninorm_t nf)
     146         _UC_ATTRIBUTE_PURE;
     147  
     148  
     149  /* Return the normalization form NF of the string S[0..N-1].  RESULTBUF and LENGTHP are the (resultbuf, lengthp) pair described under "Conventions".  */
     150  extern uint8_t *
     151         u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
     152                       uint8_t *_UC_RESTRICT resultbuf, size_t *lengthp);
     153  extern uint16_t *
     154         u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
     155                        uint16_t *_UC_RESTRICT resultbuf, size_t *lengthp);
     156  extern uint32_t *
     157         u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
     158                        uint32_t *_UC_RESTRICT resultbuf, size_t *lengthp);
     159  
     160  
     161  /* Compare the strings (S1, N1) and (S2, N2), ignoring differences in normalization.
     162     NF must be either UNINORM_NFD or UNINORM_NFKD.
     163     If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
     164     return 0.  Upon failure, return -1 with errno set.  */
     165  extern int
     166         u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
     167                     uninorm_t nf, int *resultp);
     168  extern int
     169         u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
     170                      uninorm_t nf, int *resultp);
     171  extern int
     172         u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
     173                      uninorm_t nf, int *resultp);
     174  
     175  
     176  /* Convert the string S of length N to a NUL-terminated byte sequence, in such
     177     a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
     178     equivalent to comparing S1 and S2 with uN_normcoll().
     179     NF must be either UNINORM_NFC or UNINORM_NFKC.  */
     180  extern char *
     181         u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
     182                      char *resultbuf, size_t *lengthp);
     183  extern char *
     184         u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
     185                       char *resultbuf, size_t *lengthp);
     186  extern char *
     187         u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
     188                       char *resultbuf, size_t *lengthp);
     189  
     190  
     191  /* Compare the strings (S1, N1) and (S2, N2), ignoring differences in
     192     normalization, using the collation rules of the current locale.
     193     NF must be either UNINORM_NFC or UNINORM_NFKC.
     194     If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
     195     return 0.  Upon failure, return -1 with errno set.  */
     196  extern int
     197         u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
     198                      uninorm_t nf, int *resultp);
     199  extern int
     200         u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
     201                       uninorm_t nf, int *resultp);
     202  extern int
     203         u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
     204                       uninorm_t nf, int *resultp);
     205  
     206  
     207  /* Normalization of a stream of Unicode characters.
     208  
     209     A "stream of Unicode characters" is essentially a function that accepts an
     210     ucs4_t argument repeatedly, optionally combined with a function that
     211     "flushes" the stream.  */
     212  
     213  /* Data type of a stream of Unicode characters that normalizes its input
     214     according to a given normalization form and passes the normalized character
     215     sequence to the encapsulated stream of Unicode characters.  The struct is only forward-declared here.  */
     216  struct uninorm_filter;
     217  
     218  /* Bring data buffered in FILTER to its destination, the encapsulated
     219     stream, then close and free the filter.  This is the function named in the _GL_ATTRIBUTE_DEALLOC annotation of uninorm_filter_create.
     220     Return 0 if successful, or -1 with errno set upon failure.  */
     221  extern int
     222         uninorm_filter_free (struct uninorm_filter *filter);
     223  
     224  /* Create and return a filter that normalizes Unicode characters according to NF.
     225     The pair (stream_func, stream_data) is the encapsulated stream.
     226     stream_func (stream_data, uc) receives the Unicode character uc
     227     and returns 0 if successful, or -1 with errno set upon failure.
     228     Return the new filter, or NULL with errno set upon failure.  The declaration names uninorm_filter_free as the deallocator.  */
     229  extern struct uninorm_filter *
     230         uninorm_filter_create (uninorm_t nf,
     231                                int (*stream_func) (void *stream_data, ucs4_t uc),
     232                                void *stream_data)
     233         _GL_ATTRIBUTE_DEALLOC (uninorm_filter_free, 1);
     234  
     235  /* Stuff the Unicode character UC into the normalizing filter FILTER.
     236     Return 0 if successful, or -1 with errno set upon failure.  */
     237  extern int
     238         uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
     239  
     240  /* Bring data buffered in FILTER to its destination, the encapsulated
     241     stream.
     242     Return 0 if successful, or -1 with errno set upon failure.
     243     Note! If additional characters are written into the filter after calling
     244     this function, the resulting character sequence in the encapsulated stream
     245     will not necessarily be normalized.  */
     246  extern int
     247         uninorm_filter_flush (struct uninorm_filter *filter);
     248  
     249  
     250  #ifdef __cplusplus
     251  }
     252  #endif
     253  
     254  
     255  #endif /* _UNINORM_H */