1  /*
       2     Unicode character type helpers.
       3  
       4     Written by Marc-Andre Lemburg (mal@lemburg.com).
       5     Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
       6  
       7     Copyright (c) Corporation for National Research Initiatives.
       8  
       9  */
      10  
      11  #include "Python.h"
      12  
      13  #define ALPHA_MASK 0x01
      14  #define DECIMAL_MASK 0x02
      15  #define DIGIT_MASK 0x04
      16  #define LOWER_MASK 0x08
      17  #define TITLE_MASK 0x40
      18  #define UPPER_MASK 0x80
      19  #define XID_START_MASK 0x100
      20  #define XID_CONTINUE_MASK 0x200
      21  #define PRINTABLE_MASK 0x400
      22  #define NUMERIC_MASK 0x800
      23  #define CASE_IGNORABLE_MASK 0x1000
      24  #define CASED_MASK 0x2000
      25  #define EXTENDED_CASE_MASK 0x4000
      26  
      27  typedef struct {
      28      /*
      29         These are either deltas to the character or offsets in
      30         _PyUnicode_ExtendedCase.
      31      */
      32      const int upper;
      33      const int lower;
      34      const int title;
      35      /* Note if more flag space is needed, decimal and digit could be unified. */
      36      const unsigned char decimal;
      37      const unsigned char digit;
      38      const unsigned short flags;
      39  } _PyUnicode_TypeRecord;
      40  
      41  #include "unicodetype_db.h"
      42  
      43  static const _PyUnicode_TypeRecord *
      44  gettyperecord(Py_UCS4 code)
      45  {
      46      int index;
      47  
      48      if (code >= 0x110000)
      49          index = 0;
      50      else
      51      {
      52          index = index1[(code>>SHIFT)];
      53          index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
      54      }
      55  
      56      return &_PyUnicode_TypeRecords[index];
      57  }
      58  
      59  /* Returns the titlecase Unicode characters corresponding to ch or just
      60     ch if no titlecase mapping is known. */
      61  
      62  Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
      63  {
      64      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      65  
      66      if (ctype->flags & EXTENDED_CASE_MASK)
      67          return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
      68      return ch + ctype->title;
      69  }
      70  
      71  /* Returns 1 for Unicode characters having the category 'Lt', 0
      72     otherwise. */
      73  
      74  int _PyUnicode_IsTitlecase(Py_UCS4 ch)
      75  {
      76      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      77  
      78      return (ctype->flags & TITLE_MASK) != 0;
      79  }
      80  
      81  /* Returns 1 for Unicode characters having the XID_Start property, 0
      82     otherwise. */
      83  
      84  int _PyUnicode_IsXidStart(Py_UCS4 ch)
      85  {
      86      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      87  
      88      return (ctype->flags & XID_START_MASK) != 0;
      89  }
      90  
      91  /* Returns 1 for Unicode characters having the XID_Continue property,
      92     0 otherwise. */
      93  
      94  int _PyUnicode_IsXidContinue(Py_UCS4 ch)
      95  {
      96      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
      97  
      98      return (ctype->flags & XID_CONTINUE_MASK) != 0;
      99  }
     100  
     101  /* Returns the integer decimal (0-9) for Unicode characters having
     102     this property, -1 otherwise. */
     103  
     104  int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
     105  {
     106      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     107  
     108      return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
     109  }
     110  
     111  int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
     112  {
     113      if (_PyUnicode_ToDecimalDigit(ch) < 0)
     114          return 0;
     115      return 1;
     116  }
     117  
     118  /* Returns the integer digit (0-9) for Unicode characters having
     119     this property, -1 otherwise. */
     120  
     121  int _PyUnicode_ToDigit(Py_UCS4 ch)
     122  {
     123      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     124  
     125      return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
     126  }
     127  
     128  int _PyUnicode_IsDigit(Py_UCS4 ch)
     129  {
     130      if (_PyUnicode_ToDigit(ch) < 0)
     131          return 0;
     132      return 1;
     133  }
     134  
     135  /* Returns the numeric value as double for Unicode characters having
     136     this property, -1.0 otherwise. */
     137  
     138  int _PyUnicode_IsNumeric(Py_UCS4 ch)
     139  {
     140      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     141  
     142      return (ctype->flags & NUMERIC_MASK) != 0;
     143  }
     144  
     145  /* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
     146     0 otherwise.
     147     All characters except those characters defined in the Unicode character
     148     database as following categories are considered printable.
     149        * Cc (Other, Control)
     150        * Cf (Other, Format)
     151        * Cs (Other, Surrogate)
     152        * Co (Other, Private Use)
     153        * Cn (Other, Not Assigned)
     154        * Zl Separator, Line ('\u2028', LINE SEPARATOR)
     155        * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
     156        * Zs (Separator, Space) other than ASCII space('\x20').
     157  */
     158  int _PyUnicode_IsPrintable(Py_UCS4 ch)
     159  {
     160      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     161  
     162      return (ctype->flags & PRINTABLE_MASK) != 0;
     163  }
     164  
     165  /* Returns 1 for Unicode characters having the category 'Ll', 0
     166     otherwise. */
     167  
     168  int _PyUnicode_IsLowercase(Py_UCS4 ch)
     169  {
     170      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     171  
     172      return (ctype->flags & LOWER_MASK) != 0;
     173  }
     174  
     175  /* Returns 1 for Unicode characters having the category 'Lu', 0
     176     otherwise. */
     177  
     178  int _PyUnicode_IsUppercase(Py_UCS4 ch)
     179  {
     180      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     181  
     182      return (ctype->flags & UPPER_MASK) != 0;
     183  }
     184  
     185  /* Returns the uppercase Unicode characters corresponding to ch or just
     186     ch if no uppercase mapping is known. */
     187  
     188  Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
     189  {
     190      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     191  
     192      if (ctype->flags & EXTENDED_CASE_MASK)
     193          return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
     194      return ch + ctype->upper;
     195  }
     196  
     197  /* Returns the lowercase Unicode characters corresponding to ch or just
     198     ch if no lowercase mapping is known. */
     199  
     200  Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
     201  {
     202      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     203  
     204      if (ctype->flags & EXTENDED_CASE_MASK)
     205          return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
     206      return ch + ctype->lower;
     207  }
     208  
     209  int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
     210  {
     211      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     212  
     213      if (ctype->flags & EXTENDED_CASE_MASK) {
     214          int index = ctype->lower & 0xFFFF;
     215          int n = ctype->lower >> 24;
     216          int i;
     217          for (i = 0; i < n; i++)
     218              res[i] = _PyUnicode_ExtendedCase[index + i];
     219          return n;
     220      }
     221      res[0] = ch + ctype->lower;
     222      return 1;
     223  }
     224  
     225  int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
     226  {
     227      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     228  
     229      if (ctype->flags & EXTENDED_CASE_MASK) {
     230          int index = ctype->title & 0xFFFF;
     231          int n = ctype->title >> 24;
     232          int i;
     233          for (i = 0; i < n; i++)
     234              res[i] = _PyUnicode_ExtendedCase[index + i];
     235          return n;
     236      }
     237      res[0] = ch + ctype->title;
     238      return 1;
     239  }
     240  
     241  int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
     242  {
     243      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     244  
     245      if (ctype->flags & EXTENDED_CASE_MASK) {
     246          int index = ctype->upper & 0xFFFF;
     247          int n = ctype->upper >> 24;
     248          int i;
     249          for (i = 0; i < n; i++)
     250              res[i] = _PyUnicode_ExtendedCase[index + i];
     251          return n;
     252      }
     253      res[0] = ch + ctype->upper;
     254      return 1;
     255  }
     256  
     257  int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
     258  {
     259      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     260  
     261      if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
     262          int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
     263          int n = (ctype->lower >> 20) & 7;
     264          int i;
     265          for (i = 0; i < n; i++)
     266              res[i] = _PyUnicode_ExtendedCase[index + i];
     267          return n;
     268      }
     269      return _PyUnicode_ToLowerFull(ch, res);
     270  }
     271  
     272  int _PyUnicode_IsCased(Py_UCS4 ch)
     273  {
     274      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     275  
     276      return (ctype->flags & CASED_MASK) != 0;
     277  }
     278  
     279  int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
     280  {
     281      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     282  
     283      return (ctype->flags & CASE_IGNORABLE_MASK) != 0;
     284  }
     285  
     286  /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
     287     'Lo' or 'Lm',  0 otherwise. */
     288  
     289  int _PyUnicode_IsAlpha(Py_UCS4 ch)
     290  {
     291      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
     292  
     293      return (ctype->flags & ALPHA_MASK) != 0;
     294  }
     295