python (3.12.0)

Browse
Build Log
Usage
       1  # unicode.py
       2  
       3  import sys
       4  from itertools import filterfalse
       5  from typing import List, Tuple, Union
       6  
       7  
       8  class ESC[4;38;5;81m_lazyclassproperty:
       9      def __init__(self, fn):
      10          self.fn = fn
      11          self.__doc__ = fn.__doc__
      12          self.__name__ = fn.__name__
      13  
      14      def __get__(self, obj, cls):
      15          if cls is None:
      16              cls = type(obj)
      17          if not hasattr(cls, "_intern") or any(
      18              cls._intern is getattr(superclass, "_intern", [])
      19              for superclass in cls.__mro__[1:]
      20          ):
      21              cls._intern = {}
      22          attrname = self.fn.__name__
      23          if attrname not in cls._intern:
      24              cls._intern[attrname] = self.fn(cls)
      25          return cls._intern[attrname]
      26  
      27  
      28  UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
      29  
      30  
      31  class ESC[4;38;5;81municode_set:
      32      """
      33      A set of Unicode characters, for language-specific strings for
      34      ``alphas``, ``nums``, ``alphanums``, and ``printables``.
      35      A unicode_set is defined by a list of ranges in the Unicode character
      36      set, in a class attribute ``_ranges``. Ranges can be specified using
      37      2-tuples or a 1-tuple, such as::
      38  
      39          _ranges = [
      40              (0x0020, 0x007e),
      41              (0x00a0, 0x00ff),
      42              (0x0100,),
      43              ]
      44  
      45      Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).
      46  
      47      A unicode set can also be defined using multiple inheritance of other unicode sets::
      48  
      49          class CJK(Chinese, Japanese, Korean):
      50              pass
      51      """
      52  
      53      _ranges: UnicodeRangeList = []
      54  
      55      @_lazyclassproperty
      56      def _chars_for_ranges(cls):
      57          ret = []
      58          for cc in cls.__mro__:
      59              if cc is unicode_set:
      60                  break
      61              for rr in getattr(cc, "_ranges", ()):
      62                  ret.extend(range(rr[0], rr[-1] + 1))
      63          return [chr(c) for c in sorted(set(ret))]
      64  
      65      @_lazyclassproperty
      66      def printables(cls):
      67          """all non-whitespace characters in this range"""
      68          return "".join(filterfalse(str.isspace, cls._chars_for_ranges))
      69  
      70      @_lazyclassproperty
      71      def alphas(cls):
      72          """all alphabetic characters in this range"""
      73          return "".join(filter(str.isalpha, cls._chars_for_ranges))
      74  
      75      @_lazyclassproperty
      76      def nums(cls):
      77          """all numeric digit characters in this range"""
      78          return "".join(filter(str.isdigit, cls._chars_for_ranges))
      79  
      80      @_lazyclassproperty
      81      def alphanums(cls):
      82          """all alphanumeric characters in this range"""
      83          return cls.alphas + cls.nums
      84  
      85      @_lazyclassproperty
      86      def identchars(cls):
      87          """all characters in this range that are valid identifier characters, plus underscore '_'"""
      88          return "".join(
      89              sorted(
      90                  set(
      91                      "".join(filter(str.isidentifier, cls._chars_for_ranges))
      92                      + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
      93                      + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
      94                      + "_"
      95                  )
      96              )
      97          )
      98  
      99      @_lazyclassproperty
     100      def identbodychars(cls):
     101          """
     102          all characters in this range that are valid identifier body characters,
     103          plus the digits 0-9, and · (Unicode MIDDLE DOT)
     104          """
     105          return "".join(
     106              sorted(
     107                  set(
     108                      cls.identchars
     109                      + "0123456789·"
     110                      + "".join(
     111                          [c for c in cls._chars_for_ranges if ("_" + c).isidentifier()]
     112                      )
     113                  )
     114              )
     115          )
     116  
     117      @_lazyclassproperty
     118      def identifier(cls):
     119          """
     120          a pyparsing Word expression for an identifier using this range's definitions for
     121          identchars and identbodychars
     122          """
     123          from pip._vendor.pyparsing import Word
     124  
     125          return Word(cls.identchars, cls.identbodychars)
     126  
     127  
     128  class ESC[4;38;5;81mpyparsing_unicode(ESC[4;38;5;149municode_set):
     129      """
     130      A namespace class for defining common language unicode_sets.
     131      """
     132  
     133      # fmt: off
     134  
     135      # define ranges in language character sets
     136      _ranges: UnicodeRangeList = [
     137          (0x0020, sys.maxunicode),
     138      ]
     139  
     140      class ESC[4;38;5;81mBasicMultilingualPlane(ESC[4;38;5;149municode_set):
     141          """Unicode set for the Basic Multilingual Plane"""
     142          _ranges: UnicodeRangeList = [
     143              (0x0020, 0xFFFF),
     144          ]
     145  
     146      class ESC[4;38;5;81mLatin1(ESC[4;38;5;149municode_set):
     147          """Unicode set for Latin-1 Unicode Character Range"""
     148          _ranges: UnicodeRangeList = [
     149              (0x0020, 0x007E),
     150              (0x00A0, 0x00FF),
     151          ]
     152  
     153      class ESC[4;38;5;81mLatinA(ESC[4;38;5;149municode_set):
     154          """Unicode set for Latin-A Unicode Character Range"""
     155          _ranges: UnicodeRangeList = [
     156              (0x0100, 0x017F),
     157          ]
     158  
     159      class ESC[4;38;5;81mLatinB(ESC[4;38;5;149municode_set):
     160          """Unicode set for Latin-B Unicode Character Range"""
     161          _ranges: UnicodeRangeList = [
     162              (0x0180, 0x024F),
     163          ]
     164  
     165      class ESC[4;38;5;81mGreek(ESC[4;38;5;149municode_set):
     166          """Unicode set for Greek Unicode Character Ranges"""
     167          _ranges: UnicodeRangeList = [
     168              (0x0342, 0x0345),
     169              (0x0370, 0x0377),
     170              (0x037A, 0x037F),
     171              (0x0384, 0x038A),
     172              (0x038C,),
     173              (0x038E, 0x03A1),
     174              (0x03A3, 0x03E1),
     175              (0x03F0, 0x03FF),
     176              (0x1D26, 0x1D2A),
     177              (0x1D5E,),
     178              (0x1D60,),
     179              (0x1D66, 0x1D6A),
     180              (0x1F00, 0x1F15),
     181              (0x1F18, 0x1F1D),
     182              (0x1F20, 0x1F45),
     183              (0x1F48, 0x1F4D),
     184              (0x1F50, 0x1F57),
     185              (0x1F59,),
     186              (0x1F5B,),
     187              (0x1F5D,),
     188              (0x1F5F, 0x1F7D),
     189              (0x1F80, 0x1FB4),
     190              (0x1FB6, 0x1FC4),
     191              (0x1FC6, 0x1FD3),
     192              (0x1FD6, 0x1FDB),
     193              (0x1FDD, 0x1FEF),
     194              (0x1FF2, 0x1FF4),
     195              (0x1FF6, 0x1FFE),
     196              (0x2129,),
     197              (0x2719, 0x271A),
     198              (0xAB65,),
     199              (0x10140, 0x1018D),
     200              (0x101A0,),
     201              (0x1D200, 0x1D245),
     202              (0x1F7A1, 0x1F7A7),
     203          ]
     204  
     205      class ESC[4;38;5;81mCyrillic(ESC[4;38;5;149municode_set):
     206          """Unicode set for Cyrillic Unicode Character Range"""
     207          _ranges: UnicodeRangeList = [
     208              (0x0400, 0x052F),
     209              (0x1C80, 0x1C88),
     210              (0x1D2B,),
     211              (0x1D78,),
     212              (0x2DE0, 0x2DFF),
     213              (0xA640, 0xA672),
     214              (0xA674, 0xA69F),
     215              (0xFE2E, 0xFE2F),
     216          ]
     217  
     218      class ESC[4;38;5;81mChinese(ESC[4;38;5;149municode_set):
     219          """Unicode set for Chinese Unicode Character Range"""
     220          _ranges: UnicodeRangeList = [
     221              (0x2E80, 0x2E99),
     222              (0x2E9B, 0x2EF3),
     223              (0x31C0, 0x31E3),
     224              (0x3400, 0x4DB5),
     225              (0x4E00, 0x9FEF),
     226              (0xA700, 0xA707),
     227              (0xF900, 0xFA6D),
     228              (0xFA70, 0xFAD9),
     229              (0x16FE2, 0x16FE3),
     230              (0x1F210, 0x1F212),
     231              (0x1F214, 0x1F23B),
     232              (0x1F240, 0x1F248),
     233              (0x20000, 0x2A6D6),
     234              (0x2A700, 0x2B734),
     235              (0x2B740, 0x2B81D),
     236              (0x2B820, 0x2CEA1),
     237              (0x2CEB0, 0x2EBE0),
     238              (0x2F800, 0x2FA1D),
     239          ]
     240  
     241      class ESC[4;38;5;81mJapanese(ESC[4;38;5;149municode_set):
     242          """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""
     243  
     244          class ESC[4;38;5;81mKanji(ESC[4;38;5;149municode_set):
     245              "Unicode set for Kanji Unicode Character Range"
     246              _ranges: UnicodeRangeList = [
     247                  (0x4E00, 0x9FBF),
     248                  (0x3000, 0x303F),
     249              ]
     250  
     251          class ESC[4;38;5;81mHiragana(ESC[4;38;5;149municode_set):
     252              """Unicode set for Hiragana Unicode Character Range"""
     253              _ranges: UnicodeRangeList = [
     254                  (0x3041, 0x3096),
     255                  (0x3099, 0x30A0),
     256                  (0x30FC,),
     257                  (0xFF70,),
     258                  (0x1B001,),
     259                  (0x1B150, 0x1B152),
     260                  (0x1F200,),
     261              ]
     262  
     263          class ESC[4;38;5;81mKatakana(ESC[4;38;5;149municode_set):
     264              """Unicode set for Katakana  Unicode Character Range"""
     265              _ranges: UnicodeRangeList = [
     266                  (0x3099, 0x309C),
     267                  (0x30A0, 0x30FF),
     268                  (0x31F0, 0x31FF),
     269                  (0x32D0, 0x32FE),
     270                  (0xFF65, 0xFF9F),
     271                  (0x1B000,),
     272                  (0x1B164, 0x1B167),
     273                  (0x1F201, 0x1F202),
     274                  (0x1F213,),
     275              ]
     276  
     277          漢字 = Kanji
     278          カタカナ = Katakana
     279          ひらがな = Hiragana
     280  
     281          _ranges = (
     282              Kanji._ranges
     283              + Hiragana._ranges
     284              + Katakana._ranges
     285          )
     286  
     287      class ESC[4;38;5;81mHangul(ESC[4;38;5;149municode_set):
     288          """Unicode set for Hangul (Korean) Unicode Character Range"""
     289          _ranges: UnicodeRangeList = [
     290              (0x1100, 0x11FF),
     291              (0x302E, 0x302F),
     292              (0x3131, 0x318E),
     293              (0x3200, 0x321C),
     294              (0x3260, 0x327B),
     295              (0x327E,),
     296              (0xA960, 0xA97C),
     297              (0xAC00, 0xD7A3),
     298              (0xD7B0, 0xD7C6),
     299              (0xD7CB, 0xD7FB),
     300              (0xFFA0, 0xFFBE),
     301              (0xFFC2, 0xFFC7),
     302              (0xFFCA, 0xFFCF),
     303              (0xFFD2, 0xFFD7),
     304              (0xFFDA, 0xFFDC),
     305          ]
     306  
     307      Korean = Hangul
     308  
     309      class ESC[4;38;5;81mCJK(ESC[4;38;5;149mChinese, ESC[4;38;5;149mJapanese, ESC[4;38;5;149mHangul):
     310          """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""
     311  
     312      class ESC[4;38;5;81mThai(ESC[4;38;5;149municode_set):
     313          """Unicode set for Thai Unicode Character Range"""
     314          _ranges: UnicodeRangeList = [
     315              (0x0E01, 0x0E3A),
     316              (0x0E3F, 0x0E5B)
     317          ]
     318  
     319      class ESC[4;38;5;81mArabic(ESC[4;38;5;149municode_set):
     320          """Unicode set for Arabic Unicode Character Range"""
     321          _ranges: UnicodeRangeList = [
     322              (0x0600, 0x061B),
     323              (0x061E, 0x06FF),
     324              (0x0700, 0x077F),
     325          ]
     326  
     327      class ESC[4;38;5;81mHebrew(ESC[4;38;5;149municode_set):
     328          """Unicode set for Hebrew Unicode Character Range"""
     329          _ranges: UnicodeRangeList = [
     330              (0x0591, 0x05C7),
     331              (0x05D0, 0x05EA),
     332              (0x05EF, 0x05F4),
     333              (0xFB1D, 0xFB36),
     334              (0xFB38, 0xFB3C),
     335              (0xFB3E,),
     336              (0xFB40, 0xFB41),
     337              (0xFB43, 0xFB44),
     338              (0xFB46, 0xFB4F),
     339          ]
     340  
     341      class ESC[4;38;5;81mDevanagari(ESC[4;38;5;149municode_set):
     342          """Unicode set for Devanagari Unicode Character Range"""
     343          _ranges: UnicodeRangeList = [
     344              (0x0900, 0x097F),
     345              (0xA8E0, 0xA8FF)
     346          ]
     347  
     348      BMP = BasicMultilingualPlane
     349  
     350      # add language identifiers using language Unicode
     351      العربية = Arabic
     352      中文 = Chinese
     353      кириллица = Cyrillic
     354      Ελληνικά = Greek
     355      עִברִית = Hebrew
     356      日本語 = Japanese
     357      한국어 = Korean
     358      ไทย = Thai
     359      देवनागरी = Devanagari
     360  
     361      # fmt: on