python (3.11.7)
       1  ######################## BEGIN LICENSE BLOCK ########################
       2  # The Original Code is Mozilla Communicator client code.
       3  #
       4  # The Initial Developer of the Original Code is
       5  # Netscape Communications Corporation.
       6  # Portions created by the Initial Developer are Copyright (C) 1998
       7  # the Initial Developer. All Rights Reserved.
       8  #
       9  # Contributor(s):
      10  #   Mark Pilgrim - port to Python
      11  #
      12  # This library is free software; you can redistribute it and/or
      13  # modify it under the terms of the GNU Lesser General Public
      14  # License as published by the Free Software Foundation; either
      15  # version 2.1 of the License, or (at your option) any later version.
      16  #
      17  # This library is distributed in the hope that it will be useful,
      18  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      19  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      20  # Lesser General Public License for more details.
      21  #
      22  # You should have received a copy of the GNU Lesser General Public
      23  # License along with this library; if not, write to the Free Software
      24  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
      25  # 02110-1301  USA
      26  ######################### END LICENSE BLOCK #########################
      27  
      28  from typing import Tuple, Union
      29  
      30  from .big5freq import (
      31      BIG5_CHAR_TO_FREQ_ORDER,
      32      BIG5_TABLE_SIZE,
      33      BIG5_TYPICAL_DISTRIBUTION_RATIO,
      34  )
      35  from .euckrfreq import (
      36      EUCKR_CHAR_TO_FREQ_ORDER,
      37      EUCKR_TABLE_SIZE,
      38      EUCKR_TYPICAL_DISTRIBUTION_RATIO,
      39  )
      40  from .euctwfreq import (
      41      EUCTW_CHAR_TO_FREQ_ORDER,
      42      EUCTW_TABLE_SIZE,
      43      EUCTW_TYPICAL_DISTRIBUTION_RATIO,
      44  )
      45  from .gb2312freq import (
      46      GB2312_CHAR_TO_FREQ_ORDER,
      47      GB2312_TABLE_SIZE,
      48      GB2312_TYPICAL_DISTRIBUTION_RATIO,
      49  )
      50  from .jisfreq import (
      51      JIS_CHAR_TO_FREQ_ORDER,
      52      JIS_TABLE_SIZE,
      53      JIS_TYPICAL_DISTRIBUTION_RATIO,
      54  )
      55  from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
      56  
      57  
      58  class ESC[4;38;5;81mCharDistributionAnalysis:
      59      ENOUGH_DATA_THRESHOLD = 1024
      60      SURE_YES = 0.99
      61      SURE_NO = 0.01
      62      MINIMUM_DATA_THRESHOLD = 3
      63  
      64      def __init__(self) -> None:
      65          # Mapping table to get frequency order from char order (get from
      66          # GetOrder())
      67          self._char_to_freq_order: Tuple[int, ...] = tuple()
      68          self._table_size = 0  # Size of above table
      69          # This is a constant value which varies from language to language,
      70          # used in calculating confidence.  See
      71          # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
      72          # for further detail.
      73          self.typical_distribution_ratio = 0.0
      74          self._done = False
      75          self._total_chars = 0
      76          self._freq_chars = 0
      77          self.reset()
      78  
      79      def reset(self) -> None:
      80          """reset analyser, clear any state"""
      81          # If this flag is set to True, detection is done and conclusion has
      82          # been made
      83          self._done = False
      84          self._total_chars = 0  # Total characters encountered
      85          # The number of characters whose frequency order is less than 512
      86          self._freq_chars = 0
      87  
      88      def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
      89          """feed a character with known length"""
      90          if char_len == 2:
      91              # we only care about 2-bytes character in our distribution analysis
      92              order = self.get_order(char)
      93          else:
      94              order = -1
      95          if order >= 0:
      96              self._total_chars += 1
      97              # order is valid
      98              if order < self._table_size:
      99                  if 512 > self._char_to_freq_order[order]:
     100                      self._freq_chars += 1
     101  
     102      def get_confidence(self) -> float:
     103          """return confidence based on existing data"""
     104          # if we didn't receive any character in our consideration range,
     105          # return negative answer
     106          if self._total_chars <= 0 or self._freq_chars <= self.MINIMUM_DATA_THRESHOLD:
     107              return self.SURE_NO
     108  
     109          if self._total_chars != self._freq_chars:
     110              r = self._freq_chars / (
     111                  (self._total_chars - self._freq_chars) * self.typical_distribution_ratio
     112              )
     113              if r < self.SURE_YES:
     114                  return r
     115  
     116          # normalize confidence (we don't want to be 100% sure)
     117          return self.SURE_YES
     118  
     119      def got_enough_data(self) -> bool:
     120          # It is not necessary to receive all data to draw conclusion.
     121          # For charset detection, certain amount of data is enough
     122          return self._total_chars > self.ENOUGH_DATA_THRESHOLD
     123  
     124      def get_order(self, _: Union[bytes, bytearray]) -> int:
     125          # We do not handle characters based on the original encoding string,
     126          # but convert this encoding string to a number, here called order.
     127          # This allows multiple encodings of a language to share one frequency
     128          # table.
     129          return -1
     130  
     131  
     132  class ESC[4;38;5;81mEUCTWDistributionAnalysis(ESC[4;38;5;149mCharDistributionAnalysis):
     133      def __init__(self) -> None:
     134          super().__init__()
     135          self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
     136          self._table_size = EUCTW_TABLE_SIZE
     137          self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
     138  
     139      def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
     140          # for euc-TW encoding, we are interested
     141          #   first  byte range: 0xc4 -- 0xfe
     142          #   second byte range: 0xa1 -- 0xfe
     143          # no validation needed here. State machine has done that
     144          first_char = byte_str[0]
     145          if first_char >= 0xC4:
     146              return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
     147          return -1
     148  
     149  
     150  class ESC[4;38;5;81mEUCKRDistributionAnalysis(ESC[4;38;5;149mCharDistributionAnalysis):
     151      def __init__(self) -> None:
     152          super().__init__()
     153          self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
     154          self._table_size = EUCKR_TABLE_SIZE
     155          self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
     156  
     157      def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
     158          # for euc-KR encoding, we are interested
     159          #   first  byte range: 0xb0 -- 0xfe
     160          #   second byte range: 0xa1 -- 0xfe
     161          # no validation needed here. State machine has done that
     162          first_char = byte_str[0]
     163          if first_char >= 0xB0:
     164              return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
     165          return -1
     166  
     167  
     168  class ESC[4;38;5;81mJOHABDistributionAnalysis(ESC[4;38;5;149mCharDistributionAnalysis):
     169      def __init__(self) -> None:
     170          super().__init__()
     171          self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
     172          self._table_size = EUCKR_TABLE_SIZE
     173          self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
     174  
     175      def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
     176          first_char = byte_str[0]
     177          if 0x88 <= first_char < 0xD4:
     178              code = first_char * 256 + byte_str[1]
     179              return JOHAB_TO_EUCKR_ORDER_TABLE.get(code, -1)
     180          return -1
     181  
     182  
     183  class ESC[4;38;5;81mGB2312DistributionAnalysis(ESC[4;38;5;149mCharDistributionAnalysis):
     184      def __init__(self) -> None:
     185          super().__init__()
     186          self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
     187          self._table_size = GB2312_TABLE_SIZE
     188          self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO
     189  
     190      def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
     191          # for GB2312 encoding, we are interested
     192          #  first  byte range: 0xb0 -- 0xfe
     193          #  second byte range: 0xa1 -- 0xfe
     194          # no validation needed here. State machine has done that
     195          first_char, second_char = byte_str[0], byte_str[1]
     196          if (first_char >= 0xB0) and (second_char >= 0xA1):
     197              return 94 * (first_char - 0xB0) + second_char - 0xA1
     198          return -1
     199  
     200  
     201  class ESC[4;38;5;81mBig5DistributionAnalysis(ESC[4;38;5;149mCharDistributionAnalysis):
     202      def __init__(self) -> None:
     203          super().__init__()
     204          self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
     205          self._table_size = BIG5_TABLE_SIZE
     206          self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO
     207  
     208      def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
     209          # for big5 encoding, we are interested
     210          #   first  byte range: 0xa4 -- 0xfe
     211          #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
     212          # no validation needed here. State machine has done that
     213          first_char, second_char = byte_str[0], byte_str[1]
     214          if first_char >= 0xA4:
     215              if second_char >= 0xA1:
     216                  return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
     217              return 157 * (first_char - 0xA4) + second_char - 0x40
     218          return -1
     219  
     220  
     221  class ESC[4;38;5;81mSJISDistributionAnalysis(ESC[4;38;5;149mCharDistributionAnalysis):
     222      def __init__(self) -> None:
     223          super().__init__()
     224          self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
     225          self._table_size = JIS_TABLE_SIZE
     226          self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
     227  
     228      def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
     229          # for sjis encoding, we are interested
     230          #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
     231          #   second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
     232          # no validation needed here. State machine has done that
     233          first_char, second_char = byte_str[0], byte_str[1]
     234          if 0x81 <= first_char <= 0x9F:
     235              order = 188 * (first_char - 0x81)
     236          elif 0xE0 <= first_char <= 0xEF:
     237              order = 188 * (first_char - 0xE0 + 31)
     238          else:
     239              return -1
     240          order = order + second_char - 0x40
     241          if second_char > 0x7F:
     242              order = -1
     243          return order
     244  
     245  
     246  class ESC[4;38;5;81mEUCJPDistributionAnalysis(ESC[4;38;5;149mCharDistributionAnalysis):
     247      def __init__(self) -> None:
     248          super().__init__()
     249          self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
     250          self._table_size = JIS_TABLE_SIZE
     251          self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO
     252  
     253      def get_order(self, byte_str: Union[bytes, bytearray]) -> int:
     254          # for euc-JP encoding, we are interested
     255          #   first  byte range: 0xa0 -- 0xfe
     256          #   second byte range: 0xa1 -- 0xfe
     257          # no validation needed here. State machine has done that
     258          char = byte_str[0]
     259          if char >= 0xA0:
     260              return 94 * (char - 0xA1) + byte_str[1] - 0xA1
     261          return -1