python (3.11.7)

Browse
Build Log
Usage
       1  ######################## BEGIN LICENSE BLOCK ########################
       2  # The Original Code is Mozilla Universal charset detector code.
       3  #
       4  # The Initial Developer of the Original Code is
       5  # Netscape Communications Corporation.
       6  # Portions created by the Initial Developer are Copyright (C) 2001
       7  # the Initial Developer. All Rights Reserved.
       8  #
       9  # Contributor(s):
      10  #   Mark Pilgrim - port to Python
      11  #   Shy Shalom - original C code
      12  #
      13  # This library is free software; you can redistribute it and/or
      14  # modify it under the terms of the GNU Lesser General Public
      15  # License as published by the Free Software Foundation; either
      16  # version 2.1 of the License, or (at your option) any later version.
      17  #
      18  # This library is distributed in the hope that it will be useful,
      19  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      20  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      21  # Lesser General Public License for more details.
      22  #
      23  # You should have received a copy of the GNU Lesser General Public
      24  # License along with this library; if not, write to the Free Software
      25  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
      26  # 02110-1301  USA
      27  ######################### END LICENSE BLOCK #########################
      28  
      29  from typing import Dict, List, NamedTuple, Optional, Union
      30  
      31  from .charsetprober import CharSetProber
      32  from .enums import CharacterCategory, ProbingState, SequenceLikelihood
      33  
      34  
      35  class ESC[4;38;5;81mSingleByteCharSetModel(ESC[4;38;5;149mNamedTuple):
      36      charset_name: str
      37      language: str
      38      char_to_order_map: Dict[int, int]
      39      language_model: Dict[int, Dict[int, int]]
      40      typical_positive_ratio: float
      41      keep_ascii_letters: bool
      42      alphabet: str
      43  
      44  
      45  class ESC[4;38;5;81mSingleByteCharSetProber(ESC[4;38;5;149mCharSetProber):
      46      SAMPLE_SIZE = 64
      47      SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
      48      POSITIVE_SHORTCUT_THRESHOLD = 0.95
      49      NEGATIVE_SHORTCUT_THRESHOLD = 0.05
      50  
      51      def __init__(
      52          self,
      53          model: SingleByteCharSetModel,
      54          is_reversed: bool = False,
      55          name_prober: Optional[CharSetProber] = None,
      56      ) -> None:
      57          super().__init__()
      58          self._model = model
      59          # TRUE if we need to reverse every pair in the model lookup
      60          self._reversed = is_reversed
      61          # Optional auxiliary prober for name decision
      62          self._name_prober = name_prober
      63          self._last_order = 255
      64          self._seq_counters: List[int] = []
      65          self._total_seqs = 0
      66          self._total_char = 0
      67          self._control_char = 0
      68          self._freq_char = 0
      69          self.reset()
      70  
      71      def reset(self) -> None:
      72          super().reset()
      73          # char order of last character
      74          self._last_order = 255
      75          self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
      76          self._total_seqs = 0
      77          self._total_char = 0
      78          self._control_char = 0
      79          # characters that fall in our sampling range
      80          self._freq_char = 0
      81  
      82      @property
      83      def charset_name(self) -> Optional[str]:
      84          if self._name_prober:
      85              return self._name_prober.charset_name
      86          return self._model.charset_name
      87  
      88      @property
      89      def language(self) -> Optional[str]:
      90          if self._name_prober:
      91              return self._name_prober.language
      92          return self._model.language
      93  
      94      def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
      95          # TODO: Make filter_international_words keep things in self.alphabet
      96          if not self._model.keep_ascii_letters:
      97              byte_str = self.filter_international_words(byte_str)
      98          else:
      99              byte_str = self.remove_xml_tags(byte_str)
     100          if not byte_str:
     101              return self.state
     102          char_to_order_map = self._model.char_to_order_map
     103          language_model = self._model.language_model
     104          for char in byte_str:
     105              order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
     106              # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
     107              #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
     108              #      to make it closer to the original intent. The only difference
     109              #      is whether or not we count digits and control characters for
     110              #      _total_char purposes.
     111              if order < CharacterCategory.CONTROL:
     112                  self._total_char += 1
     113              if order < self.SAMPLE_SIZE:
     114                  self._freq_char += 1
     115                  if self._last_order < self.SAMPLE_SIZE:
     116                      self._total_seqs += 1
     117                      if not self._reversed:
     118                          lm_cat = language_model[self._last_order][order]
     119                      else:
     120                          lm_cat = language_model[order][self._last_order]
     121                      self._seq_counters[lm_cat] += 1
     122              self._last_order = order
     123  
     124          charset_name = self._model.charset_name
     125          if self.state == ProbingState.DETECTING:
     126              if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
     127                  confidence = self.get_confidence()
     128                  if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
     129                      self.logger.debug(
     130                          "%s confidence = %s, we have a winner", charset_name, confidence
     131                      )
     132                      self._state = ProbingState.FOUND_IT
     133                  elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
     134                      self.logger.debug(
     135                          "%s confidence = %s, below negative shortcut threshold %s",
     136                          charset_name,
     137                          confidence,
     138                          self.NEGATIVE_SHORTCUT_THRESHOLD,
     139                      )
     140                      self._state = ProbingState.NOT_ME
     141  
     142          return self.state
     143  
     144      def get_confidence(self) -> float:
     145          r = 0.01
     146          if self._total_seqs > 0:
     147              r = (
     148                  (
     149                      self._seq_counters[SequenceLikelihood.POSITIVE]
     150                      + 0.25 * self._seq_counters[SequenceLikelihood.LIKELY]
     151                  )
     152                  / self._total_seqs
     153                  / self._model.typical_positive_ratio
     154              )
     155              # The more control characters (proportionnaly to the size
     156              # of the text), the less confident we become in the current
     157              # charset.
     158              r = r * (self._total_char - self._control_char) / self._total_char
     159              r = r * self._freq_char / self._total_char
     160              if r >= 1.0:
     161                  r = 0.99
     162          return r