python (3.11.7)
       1  ######################## BEGIN LICENSE BLOCK ########################
       2  # The Original Code is mozilla.org code.
       3  #
       4  # The Initial Developer of the Original Code is
       5  # Netscape Communications Corporation.
       6  # Portions created by the Initial Developer are Copyright (C) 1998
       7  # the Initial Developer. All Rights Reserved.
       8  #
       9  # Contributor(s):
      10  #   Mark Pilgrim - port to Python
      11  #
      12  # This library is free software; you can redistribute it and/or
      13  # modify it under the terms of the GNU Lesser General Public
      14  # License as published by the Free Software Foundation; either
      15  # version 2.1 of the License, or (at your option) any later version.
      16  #
      17  # This library is distributed in the hope that it will be useful,
      18  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      19  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      20  # Lesser General Public License for more details.
      21  #
      22  # You should have received a copy of the GNU Lesser General Public
      23  # License along with this library; if not, write to the Free Software
      24  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
      25  # 02110-1301  USA
      26  ######################### END LICENSE BLOCK #########################
      27  
      28  from typing import Union
      29  
      30  from .chardistribution import SJISDistributionAnalysis
      31  from .codingstatemachine import CodingStateMachine
      32  from .enums import MachineState, ProbingState
      33  from .jpcntx import SJISContextAnalysis
      34  from .mbcharsetprober import MultiByteCharSetProber
      35  from .mbcssm import SJIS_SM_MODEL
      36  
      37  
      38  class ESC[4;38;5;81mSJISProber(ESC[4;38;5;149mMultiByteCharSetProber):
      39      def __init__(self) -> None:
      40          super().__init__()
      41          self.coding_sm = CodingStateMachine(SJIS_SM_MODEL)
      42          self.distribution_analyzer = SJISDistributionAnalysis()
      43          self.context_analyzer = SJISContextAnalysis()
      44          self.reset()
      45  
      46      def reset(self) -> None:
      47          super().reset()
      48          self.context_analyzer.reset()
      49  
      50      @property
      51      def charset_name(self) -> str:
      52          return self.context_analyzer.charset_name
      53  
      54      @property
      55      def language(self) -> str:
      56          return "Japanese"
      57  
      58      def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
      59          assert self.coding_sm is not None
      60          assert self.distribution_analyzer is not None
      61  
      62          for i, byte in enumerate(byte_str):
      63              coding_state = self.coding_sm.next_state(byte)
      64              if coding_state == MachineState.ERROR:
      65                  self.logger.debug(
      66                      "%s %s prober hit error at byte %s",
      67                      self.charset_name,
      68                      self.language,
      69                      i,
      70                  )
      71                  self._state = ProbingState.NOT_ME
      72                  break
      73              if coding_state == MachineState.ITS_ME:
      74                  self._state = ProbingState.FOUND_IT
      75                  break
      76              if coding_state == MachineState.START:
      77                  char_len = self.coding_sm.get_current_charlen()
      78                  if i == 0:
      79                      self._last_char[1] = byte
      80                      self.context_analyzer.feed(
      81                          self._last_char[2 - char_len :], char_len
      82                      )
      83                      self.distribution_analyzer.feed(self._last_char, char_len)
      84                  else:
      85                      self.context_analyzer.feed(
      86                          byte_str[i + 1 - char_len : i + 3 - char_len], char_len
      87                      )
      88                      self.distribution_analyzer.feed(byte_str[i - 1 : i + 1], char_len)
      89  
      90          self._last_char[0] = byte_str[-1]
      91  
      92          if self.state == ProbingState.DETECTING:
      93              if self.context_analyzer.got_enough_data() and (
      94                  self.get_confidence() > self.SHORTCUT_THRESHOLD
      95              ):
      96                  self._state = ProbingState.FOUND_IT
      97  
      98          return self.state
      99  
     100      def get_confidence(self) -> float:
     101          assert self.distribution_analyzer is not None
     102  
     103          context_conf = self.context_analyzer.get_confidence()
     104          distrib_conf = self.distribution_analyzer.get_confidence()
     105          return max(context_conf, distrib_conf)