python (3.11.7)

Browse
Build Log
Usage
       1  ######################## BEGIN LICENSE BLOCK ########################
       2  # The Original Code is Mozilla Universal charset detector code.
       3  #
       4  # The Initial Developer of the Original Code is
       5  # Netscape Communications Corporation.
       6  # Portions created by the Initial Developer are Copyright (C) 2001
       7  # the Initial Developer. All Rights Reserved.
       8  #
       9  # Contributor(s):
      10  #   Mark Pilgrim - port to Python
      11  #   Shy Shalom - original C code
      12  #
      13  # This library is free software; you can redistribute it and/or
      14  # modify it under the terms of the GNU Lesser General Public
      15  # License as published by the Free Software Foundation; either
      16  # version 2.1 of the License, or (at your option) any later version.
      17  #
      18  # This library is distributed in the hope that it will be useful,
      19  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      20  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      21  # Lesser General Public License for more details.
      22  #
      23  # You should have received a copy of the GNU Lesser General Public
      24  # License along with this library; if not, write to the Free Software
      25  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
      26  # 02110-1301  USA
      27  ######################### END LICENSE BLOCK #########################
      28  
      29  import logging
      30  import re
      31  from typing import Optional, Union
      32  
      33  from .enums import LanguageFilter, ProbingState
      34  
      35  INTERNATIONAL_WORDS_PATTERN = re.compile(
      36      b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?"
      37  )
      38  
      39  
      40  class ESC[4;38;5;81mCharSetProber:
      41  
      42      SHORTCUT_THRESHOLD = 0.95
      43  
      44      def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
      45          self._state = ProbingState.DETECTING
      46          self.active = True
      47          self.lang_filter = lang_filter
      48          self.logger = logging.getLogger(__name__)
      49  
      50      def reset(self) -> None:
      51          self._state = ProbingState.DETECTING
      52  
      53      @property
      54      def charset_name(self) -> Optional[str]:
      55          return None
      56  
      57      @property
      58      def language(self) -> Optional[str]:
      59          raise NotImplementedError
      60  
      61      def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
      62          raise NotImplementedError
      63  
      64      @property
      65      def state(self) -> ProbingState:
      66          return self._state
      67  
      68      def get_confidence(self) -> float:
      69          return 0.0
      70  
      71      @staticmethod
      72      def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes:
      73          buf = re.sub(b"([\x00-\x7F])+", b" ", buf)
      74          return buf
      75  
      76      @staticmethod
      77      def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray:
      78          """
      79          We define three types of bytes:
      80          alphabet: english alphabets [a-zA-Z]
      81          international: international characters [\x80-\xFF]
      82          marker: everything else [^a-zA-Z\x80-\xFF]
      83          The input buffer can be thought to contain a series of words delimited
      84          by markers. This function works to filter all words that contain at
      85          least one international character. All contiguous sequences of markers
      86          are replaced by a single space ascii character.
      87          This filter applies to all scripts which do not use English characters.
      88          """
      89          filtered = bytearray()
      90  
      91          # This regex expression filters out only words that have at-least one
      92          # international character. The word may include one marker character at
      93          # the end.
      94          words = INTERNATIONAL_WORDS_PATTERN.findall(buf)
      95  
      96          for word in words:
      97              filtered.extend(word[:-1])
      98  
      99              # If the last character in the word is a marker, replace it with a
     100              # space as markers shouldn't affect our analysis (they are used
     101              # similarly across all languages and may thus have similar
     102              # frequencies).
     103              last_char = word[-1:]
     104              if not last_char.isalpha() and last_char < b"\x80":
     105                  last_char = b" "
     106              filtered.extend(last_char)
     107  
     108          return filtered
     109  
     110      @staticmethod
     111      def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes:
     112          """
     113          Returns a copy of ``buf`` that retains only the sequences of English
     114          alphabet and high byte characters that are not between <> characters.
     115          This filter can be applied to all scripts which contain both English
     116          characters and extended ASCII characters, but is currently only used by
     117          ``Latin1Prober``.
     118          """
     119          filtered = bytearray()
     120          in_tag = False
     121          prev = 0
     122          buf = memoryview(buf).cast("c")
     123  
     124          for curr, buf_char in enumerate(buf):
     125              # Check if we're coming out of or entering an XML tag
     126  
     127              # https://github.com/python/typeshed/issues/8182
     128              if buf_char == b">":  # type: ignore[comparison-overlap]
     129                  prev = curr + 1
     130                  in_tag = False
     131              # https://github.com/python/typeshed/issues/8182
     132              elif buf_char == b"<":  # type: ignore[comparison-overlap]
     133                  if curr > prev and not in_tag:
     134                      # Keep everything after last non-extended-ASCII,
     135                      # non-alphabetic character
     136                      filtered.extend(buf[prev:curr])
     137                      # Output a space to delimit stretch we kept
     138                      filtered.extend(b" ")
     139                  in_tag = True
     140  
     141          # If we're not in a tag...
     142          if not in_tag:
     143              # Keep everything after last non-extended-ASCII, non-alphabetic
     144              # character
     145              filtered.extend(buf[prev:])
     146  
     147          return filtered