python (3.11.7)
       1  ######################## BEGIN LICENSE BLOCK ########################
       2  #
       3  # Contributor(s):
       4  #   Jason Zavaglia
       5  #
       6  # This library is free software; you can redistribute it and/or
       7  # modify it under the terms of the GNU Lesser General Public
       8  # License as published by the Free Software Foundation; either
       9  # version 2.1 of the License, or (at your option) any later version.
      10  #
      11  # This library is distributed in the hope that it will be useful,
      12  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14  # Lesser General Public License for more details.
      15  #
      16  # You should have received a copy of the GNU Lesser General Public
      17  # License along with this library; if not, write to the Free Software
      18  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
      19  # 02110-1301  USA
      20  ######################### END LICENSE BLOCK #########################
      21  from typing import List, Union
      22  
      23  from .charsetprober import CharSetProber
      24  from .enums import ProbingState
      25  
      26  
      27  class ESC[4;38;5;81mUTF1632Prober(ESC[4;38;5;149mCharSetProber):
      28      """
      29      This class simply looks for occurrences of zero bytes, and infers
      30      whether the file is UTF16 or UTF32 (low-endian or big-endian)
      31      For instance, files looking like ( \0 \0 \0 [nonzero] )+
      32      have a good probability to be UTF32BE.  Files looking like ( \0 [nonzero] )+
      33      may be guessed to be UTF16BE, and inversely for little-endian varieties.
      34      """
      35  
      36      # how many logical characters to scan before feeling confident of prediction
      37      MIN_CHARS_FOR_DETECTION = 20
      38      # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
      39      EXPECTED_RATIO = 0.94
      40  
      41      def __init__(self) -> None:
      42          super().__init__()
      43          self.position = 0
      44          self.zeros_at_mod = [0] * 4
      45          self.nonzeros_at_mod = [0] * 4
      46          self._state = ProbingState.DETECTING
      47          self.quad = [0, 0, 0, 0]
      48          self.invalid_utf16be = False
      49          self.invalid_utf16le = False
      50          self.invalid_utf32be = False
      51          self.invalid_utf32le = False
      52          self.first_half_surrogate_pair_detected_16be = False
      53          self.first_half_surrogate_pair_detected_16le = False
      54          self.reset()
      55  
      56      def reset(self) -> None:
      57          super().reset()
      58          self.position = 0
      59          self.zeros_at_mod = [0] * 4
      60          self.nonzeros_at_mod = [0] * 4
      61          self._state = ProbingState.DETECTING
      62          self.invalid_utf16be = False
      63          self.invalid_utf16le = False
      64          self.invalid_utf32be = False
      65          self.invalid_utf32le = False
      66          self.first_half_surrogate_pair_detected_16be = False
      67          self.first_half_surrogate_pair_detected_16le = False
      68          self.quad = [0, 0, 0, 0]
      69  
      70      @property
      71      def charset_name(self) -> str:
      72          if self.is_likely_utf32be():
      73              return "utf-32be"
      74          if self.is_likely_utf32le():
      75              return "utf-32le"
      76          if self.is_likely_utf16be():
      77              return "utf-16be"
      78          if self.is_likely_utf16le():
      79              return "utf-16le"
      80          # default to something valid
      81          return "utf-16"
      82  
      83      @property
      84      def language(self) -> str:
      85          return ""
      86  
      87      def approx_32bit_chars(self) -> float:
      88          return max(1.0, self.position / 4.0)
      89  
      90      def approx_16bit_chars(self) -> float:
      91          return max(1.0, self.position / 2.0)
      92  
      93      def is_likely_utf32be(self) -> bool:
      94          approx_chars = self.approx_32bit_chars()
      95          return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
      96              self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
      97              and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
      98              and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
      99              and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
     100              and not self.invalid_utf32be
     101          )
     102  
     103      def is_likely_utf32le(self) -> bool:
     104          approx_chars = self.approx_32bit_chars()
     105          return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
     106              self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
     107              and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
     108              and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
     109              and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
     110              and not self.invalid_utf32le
     111          )
     112  
     113      def is_likely_utf16be(self) -> bool:
     114          approx_chars = self.approx_16bit_chars()
     115          return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
     116              (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
     117              > self.EXPECTED_RATIO
     118              and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars
     119              > self.EXPECTED_RATIO
     120              and not self.invalid_utf16be
     121          )
     122  
     123      def is_likely_utf16le(self) -> bool:
     124          approx_chars = self.approx_16bit_chars()
     125          return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
     126              (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
     127              > self.EXPECTED_RATIO
     128              and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars
     129              > self.EXPECTED_RATIO
     130              and not self.invalid_utf16le
     131          )
     132  
     133      def validate_utf32_characters(self, quad: List[int]) -> None:
     134          """
     135          Validate if the quad of bytes is valid UTF-32.
     136  
     137          UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
     138          excluding 0x0000D800 - 0x0000DFFF
     139  
     140          https://en.wikipedia.org/wiki/UTF-32
     141          """
     142          if (
     143              quad[0] != 0
     144              or quad[1] > 0x10
     145              or (quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
     146          ):
     147              self.invalid_utf32be = True
     148          if (
     149              quad[3] != 0
     150              or quad[2] > 0x10
     151              or (quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
     152          ):
     153              self.invalid_utf32le = True
     154  
     155      def validate_utf16_characters(self, pair: List[int]) -> None:
     156          """
     157          Validate if the pair of bytes is  valid UTF-16.
     158  
     159          UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
     160          with an exception for surrogate pairs, which must be in the range
     161          0xD800-0xDBFF followed by 0xDC00-0xDFFF
     162  
     163          https://en.wikipedia.org/wiki/UTF-16
     164          """
     165          if not self.first_half_surrogate_pair_detected_16be:
     166              if 0xD8 <= pair[0] <= 0xDB:
     167                  self.first_half_surrogate_pair_detected_16be = True
     168              elif 0xDC <= pair[0] <= 0xDF:
     169                  self.invalid_utf16be = True
     170          else:
     171              if 0xDC <= pair[0] <= 0xDF:
     172                  self.first_half_surrogate_pair_detected_16be = False
     173              else:
     174                  self.invalid_utf16be = True
     175  
     176          if not self.first_half_surrogate_pair_detected_16le:
     177              if 0xD8 <= pair[1] <= 0xDB:
     178                  self.first_half_surrogate_pair_detected_16le = True
     179              elif 0xDC <= pair[1] <= 0xDF:
     180                  self.invalid_utf16le = True
     181          else:
     182              if 0xDC <= pair[1] <= 0xDF:
     183                  self.first_half_surrogate_pair_detected_16le = False
     184              else:
     185                  self.invalid_utf16le = True
     186  
     187      def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
     188          for c in byte_str:
     189              mod4 = self.position % 4
     190              self.quad[mod4] = c
     191              if mod4 == 3:
     192                  self.validate_utf32_characters(self.quad)
     193                  self.validate_utf16_characters(self.quad[0:2])
     194                  self.validate_utf16_characters(self.quad[2:4])
     195              if c == 0:
     196                  self.zeros_at_mod[mod4] += 1
     197              else:
     198                  self.nonzeros_at_mod[mod4] += 1
     199              self.position += 1
     200          return self.state
     201  
     202      @property
     203      def state(self) -> ProbingState:
     204          if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
     205              # terminal, decided states
     206              return self._state
     207          if self.get_confidence() > 0.80:
     208              self._state = ProbingState.FOUND_IT
     209          elif self.position > 4 * 1024:
     210              # if we get to 4kb into the file, and we can't conclude it's UTF,
     211              # let's give up
     212              self._state = ProbingState.NOT_ME
     213          return self._state
     214  
     215      def get_confidence(self) -> float:
     216          return (
     217              0.85
     218              if (
     219                  self.is_likely_utf16le()
     220                  or self.is_likely_utf16be()
     221                  or self.is_likely_utf32le()
     222                  or self.is_likely_utf32be()
     223              )
     224              else 0.00
     225          )