python (3.12.0)

Browse
Build Log
Usage
       1  ######################## BEGIN LICENSE BLOCK ########################
       2  # The Original Code is Mozilla Universal charset detector code.
       3  #
       4  # The Initial Developer of the Original Code is
       5  #          Shy Shalom
       6  # Portions created by the Initial Developer are Copyright (C) 2005
       7  # the Initial Developer. All Rights Reserved.
       8  #
       9  # Contributor(s):
      10  #   Mark Pilgrim - port to Python
      11  #
      12  # This library is free software; you can redistribute it and/or
      13  # modify it under the terms of the GNU Lesser General Public
      14  # License as published by the Free Software Foundation; either
      15  # version 2.1 of the License, or (at your option) any later version.
      16  #
      17  # This library is distributed in the hope that it will be useful,
      18  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      19  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      20  # Lesser General Public License for more details.
      21  #
      22  # You should have received a copy of the GNU Lesser General Public
      23  # License along with this library; if not, write to the Free Software
      24  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
      25  # 02110-1301  USA
      26  ######################### END LICENSE BLOCK #########################
      27  
      28  from typing import Optional, Union
      29  
      30  from .charsetprober import CharSetProber
      31  from .enums import ProbingState
      32  from .sbcharsetprober import SingleByteCharSetProber
      33  
      34  # This prober doesn't actually recognize a language or a charset.
      35  # It is a helper prober for the use of the Hebrew model probers
      36  
      37  ### General ideas of the Hebrew charset recognition ###
      38  #
      39  # Four main charsets exist in Hebrew:
      40  # "ISO-8859-8" - Visual Hebrew
      41  # "windows-1255" - Logical Hebrew
      42  # "ISO-8859-8-I" - Logical Hebrew
      43  # "x-mac-hebrew" - ?? Logical Hebrew ??
      44  #
      45  # Both "ISO" charsets use a completely identical set of code points, whereas
      46  # "windows-1255" and "x-mac-hebrew" are two different proper supersets of
      47  # these code points. windows-1255 defines additional characters in the range
      48  # 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
      49  # diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
      50  # x-mac-hebrew defines similar additional code points but with a different
      51  # mapping.
      52  #
      53  # As far as an average Hebrew text with no diacritics is concerned, all four
      54  # charsets are identical with respect to code points. Meaning that for the
      55  # main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
      56  # (including final letters).
      57  #
      58  # The dominant difference between these charsets is their directionality.
      59  # "Visual" directionality means that the text is ordered as if the renderer is
      60  # not aware of a BIDI rendering algorithm. The renderer sees the text and
      61  # draws it from left to right. The text itself when ordered naturally is read
      62  # backwards. A buffer of Visual Hebrew generally looks like so:
      63  # "[last word of first line spelled backwards] [whole line ordered backwards
      64  # and spelled backwards] [first word of first line spelled backwards]
      65  # [end of line] [last word of second line] ... etc' "
      66  # adding punctuation marks, numbers and English text to visual text is
      67  # naturally also "visual" and from left to right.
      68  #
      69  # "Logical" directionality means the text is ordered "naturally" according to
      70  # the order it is read. It is the responsibility of the renderer to display
      71  # the text from right to left. A BIDI algorithm is used to place general
      72  # punctuation marks, numbers and English text in the text.
      73  #
      74  # Texts in x-mac-hebrew are almost impossible to find on the Internet. From
      75  # what little evidence I could find, it seems that its general directionality
      76  # is Logical.
      77  #
      78  # To sum up all of the above, the Hebrew probing mechanism knows about two
      79  # charsets:
      80  # Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
      81  #    backwards while line order is natural. For charset recognition purposes
      82  #    the line order is unimportant (In fact, for this implementation, even
      83  #    word order is unimportant).
      84  # Logical Hebrew - "windows-1255" - normal, naturally ordered text.
      85  #
      86  # "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
      87  #    specifically identified.
      88  # "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
      89  #    that contains special punctuation marks or diacritics is displayed with
      90  #    some unconverted characters showing as question marks. This problem might
      91  #    be corrected using another model prober for x-mac-hebrew. Due to the fact
      92  #    that x-mac-hebrew texts are so rare, writing another model prober isn't
      93  #    worth the effort and performance hit.
      94  #
      95  #### The Prober ####
      96  #
      97  # The prober is divided between two SBCharSetProbers and a HebrewProber,
      98  # all of which are managed, created, fed data, inquired and deleted by the
      99  # SBCSGroupProber. The two SBCharSetProbers identify that the text is in
     100  # fact some kind of Hebrew, Logical or Visual. The final decision between the
     101  # two is made by the HebrewProber, which combines final-letter scores
     102  # with the scores of the two SBCharSetProbers to produce a final answer.
     103  #
     104  # The SBCSGroupProber is responsible for stripping the original text of HTML
     105  # tags, English characters, numbers, low-ASCII punctuation characters, spaces
     106  # and new lines. It reduces any sequence of such characters to a single space.
     107  # The buffer fed to each prober in the SBCS group prober is pure text in
     108  # high-ASCII.
     109  # The two SBCharSetProbers (model probers) share the same language model:
     110  # Win1255Model.
     111  # The first SBCharSetProber uses the model normally as any other
     112  # SBCharSetProber does, to recognize windows-1255, upon which this model was
     113  # built. The second SBCharSetProber is told to make the pair-of-letter
     114  # lookup in the language model backwards. This in practice exactly simulates
     115  # a visual Hebrew model using the windows-1255 logical Hebrew model.
     116  #
     117  # The HebrewProber is not using any language model. All it does is look for
     118  # final-letter evidence suggesting the text is either logical Hebrew or visual
     119  # Hebrew. Disjointed from the model probers, the results of the HebrewProber
     120  # alone are meaningless. HebrewProber always returns 0.00 as confidence
     121  # since it never identifies a charset by itself. Instead, the pointer to the
     122  # HebrewProber is passed to the model probers as a helper "Name Prober".
     123  # When the Group prober receives a positive identification from any prober,
     124  # it asks for the name of the charset identified. If the prober queried is a
     125  # Hebrew model prober, the model prober forwards the call to the
     126  # HebrewProber to make the final decision. In the HebrewProber, the
     127  # decision is made according to the final-letter scores maintained and both
     128  # model probers' scores. The answer is returned in the form of the name of the
     129  # charset identified, either "windows-1255" or "ISO-8859-8".
     130  
     131  
     132  class ESC[4;38;5;81mHebrewProber(ESC[4;38;5;149mCharSetProber):
     133      SPACE = 0x20
     134      # windows-1255 / ISO-8859-8 code points of interest
     135      FINAL_KAF = 0xEA
     136      NORMAL_KAF = 0xEB
     137      FINAL_MEM = 0xED
     138      NORMAL_MEM = 0xEE
     139      FINAL_NUN = 0xEF
     140      NORMAL_NUN = 0xF0
     141      FINAL_PE = 0xF3
     142      NORMAL_PE = 0xF4
     143      FINAL_TSADI = 0xF5
     144      NORMAL_TSADI = 0xF6
     145  
     146      # Minimum Visual vs Logical final letter score difference.
     147      # If the difference is below this, don't rely solely on the final letter score
     148      # distance.
     149      MIN_FINAL_CHAR_DISTANCE = 5
     150  
     151      # Minimum Visual vs Logical model score difference.
     152      # If the difference is below this, don't rely at all on the model score
     153      # distance.
     154      MIN_MODEL_DISTANCE = 0.01
     155  
     156      VISUAL_HEBREW_NAME = "ISO-8859-8"
     157      LOGICAL_HEBREW_NAME = "windows-1255"
     158  
     159      def __init__(self) -> None:
     160          super().__init__()
     161          self._final_char_logical_score = 0
     162          self._final_char_visual_score = 0
     163          self._prev = self.SPACE
     164          self._before_prev = self.SPACE
     165          self._logical_prober: Optional[SingleByteCharSetProber] = None
     166          self._visual_prober: Optional[SingleByteCharSetProber] = None
     167          self.reset()
     168  
     169      def reset(self) -> None:
     170          self._final_char_logical_score = 0
     171          self._final_char_visual_score = 0
     172          # The two last characters seen in the previous buffer,
     173          # mPrev and mBeforePrev are initialized to space in order to simulate
     174          # a word delimiter at the beginning of the data
     175          self._prev = self.SPACE
     176          self._before_prev = self.SPACE
     177          # These probers are owned by the group prober.
     178  
     179      def set_model_probers(
     180          self,
     181          logical_prober: SingleByteCharSetProber,
     182          visual_prober: SingleByteCharSetProber,
     183      ) -> None:
     184          self._logical_prober = logical_prober
     185          self._visual_prober = visual_prober
     186  
     187      def is_final(self, c: int) -> bool:
     188          return c in [
     189              self.FINAL_KAF,
     190              self.FINAL_MEM,
     191              self.FINAL_NUN,
     192              self.FINAL_PE,
     193              self.FINAL_TSADI,
     194          ]
     195  
     196      def is_non_final(self, c: int) -> bool:
     197          # The normal Tsadi is not a good Non-Final letter due to words like
     198          # 'lechotet' (to chat) containing an apostrophe after the tsadi. This
     199          # apostrophe is converted to a space in FilterWithoutEnglishLetters
     200          # causing the Non-Final tsadi to appear at an end of a word even
     201          # though this is not the case in the original text.
     202          # The letters Pe and Kaf rarely display a related behavior of not being
     203          # a good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak'
     204          # for example legally end with a Non-Final Pe or Kaf. However, the
     205          # benefit of these letters as Non-Final letters outweighs the damage
     206          # since these words are quite rare.
     207          return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE]
     208  
     209      def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
     210          # Final letter analysis for logical-visual decision.
     211          # Look for evidence that the received buffer is either logical Hebrew
     212          # or visual Hebrew.
     213          # The following cases are checked:
     214          # 1) A word longer than 1 letter, ending with a final letter. This is
     215          #    an indication that the text is laid out "naturally" since the
     216          #    final letter really appears at the end. +1 for logical score.
     217          # 2) A word longer than 1 letter, ending with a Non-Final letter. In
     218          #    normal Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi,
     219          #    should not end with the Non-Final form of that letter. Exceptions
     220          #    to this rule are mentioned above in isNonFinal(). This is an
     221          #    indication that the text is laid out backwards. +1 for visual
     222          #    score
     223          # 3) A word longer than 1 letter, starting with a final letter. Final
     224          #    letters should not appear at the beginning of a word. This is an
     225          #    indication that the text is laid out backwards. +1 for visual
     226          #    score.
     227          #
     228          # The visual score and logical score are accumulated throughout the
     229          # text and are finally checked against each other in GetCharSetName().
     230          # No checking for final letters in the middle of words is done since
     231          # that case is not an indication for either Logical or Visual text.
     232          #
     233          # We automatically filter out all 7-bit characters (replace them with
     234          # spaces) so the word boundary detection works properly. [MAP]
     235  
     236          if self.state == ProbingState.NOT_ME:
     237              # Both model probers say it's not them. No reason to continue.
     238              return ProbingState.NOT_ME
     239  
     240          byte_str = self.filter_high_byte_only(byte_str)
     241  
     242          for cur in byte_str:
     243              if cur == self.SPACE:
     244                  # We stand on a space - a word just ended
     245                  if self._before_prev != self.SPACE:
     246                      # next-to-last char was not a space so self._prev is not a
     247                      # 1 letter word
     248                      if self.is_final(self._prev):
     249                          # case (1) [-2:not space][-1:final letter][cur:space]
     250                          self._final_char_logical_score += 1
     251                      elif self.is_non_final(self._prev):
     252                          # case (2) [-2:not space][-1:Non-Final letter][
     253                          #  cur:space]
     254                          self._final_char_visual_score += 1
     255              else:
     256                  # Not standing on a space
     257                  if (
     258                      (self._before_prev == self.SPACE)
     259                      and (self.is_final(self._prev))
     260                      and (cur != self.SPACE)
     261                  ):
     262                      # case (3) [-2:space][-1:final letter][cur:not space]
     263                      self._final_char_visual_score += 1
     264              self._before_prev = self._prev
     265              self._prev = cur
     266  
     267          # Forever detecting, till the end or until both model probers return
     268          # ProbingState.NOT_ME (handled above)
     269          return ProbingState.DETECTING
     270  
     271      @property
     272      def charset_name(self) -> str:
     273          assert self._logical_prober is not None
     274          assert self._visual_prober is not None
     275  
     276          # Make the decision: is it Logical or Visual?
     277          # If the final letter score distance is dominant enough, rely on it.
     278          finalsub = self._final_char_logical_score - self._final_char_visual_score
     279          if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
     280              return self.LOGICAL_HEBREW_NAME
     281          if finalsub <= -self.MIN_FINAL_CHAR_DISTANCE:
     282              return self.VISUAL_HEBREW_NAME
     283  
     284          # It's not dominant enough, try to rely on the model scores instead.
     285          modelsub = (
     286              self._logical_prober.get_confidence() - self._visual_prober.get_confidence()
     287          )
     288          if modelsub > self.MIN_MODEL_DISTANCE:
     289              return self.LOGICAL_HEBREW_NAME
     290          if modelsub < -self.MIN_MODEL_DISTANCE:
     291              return self.VISUAL_HEBREW_NAME
     292  
     293          # Still no good, back to final letter distance, maybe it'll save the
     294          # day.
     295          if finalsub < 0.0:
     296              return self.VISUAL_HEBREW_NAME
     297  
     298          # (finalsub > 0 - Logical) or (don't know what to do) default to
     299          # Logical.
     300          return self.LOGICAL_HEBREW_NAME
     301  
     302      @property
     303      def language(self) -> str:
     304          return "Hebrew"
     305  
     306      @property
     307      def state(self) -> ProbingState:
     308          assert self._logical_prober is not None
     309          assert self._visual_prober is not None
     310  
     311          # Remain active as long as any of the model probers are active.
     312          if (self._logical_prober.state == ProbingState.NOT_ME) and (
     313              self._visual_prober.state == ProbingState.NOT_ME
     314          ):
     315              return ProbingState.NOT_ME
     316          return ProbingState.DETECTING