python (3.11.7)
       1  ######################## BEGIN LICENSE BLOCK ########################
       2  # The Original Code is Mozilla Universal charset detector code.
       3  #
       4  # The Initial Developer of the Original Code is
       5  # Netscape Communications Corporation.
       6  # Portions created by the Initial Developer are Copyright (C) 2001
       7  # the Initial Developer. All Rights Reserved.
       8  #
       9  # Contributor(s):
      10  #   Mark Pilgrim - port to Python
      11  #   Shy Shalom - original C code
      12  #
      13  # This library is free software; you can redistribute it and/or
      14  # modify it under the terms of the GNU Lesser General Public
      15  # License as published by the Free Software Foundation; either
      16  # version 2.1 of the License, or (at your option) any later version.
      17  #
      18  # This library is distributed in the hope that it will be useful,
      19  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      20  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      21  # Lesser General Public License for more details.
      22  #
      23  # You should have received a copy of the GNU Lesser General Public
      24  # License along with this library; if not, write to the Free Software
      25  # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
      26  # 02110-1301  USA
      27  ######################### END LICENSE BLOCK #########################
      28  
      29  from .charsetgroupprober import CharSetGroupProber
      30  from .hebrewprober import HebrewProber
      31  from .langbulgarianmodel import ISO_8859_5_BULGARIAN_MODEL, WINDOWS_1251_BULGARIAN_MODEL
      32  from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
      33  from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
      34  
      35  # from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
      36  #                                  WINDOWS_1250_HUNGARIAN_MODEL)
      37  from .langrussianmodel import (
      38      IBM855_RUSSIAN_MODEL,
      39      IBM866_RUSSIAN_MODEL,
      40      ISO_8859_5_RUSSIAN_MODEL,
      41      KOI8_R_RUSSIAN_MODEL,
      42      MACCYRILLIC_RUSSIAN_MODEL,
      43      WINDOWS_1251_RUSSIAN_MODEL,
      44  )
      45  from .langthaimodel import TIS_620_THAI_MODEL
      46  from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
      47  from .sbcharsetprober import SingleByteCharSetProber
      48  
      49  
      50  class ESC[4;38;5;81mSBCSGroupProber(ESC[4;38;5;149mCharSetGroupProber):
      51      def __init__(self) -> None:
      52          super().__init__()
      53          hebrew_prober = HebrewProber()
      54          logical_hebrew_prober = SingleByteCharSetProber(
      55              WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
      56          )
      57          # TODO: See if using ISO-8859-8 Hebrew model works better here, since
      58          #       it's actually the visual one
      59          visual_hebrew_prober = SingleByteCharSetProber(
      60              WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
      61          )
      62          hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
      63          # TODO: ORDER MATTERS HERE. I changed the order vs what was in master
      64          #       and several tests failed that did not before. Some thought
      65          #       should be put into the ordering, and we should consider making
      66          #       order not matter here, because that is very counter-intuitive.
      67          self.probers = [
      68              SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
      69              SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
      70              SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
      71              SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
      72              SingleByteCharSetProber(IBM866_RUSSIAN_MODEL),
      73              SingleByteCharSetProber(IBM855_RUSSIAN_MODEL),
      74              SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
      75              SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
      76              SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
      77              SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
      78              # TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
      79              #       after we retrain model.
      80              # SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
      81              # SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
      82              SingleByteCharSetProber(TIS_620_THAI_MODEL),
      83              SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
      84              hebrew_prober,
      85              logical_hebrew_prober,
      86              visual_hebrew_prober,
      87          ]
      88          self.reset()