python (3.11.7)
       1  import codecs
       2  import locale
       3  import re
       4  import sys
       5  from typing import List, Tuple
       6  
       7  BOMS: List[Tuple[bytes, str]] = [
       8      (codecs.BOM_UTF8, "utf-8"),
       9      (codecs.BOM_UTF16, "utf-16"),
      10      (codecs.BOM_UTF16_BE, "utf-16-be"),
      11      (codecs.BOM_UTF16_LE, "utf-16-le"),
      12      (codecs.BOM_UTF32, "utf-32"),
      13      (codecs.BOM_UTF32_BE, "utf-32-be"),
      14      (codecs.BOM_UTF32_LE, "utf-32-le"),
      15  ]
      16  
      17  ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
      18  
      19  
      20  def auto_decode(data: bytes) -> str:
      21      """Check a bytes string for a BOM to correctly detect the encoding
      22  
      23      Fallback to locale.getpreferredencoding(False) like open() on Python3"""
      24      for bom, encoding in BOMS:
      25          if data.startswith(bom):
      26              return data[len(bom) :].decode(encoding)
      27      # Lets check the first two lines as in PEP263
      28      for line in data.split(b"\n")[:2]:
      29          if line[0:1] == b"#" and ENCODING_RE.search(line):
      30              result = ENCODING_RE.search(line)
      31              assert result is not None
      32              encoding = result.groups()[0].decode("ascii")
      33              return data.decode(encoding)
      34      return data.decode(
      35          locale.getpreferredencoding(False) or sys.getdefaultencoding(),
      36      )