1  //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
       2  //
       3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
       4  // See https://llvm.org/LICENSE.txt for license information.
       5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
       6  //
       7  //===----------------------------------------------------------------------===//
       8  //
       9  // Lempel–Ziv–Welch encoding/decoding
      10  //
      11  //===----------------------------------------------------------------------===//
      12  
      13  #ifndef SANITIZER_LZW_H
      14  #define SANITIZER_LZW_H
      15  
      16  #include "sanitizer_dense_map.h"
      17  
      18  namespace __sanitizer {
      19  
// Integer type of the codes written by LzwEncode and read back by LzwDecode.
// The encoder also uses it as the "prefix" half of its dictionary keys.
using LzwCodeType = u32;
      21  
      22  template <class T, class ItIn, class ItOut>
      23  ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) {
      24    using Substring =
      25        detail::DenseMapPair<LzwCodeType /* Prefix */, T /* Next input */>;
      26  
      27    // Sentinel value for substrings of len 1.
      28    static constexpr LzwCodeType kNoPrefix =
      29        Min(DenseMapInfo<Substring>::getEmptyKey().first,
      30            DenseMapInfo<Substring>::getTombstoneKey().first) -
      31        1;
      32    DenseMap<Substring, LzwCodeType> prefix_to_code;
      33    {
      34      // Add all substring of len 1 as initial dictionary.
      35      InternalMmapVector<T> dict_len1;
      36      for (auto it = begin; it != end; ++it)
      37        if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second)
      38          dict_len1.push_back(*it);
      39  
      40      // Slightly helps with later delta encoding.
      41      Sort(dict_len1.data(), dict_len1.size());
      42  
      43      // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
      44      // just generate them.
      45      *out = dict_len1.size();
      46      ++out;
      47  
      48      for (uptr i = 0; i != dict_len1.size(); ++i) {
      49        // Remap after the Sort.
      50        prefix_to_code[{kNoPrefix, dict_len1[i]}] = i;
      51        *out = dict_len1[i];
      52        ++out;
      53      }
      54      CHECK_EQ(prefix_to_code.size(), dict_len1.size());
      55    }
      56  
      57    if (begin == end)
      58      return out;
      59  
      60    // Main LZW encoding loop.
      61    LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second;
      62    ++begin;
      63    for (auto it = begin; it != end; ++it) {
      64      // Extend match with the new item.
      65      auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size());
      66      if (ins.second) {
      67        // This is a new substring, but emit the code for the current match
      68        // (before extend). This allows LZW decoder to recover the dictionary.
      69        *out = match;
      70        ++out;
      71        // Reset the match to a single item, which must be already in the map.
      72        match = prefix_to_code.find({kNoPrefix, *it})->second;
      73      } else {
      74        // Already known, use as the current match.
      75        match = ins.first->second;
      76      }
      77    }
      78  
      79    *out = match;
      80    ++out;
      81  
      82    return out;
      83  }
      84  
      85  template <class T, class ItIn, class ItOut>
      86  ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) {
      87    if (begin == end)
      88      return out;
      89  
      90    // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
      91    InternalMmapVector<T> dict_len1(*begin);
      92    ++begin;
      93  
      94    if (begin == end)
      95      return out;
      96  
      97    for (auto& v : dict_len1) {
      98      v = *begin;
      99      ++begin;
     100    }
     101  
     102    // Substrings of len 2 and up. Indexes are shifted because [0,
     103    // dict_len1.size()) stored in dict_len1. Substings get here after being
     104    // emitted to the output, so we can use output position.
     105    InternalMmapVector<detail::DenseMapPair<ItOut /* begin. */, ItOut /* end */>>
     106        code_to_substr;
     107  
     108    // Copies already emitted substrings into the output again.
     109    auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) {
     110      if (code < dict_len1.size()) {
     111        *out = dict_len1[code];
     112        ++out;
     113        return out;
     114      }
     115      const auto& s = code_to_substr[code - dict_len1.size()];
     116  
     117      for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it;
     118      return out;
     119    };
     120  
     121    // Returns lens of the substring with the given code.
     122    auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr {
     123      if (code < dict_len1.size())
     124        return 1;
     125      const auto& s = code_to_substr[code - dict_len1.size()];
     126      return s.second - s.first;
     127    };
     128  
     129    // Main LZW decoding loop.
     130    LzwCodeType prev_code = *begin;
     131    ++begin;
     132    out = copy(prev_code, out);
     133    for (auto it = begin; it != end; ++it) {
     134      LzwCodeType code = *it;
     135      auto start = out;
     136      if (code == dict_len1.size() + code_to_substr.size()) {
     137        // Special LZW case. The code is not in the dictionary yet. This is
     138        // possible only when the new substring is the same as previous one plus
     139        // the first item of the previous substring. We can emit that in two
     140        // steps.
     141        out = copy(prev_code, out);
     142        *out = *start;
     143        ++out;
     144      } else {
     145        out = copy(code, out);
     146      }
     147  
     148      // Every time encoded emits the code, it also creates substing of len + 1
     149      // including the first item of the just emmited substring. Do the same here.
     150      uptr len = code_to_len(prev_code);
     151      code_to_substr.push_back({start - len, start + 1});
     152  
     153      prev_code = code;
     154    }
     155    return out;
     156  }
     157  
     158  }  // namespace __sanitizer
     159  #endif