1  ///////////////////////////////////////////////////////////////////////////////
       2  //
       3  /// \file       arm64.c
       4  /// \brief      Filter for ARM64 binaries
       5  ///
       6  /// This converts ARM64 relative addresses in the BL and ADRP immediates
       7  /// to absolute values to increase redundancy of ARM64 code.
       8  ///
       9  /// Converting B or ADR instructions was also tested but it's not useful.
      10  /// A majority of the jumps for the B instruction are very small (+/- 0xFF).
      11  /// These are typical for loops and if-statements. Encoding them to their
      12  /// absolute address reduces redundancy since many of the small relative
      13  /// jump values are repeated, but very few of the absolute addresses are.
      14  //
      15  //  Authors:    Lasse Collin
      16  //              Jia Tan
      17  //              Igor Pavlov
      18  //
      19  //  This file has been put into the public domain.
      20  //  You can do whatever you want with this file.
      21  //
      22  ///////////////////////////////////////////////////////////////////////////////
      23  
      24  #include "simple_private.h"
      25  
      26  
      27  static size_t
      28  arm64_code(void *simple lzma_attribute((__unused__)),
      29  		uint32_t now_pos, bool is_encoder,
      30  		uint8_t *buffer, size_t size)
      31  {
      32  	size_t i;
      33  
      34  	// Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
      35  	// with auto-vectorization that is enabled by default with -O2.
      36  	// Such vectorization bloat happens with -O2 when targeting ARM64 too
      37  	// but performance hasn't been tested.
      38  #ifdef __clang__
      39  #	pragma clang loop vectorize(disable)
      40  #endif
      41  	for (i = 0; i + 4 <= size; i += 4) {
      42  		uint32_t pc = (uint32_t)(now_pos + i);
      43  		uint32_t instr = read32le(buffer + i);
      44  
      45  		if ((instr >> 26) == 0x25) {
      46  			// BL instruction:
      47  			// The full 26-bit immediate is converted.
      48  			// The range is +/-128 MiB.
      49  			//
      50  			// Using the full range is helps quite a lot with
      51  			// big executables. Smaller range would reduce false
      52  			// positives in non-code sections of the input though
      53  			// so this is a compromise that slightly favors big
      54  			// files. With the full range only six bits of the 32
      55  			// need to match to trigger a conversion.
      56  			const uint32_t src = instr;
      57  			instr = 0x94000000;
      58  
      59  			pc >>= 2;
      60  			if (!is_encoder)
      61  				pc = 0U - pc;
      62  
      63  			instr |= (src + pc) & 0x03FFFFFF;
      64  			write32le(buffer + i, instr);
      65  
      66  		} else if ((instr & 0x9F000000) == 0x90000000) {
      67  			// ADRP instruction:
      68  			// Only values in the range +/-512 MiB are converted.
      69  			//
      70  			// Using less than the full +/-4 GiB range reduces
      71  			// false positives on non-code sections of the input
      72  			// while being excellent for executables up to 512 MiB.
      73  			// The positive effect of ADRP conversion is smaller
      74  			// than that of BL but it also doesn't hurt so much in
      75  			// non-code sections of input because, with +/-512 MiB
      76  			// range, nine bits of 32 need to match to trigger a
      77  			// conversion (two 10-bit match choices = 9 bits).
      78  			const uint32_t src = ((instr >> 29) & 3)
      79  					| ((instr >> 3) & 0x001FFFFC);
      80  
      81  			// With the addition only one branch is needed to
      82  			// check the +/- range. This is usually false when
      83  			// processing ARM64 code so branch prediction will
      84  			// handle it well in terms of performance.
      85  			//
      86  			//if ((src & 0x001E0000) != 0
      87  			// && (src & 0x001E0000) != 0x001E0000)
      88  			if ((src + 0x00020000) & 0x001C0000)
      89  				continue;
      90  
      91  			instr &= 0x9000001F;
      92  
      93  			pc >>= 12;
      94  			if (!is_encoder)
      95  				pc = 0U - pc;
      96  
      97  			const uint32_t dest = src + pc;
      98  			instr |= (dest & 3) << 29;
      99  			instr |= (dest & 0x0003FFFC) << 3;
     100  			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
     101  			write32le(buffer + i, instr);
     102  		}
     103  	}
     104  
     105  	return i;
     106  }
     107  
     108  
     109  static lzma_ret
     110  arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
     111  		const lzma_filter_info *filters, bool is_encoder)
     112  {
     113  	return lzma_simple_coder_init(next, allocator, filters,
     114  			&arm64_code, 0, 4, 4, is_encoder);
     115  }
     116  
     117  
     118  #ifdef HAVE_ENCODER_ARM64
     119  extern lzma_ret
     120  lzma_simple_arm64_encoder_init(lzma_next_coder *next,
     121  		const lzma_allocator *allocator,
     122  		const lzma_filter_info *filters)
     123  {
     124  	return arm64_coder_init(next, allocator, filters, true);
     125  }
     126  #endif
     127  
     128  
     129  #ifdef HAVE_DECODER_ARM64
     130  extern lzma_ret
     131  lzma_simple_arm64_decoder_init(lzma_next_coder *next,
     132  		const lzma_allocator *allocator,
     133  		const lzma_filter_info *filters)
     134  {
     135  	return arm64_coder_init(next, allocator, filters, false);
     136  }
     137  #endif