///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
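
// Illustrative usage sketch of the unaligned read/write helpers listed
// above (the buffer contents and values are made up for demonstration):
//
//     uint8_t buf[4] = { 0x12, 0x34, 0x56, 0x78 };
//     uint32_t b = read32be(buf);    // b == 0x12345678
//     uint32_t l = read32le(buf);    // l == 0x78563412
//     write32le(buf, b);             // buf becomes 78 56 34 12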

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtin's, so we only need Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif
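
// A worked example of the generic fallback macros above (values chosen
// purely for illustration):
//
//     bswap16(0x1122)             == 0x2211
//     bswap32(0x11223344)         == 0x44332211
//     bswap64(0x1122334455667788) == 0x8877665544332211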

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
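
// For example (illustration only), on a little endian target:
//
//     conv32be(0x11223344) == bswap32(0x11223344) == 0x44332211
//     conv32le(0x11223344) == 0x11223344   // no-op on little endian
//
// On a big endian target the roles are reversed.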


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because this kind
// of cast breaks strict aliasing rules and results in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard-compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load, producing
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example,
// Clang 9 on x86-64 can detect a 32-bit load but not a 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on the stack using four 8-bit loads,
//         four 8-bit stores to the stack, and finally one 32-bit load
//         from the stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in the case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
    uint16_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

uint32_t bytes32(const uint8_t *b)
{
    return (uint32_t)b[0]
        | ((uint32_t)b[1] << 8)
        | ((uint32_t)b[2] << 16)
        | ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
    uint32_t v;
    memcpy(&v, b, sizeof(v));
    return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
    memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
    b[0] = (uint8_t)v;
    b[1] = (uint8_t)(v >> 8);
    b[2] = (uint8_t)(v >> 16);
    b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
    memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
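
// For example (illustration only), write32be(buf, 0x11223344) expands to
// write32ne(buf, conv32be(0x11223344)); because the value is a compile-time
// constant, the byte swap needed on a little endian target happens entirely
// at compile time.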

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
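
// Illustrative sketch (a made-up example, not part of the API): the caller
// must guarantee the alignment, e.g. by taking the address of an object
// whose natural alignment is at least that of the accessed type:
//
//     uint32_t storage[2] = { 0, 0 };          // naturally aligned
//     uint8_t *p = (uint8_t *)storage;
//     aligned_write32ne(p, 0x11223344);
//     uint32_t v = aligned_read32ne(p);        // v == 0x11223344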


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
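
// Worked examples of the bit scan helpers (values for illustration only;
// remember that the argument must be non-zero):
//
//     bsr32(0x00000001) == 0      clz32(0x00000001) == 31
//     bsr32(0x80000000) == 31     clz32(0x80000000) == 0
//     ctz32(0x00000008) == 3      bsf32(0x00000008) == 3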

#endif