///////////////////////////////////////////////////////////////////////////////
//
/// \file       tuklib_integer.h
/// \brief      Various integer and bit operations
///
/// This file provides macros or functions to do some basic integer and bit
/// operations.
///
/// Native endian inline functions (XX = 16, 32, or 64):
///   - Unaligned native endian reads: readXXne(ptr)
///   - Unaligned native endian writes: writeXXne(ptr, num)
///   - Aligned native endian reads: aligned_readXXne(ptr)
///   - Aligned native endian writes: aligned_writeXXne(ptr, num)
///
/// Endianness-converting integer operations (these can be macros!)
/// (XX = 16, 32, or 64; Y = b or l):
///   - Byte swapping: bswapXX(num)
///   - Byte order conversions to/from native (byteswaps if Y isn't
///     the native endianness): convXXYe(num)
///   - Unaligned reads: readXXYe(ptr)
///   - Unaligned writes: writeXXYe(ptr, num)
///   - Aligned reads: aligned_readXXYe(ptr)
///   - Aligned writes: aligned_writeXXYe(ptr, num)
///
/// Since the above can be macros, the arguments should have no side effects
/// because they may be evaluated more than once.
///
/// Bit scan operations for non-zero 32-bit integers (inline functions):
///   - Bit scan reverse (find highest non-zero bit): bsr32(num)
///   - Count leading zeros: clz32(num)
///   - Count trailing zeros: ctz32(num)
///   - Bit scan forward (simply an alias for ctz32()): bsf32(num)
///
/// The above bit scan operations return 0-31. If num is zero,
/// the result is undefined.
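///
/// A minimal usage sketch (illustrative only; buf and the value 123 are
/// arbitrary, not part of this API):
///
///     uint8_t buf[4];
///     write32le(buf, 123);          // byteswaps on big endian hosts
///     uint32_t x = read32le(buf);   // x == 123 on any host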
//
//  Authors:    Lasse Collin
//              Joachim Henke
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#ifndef TUKLIB_INTEGER_H
#define TUKLIB_INTEGER_H

#include "tuklib_common.h"
#include <string.h>

// Newer Intel C compilers require immintrin.h for _bit_scan_reverse()
// and such functions.
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500)
#	include <immintrin.h>
// Only include <intrin.h> when it is needed. GCC and Clang can both
// use __builtins, so we only need the Windows intrinsics when using MSVC.
// GCC and Clang can set _MSC_VER on Windows, so we need to exclude these
// cases explicitly.
#elif defined(_MSC_VER) && !TUKLIB_GNUC_REQ(3, 4) && !defined(__clang__)
#	include <intrin.h>
#endif


///////////////////
// Byte swapping //
///////////////////

#if defined(HAVE___BUILTIN_BSWAPXX)
	// GCC >= 4.8 and Clang
#	define bswap16(n) __builtin_bswap16(n)
#	define bswap32(n) __builtin_bswap32(n)
#	define bswap64(n) __builtin_bswap64(n)

#elif defined(HAVE_BYTESWAP_H)
	// glibc, uClibc, dietlibc
#	include <byteswap.h>
#	ifdef HAVE_BSWAP_16
#		define bswap16(num) bswap_16(num)
#	endif
#	ifdef HAVE_BSWAP_32
#		define bswap32(num) bswap_32(num)
#	endif
#	ifdef HAVE_BSWAP_64
#		define bswap64(num) bswap_64(num)
#	endif

#elif defined(HAVE_SYS_ENDIAN_H)
	// *BSDs and Darwin
#	include <sys/endian.h>

#elif defined(HAVE_SYS_BYTEORDER_H)
	// Solaris
#	include <sys/byteorder.h>
#	ifdef BSWAP_16
#		define bswap16(num) BSWAP_16(num)
#	endif
#	ifdef BSWAP_32
#		define bswap32(num) BSWAP_32(num)
#	endif
#	ifdef BSWAP_64
#		define bswap64(num) BSWAP_64(num)
#	endif
#	ifdef BE_16
#		define conv16be(num) BE_16(num)
#	endif
#	ifdef BE_32
#		define conv32be(num) BE_32(num)
#	endif
#	ifdef BE_64
#		define conv64be(num) BE_64(num)
#	endif
#	ifdef LE_16
#		define conv16le(num) LE_16(num)
#	endif
#	ifdef LE_32
#		define conv32le(num) LE_32(num)
#	endif
#	ifdef LE_64
#		define conv64le(num) LE_64(num)
#	endif
#endif

#ifndef bswap16
#	define bswap16(n) (uint16_t)( \
		  (((n) & 0x00FFU) << 8) \
		| (((n) & 0xFF00U) >> 8) \
	)
#endif

#ifndef bswap32
#	define bswap32(n) (uint32_t)( \
		  (((n) & UINT32_C(0x000000FF)) << 24) \
		| (((n) & UINT32_C(0x0000FF00)) << 8) \
		| (((n) & UINT32_C(0x00FF0000)) >> 8) \
		| (((n) & UINT32_C(0xFF000000)) >> 24) \
	)
#endif

#ifndef bswap64
#	define bswap64(n) (uint64_t)( \
		  (((n) & UINT64_C(0x00000000000000FF)) << 56) \
		| (((n) & UINT64_C(0x000000000000FF00)) << 40) \
		| (((n) & UINT64_C(0x0000000000FF0000)) << 24) \
		| (((n) & UINT64_C(0x00000000FF000000)) << 8) \
		| (((n) & UINT64_C(0x000000FF00000000)) >> 8) \
		| (((n) & UINT64_C(0x0000FF0000000000)) >> 24) \
		| (((n) & UINT64_C(0x00FF000000000000)) >> 40) \
		| (((n) & UINT64_C(0xFF00000000000000)) >> 56) \
	)
#endif

// Define conversion macros using the basic byte swapping macros.
#ifdef WORDS_BIGENDIAN
#	ifndef conv16be
#		define conv16be(num) ((uint16_t)(num))
#	endif
#	ifndef conv32be
#		define conv32be(num) ((uint32_t)(num))
#	endif
#	ifndef conv64be
#		define conv64be(num) ((uint64_t)(num))
#	endif
#	ifndef conv16le
#		define conv16le(num) bswap16(num)
#	endif
#	ifndef conv32le
#		define conv32le(num) bswap32(num)
#	endif
#	ifndef conv64le
#		define conv64le(num) bswap64(num)
#	endif
#else
#	ifndef conv16be
#		define conv16be(num) bswap16(num)
#	endif
#	ifndef conv32be
#		define conv32be(num) bswap32(num)
#	endif
#	ifndef conv64be
#		define conv64be(num) bswap64(num)
#	endif
#	ifndef conv16le
#		define conv16le(num) ((uint16_t)(num))
#	endif
#	ifndef conv32le
#		define conv32le(num) ((uint32_t)(num))
#	endif
#	ifndef conv64le
#		define conv64le(num) ((uint64_t)(num))
#	endif
#endif
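
// Worked example (arbitrary value): on a little endian host,
//
//     conv32be(UINT32_C(0x11223344)) == UINT32_C(0x44332211)  // bswap32()
//     conv32le(UINT32_C(0x11223344)) == UINT32_C(0x11223344)  // no-op cast
//
// while on a big endian host the roles are reversed.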


////////////////////////////////
// Unaligned reads and writes //
////////////////////////////////

// No-strict-align archs like x86-64
// ---------------------------------
//
// The traditional way of casting e.g. *(const uint16_t *)uint8_pointer
// is bad even if the uint8_pointer is properly aligned because such
// casts break strict aliasing rules and result in undefined behavior.
// With unaligned pointers it's even worse: compilers may emit vector
// instructions that require aligned pointers even if non-vector
// instructions work with unaligned pointers.
//
// Using memcpy() is the standard compliant way to do unaligned access.
// Many modern compilers inline it so there is no function call overhead.
// For those compilers that don't handle the memcpy() method well, the
// old casting method (that violates strict aliasing) can be requested at
// build time. A third method, casting to a packed struct, would also be
// an option but isn't provided to keep things simpler (it's already a mess).
// Hopefully this is flexible enough in practice.
//
// Some compilers on x86-64 like Clang >= 10 and GCC >= 5.1 detect that
//
//     buf[0] | (buf[1] << 8)
//
// reads a 16-bit value and can emit a single 16-bit load and produce
// code identical to the memcpy() method. In other cases Clang and GCC
// produce either the same or better code with memcpy(). For example,
// Clang 9 on x86-64 can detect the 32-bit load but not the 16-bit load.
//
// MSVC uses unaligned access with the memcpy() method but emits byte-by-byte
// code for "buf[0] | (buf[1] << 8)".
//
// Conclusion: The memcpy() method is the best choice when unaligned access
// is supported.
//
// Strict-align archs like SPARC
// -----------------------------
//
// GCC versions from around 4.x to at least 13.2.0 produce worse code
// from the memcpy() method than from simple byte-by-byte shift-or code
// when reading a 32-bit integer:
//
//     (1) It may be constructed on stack using four 8-bit loads,
//         four 8-bit stores to stack, and finally one 32-bit load from stack.
//
//     (2) Especially with -Os, an actual memcpy() call may be emitted.
//
// This is true at least on ARM, ARM64, SPARC, SPARC64, MIPS64EL, and
// RISC-V. Of these, ARM, ARM64, and RISC-V support unaligned access in
// some processors but not all, so this is relevant only when GCC assumes
// that unaligned access is not supported or -mstrict-align or
// -mno-unaligned-access is used.
//
// For Clang it makes little difference. ARM64 with -O2 -mstrict-align
// was one of the very few with a minor difference: the memcpy() version
// was one instruction longer.
//
// Conclusion: At least in case of GCC and Clang, byte-by-byte code is
// the best choice for strict-align archs to do unaligned access.
//
// See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111502
//
// Thanks to <https://godbolt.org/> it was easy to test different compilers.
// The following is for little endian targets:
/*
#include <stdint.h>
#include <string.h>

uint32_t bytes16(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8);
}

uint32_t copy16(const uint8_t *b)
{
	uint16_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

uint32_t bytes32(const uint8_t *b)
{
	return (uint32_t)b[0]
		| ((uint32_t)b[1] << 8)
		| ((uint32_t)b[2] << 16)
		| ((uint32_t)b[3] << 24);
}

uint32_t copy32(const uint8_t *b)
{
	uint32_t v;
	memcpy(&v, b, sizeof(v));
	return v;
}

void wbytes16(uint8_t *b, uint16_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
}

void wcopy16(uint8_t *b, uint16_t v)
{
	memcpy(b, &v, sizeof(v));
}

void wbytes32(uint8_t *b, uint32_t v)
{
	b[0] = (uint8_t)v;
	b[1] = (uint8_t)(v >> 8);
	b[2] = (uint8_t)(v >> 16);
	b[3] = (uint8_t)(v >> 24);
}

void wcopy32(uint8_t *b, uint32_t v)
{
	memcpy(b, &v, sizeof(v));
}
*/


#ifdef TUKLIB_FAST_UNALIGNED_ACCESS

static inline uint16_t
read16ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
read32ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
read64ne(const uint8_t *buf)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	memcpy(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	memcpy(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = read64ne(buf);
	return conv64le(num);
}


// NOTE: Possible byte swapping must be done in a macro to allow the compiler
// to optimize byte swapping of constants when using glibc's or *BSD's
// byte swapping macros. The actual write is done in an inline function
// to make type checking of the buf pointer possible.
#define write16be(buf, num) write16ne(buf, conv16be(num))
#define write32be(buf, num) write32ne(buf, conv32be(num))
#define write64be(buf, num) write64ne(buf, conv64be(num))
#define write16le(buf, num) write16ne(buf, conv16le(num))
#define write32le(buf, num) write32ne(buf, conv32le(num))
#define write64le(buf, num) write64ne(buf, conv64le(num))
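
// For example, on a little endian target write32be(buf, UINT32_C(0x11223344))
// lets the compiler byteswap the constant at compile time and store
// 0x44332211 with a single native-endian write.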

#else

#ifdef WORDS_BIGENDIAN
#	define read16ne read16be
#	define read32ne read32be
#	define read64ne read64be
#	define write16ne write16be
#	define write32ne write32be
#	define write64ne write64be
#else
#	define read16ne read16le
#	define read32ne read32le
#	define read64ne read64le
#	define write16ne write16le
#	define write32ne write32le
#	define write64ne write64le
#endif


static inline uint16_t
read16be(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0] << 8) | (uint16_t)buf[1];
	return num;
}


static inline uint16_t
read16le(const uint8_t *buf)
{
	uint16_t num = ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8);
	return num;
}


static inline uint32_t
read32be(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0] << 24;
	num |= (uint32_t)buf[1] << 16;
	num |= (uint32_t)buf[2] << 8;
	num |= (uint32_t)buf[3];
	return num;
}


static inline uint32_t
read32le(const uint8_t *buf)
{
	uint32_t num = (uint32_t)buf[0];
	num |= (uint32_t)buf[1] << 8;
	num |= (uint32_t)buf[2] << 16;
	num |= (uint32_t)buf[3] << 24;
	return num;
}


static inline uint64_t
read64be(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0] << 56;
	num |= (uint64_t)buf[1] << 48;
	num |= (uint64_t)buf[2] << 40;
	num |= (uint64_t)buf[3] << 32;
	num |= (uint64_t)buf[4] << 24;
	num |= (uint64_t)buf[5] << 16;
	num |= (uint64_t)buf[6] << 8;
	num |= (uint64_t)buf[7];
	return num;
}


static inline uint64_t
read64le(const uint8_t *buf)
{
	uint64_t num = (uint64_t)buf[0];
	num |= (uint64_t)buf[1] << 8;
	num |= (uint64_t)buf[2] << 16;
	num |= (uint64_t)buf[3] << 24;
	num |= (uint64_t)buf[4] << 32;
	num |= (uint64_t)buf[5] << 40;
	num |= (uint64_t)buf[6] << 48;
	num |= (uint64_t)buf[7] << 56;
	return num;
}


static inline void
write16be(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)(num >> 8);
	buf[1] = (uint8_t)num;
	return;
}


static inline void
write16le(uint8_t *buf, uint16_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	return;
}


static inline void
write32be(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)(num >> 24);
	buf[1] = (uint8_t)(num >> 16);
	buf[2] = (uint8_t)(num >> 8);
	buf[3] = (uint8_t)num;
	return;
}


static inline void
write32le(uint8_t *buf, uint32_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	return;
}


static inline void
write64be(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)(num >> 56);
	buf[1] = (uint8_t)(num >> 48);
	buf[2] = (uint8_t)(num >> 40);
	buf[3] = (uint8_t)(num >> 32);
	buf[4] = (uint8_t)(num >> 24);
	buf[5] = (uint8_t)(num >> 16);
	buf[6] = (uint8_t)(num >> 8);
	buf[7] = (uint8_t)num;
	return;
}


static inline void
write64le(uint8_t *buf, uint64_t num)
{
	buf[0] = (uint8_t)num;
	buf[1] = (uint8_t)(num >> 8);
	buf[2] = (uint8_t)(num >> 16);
	buf[3] = (uint8_t)(num >> 24);
	buf[4] = (uint8_t)(num >> 32);
	buf[5] = (uint8_t)(num >> 40);
	buf[6] = (uint8_t)(num >> 48);
	buf[7] = (uint8_t)(num >> 56);
	return;
}

#endif


//////////////////////////////
// Aligned reads and writes //
//////////////////////////////

// Separate functions for aligned reads and writes are provided since on
// strict-align archs aligned access is much faster than unaligned access.
//
// Just like in the unaligned case, memcpy() is needed to avoid
// strict aliasing violations. However, on archs that don't support
// unaligned access the compiler cannot know that the pointers given
// to memcpy() are aligned, which results in slow code. As of C11 there is
// no standard way to tell the compiler that we know that the address is
// aligned but some compilers have language extensions to do that. With
// such language extensions the memcpy() method gives excellent results.
//
// What to do on a strict-align system when no known language extensions
// are available? Falling back to byte-by-byte access would be safe but ruin
// optimizations that have been made specifically with aligned access in mind.
// As a compromise, aligned reads will fall back to non-compliant type punning
// but aligned writes will be byte-by-byte, that is, fast reads are preferred
// over fast writes. This obviously isn't great but hopefully it's a working
// compromise for now.
//
// __builtin_assume_aligned is supported by GCC >= 4.7 and Clang >= 3.6.
#ifdef HAVE___BUILTIN_ASSUME_ALIGNED
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, __builtin_assume_aligned(src, size), size)
#else
#	define tuklib_memcpy_aligned(dest, src, size) \
		memcpy(dest, src, size)
#	ifndef TUKLIB_FAST_UNALIGNED_ACCESS
#		define TUKLIB_USE_UNSAFE_ALIGNED_READS 1
#	endif
#endif
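
// For example, in aligned_read32ne() below the builtin (when available)
// promises to the compiler that buf is 4-byte aligned, so even on
// strict-align archs the memcpy() can compile into a single aligned
// 32-bit load. The promise isn't checked at runtime: the caller must
// really pass an aligned pointer.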


static inline uint16_t
aligned_read16ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint16_t *)buf;
#else
	uint16_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint32_t
aligned_read32ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint32_t *)buf;
#else
	uint32_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline uint64_t
aligned_read64ne(const uint8_t *buf)
{
#if defined(TUKLIB_USE_UNSAFE_TYPE_PUNNING) \
		|| defined(TUKLIB_USE_UNSAFE_ALIGNED_READS)
	return *(const uint64_t *)buf;
#else
	uint64_t num;
	tuklib_memcpy_aligned(&num, buf, sizeof(num));
	return num;
#endif
}


static inline void
aligned_write16ne(uint8_t *buf, uint16_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint16_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write32ne(uint8_t *buf, uint32_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint32_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline void
aligned_write64ne(uint8_t *buf, uint64_t num)
{
#ifdef TUKLIB_USE_UNSAFE_TYPE_PUNNING
	*(uint64_t *)buf = num;
#else
	tuklib_memcpy_aligned(buf, &num, sizeof(num));
#endif
	return;
}


static inline uint16_t
aligned_read16be(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16be(num);
}


static inline uint16_t
aligned_read16le(const uint8_t *buf)
{
	uint16_t num = aligned_read16ne(buf);
	return conv16le(num);
}


static inline uint32_t
aligned_read32be(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32be(num);
}


static inline uint32_t
aligned_read32le(const uint8_t *buf)
{
	uint32_t num = aligned_read32ne(buf);
	return conv32le(num);
}


static inline uint64_t
aligned_read64be(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64be(num);
}


static inline uint64_t
aligned_read64le(const uint8_t *buf)
{
	uint64_t num = aligned_read64ne(buf);
	return conv64le(num);
}


// These need to be macros like in the unaligned case.
#define aligned_write16be(buf, num) aligned_write16ne((buf), conv16be(num))
#define aligned_write16le(buf, num) aligned_write16ne((buf), conv16le(num))
#define aligned_write32be(buf, num) aligned_write32ne((buf), conv32be(num))
#define aligned_write32le(buf, num) aligned_write32ne((buf), conv32le(num))
#define aligned_write64be(buf, num) aligned_write64ne((buf), conv64be(num))
#define aligned_write64le(buf, num) aligned_write64ne((buf), conv64le(num))


////////////////////
// Bit operations //
////////////////////

static inline uint32_t
bsr32(uint32_t n)
{
	// Check for ICC first, since it tends to define __GNUC__ too.
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	// GCC >= 3.4 has __builtin_clz(), which gives good results on
	// multiple architectures. On x86, __builtin_clz() ^ 31U becomes
	// either plain BSR (so the XOR gets optimized away) or LZCNT and
	// XOR (if -march indicates that SSE4a instructions are supported).
	return (uint32_t)__builtin_clz(n) ^ 31U;

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i;

#else
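	// Generic fallback: binary search for the highest set bit. Whenever
	// the top half (then top quarter, and so on) of the shifted value
	// is zero, shift the value left and decrease the result accordingly.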
	uint32_t i = 31;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 15;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i -= 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i -= 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i -= 2;
	}

	if ((n & 0x80000000) == 0)
		--i;

	return i;
#endif
}


static inline uint32_t
clz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_reverse(n) ^ 31U;

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX == UINT32_MAX
	return (uint32_t)__builtin_clz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsrl %1, %0\n\t"
		"xorl $31, %0"
		: "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanReverse(&i, n);
	return i ^ 31U;

#else
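	// Generic fallback: count leading zeros by shifting the value left
	// in halving steps until the highest set bit reaches bit 31.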
	uint32_t i = 0;

	if ((n & 0xFFFF0000) == 0) {
		n <<= 16;
		i = 16;
	}

	if ((n & 0xFF000000) == 0) {
		n <<= 8;
		i += 8;
	}

	if ((n & 0xF0000000) == 0) {
		n <<= 4;
		i += 4;
	}

	if ((n & 0xC0000000) == 0) {
		n <<= 2;
		i += 2;
	}

	if ((n & 0x80000000) == 0)
		++i;

	return i;
#endif
}


static inline uint32_t
ctz32(uint32_t n)
{
#if defined(__INTEL_COMPILER)
	return _bit_scan_forward(n);

#elif (TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) && UINT_MAX >= UINT32_MAX
	return (uint32_t)__builtin_ctz(n);

#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	uint32_t i;
	__asm__("bsfl %1, %0" : "=r" (i) : "rm" (n));
	return i;

#elif defined(_MSC_VER)
	unsigned long i;
	_BitScanForward(&i, n);
	return i;

#else
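	// Generic fallback: count trailing zeros by shifting the value right
	// in halving steps until the lowest set bit reaches bit 0.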
	uint32_t i = 0;

	if ((n & 0x0000FFFF) == 0) {
		n >>= 16;
		i = 16;
	}

	if ((n & 0x000000FF) == 0) {
		n >>= 8;
		i += 8;
	}

	if ((n & 0x0000000F) == 0) {
		n >>= 4;
		i += 4;
	}

	if ((n & 0x00000003) == 0) {
		n >>= 2;
		i += 2;
	}

	if ((n & 0x00000001) == 0)
		++i;

	return i;
#endif
}

#define bsf32 ctz32
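
// Worked examples (the argument must be non-zero; values are arbitrary):
//
//     bsr32(UINT32_C(0x00010000)) == 16    clz32(UINT32_C(0x00010000)) == 15
//     ctz32(UINT32_C(0x00010000)) == 16    bsf32(UINT32_C(0x00010000)) == 16
//     bsr32(1) == 0, clz32(1) == 31, ctz32(UINT32_C(0x80000000)) == 31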

#endif